|
| 1 | +""" |
| 2 | +Implement the two-sample Kolmogorov–Smirnov test, a non-parametric test for comparing the distributions of two samples |
| 3 | +https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test |
| 4 | +
|
| 5 | +Phong D. Le - le.duc.phong@gmail.com |
| 6 | +""" |
| 7 | + |
| 8 | +import numpy as np |
| 9 | +from scipy.stats import iqr, tstd |
def KSTest(X, Y):
    """
    Calculate the two-sample Kolmogorov–Smirnov statistic.

    Formula: D = max(abs(ECDF_X(t) - ECDF_Y(t))) over all t

    Both empirical CDFs are step functions that only change at observed
    values, so the maximum gap is always attained at one of the pooled
    sample points. Evaluating there gives the exact statistic; comparing
    binned histograms instead would hide any difference that falls inside
    a single bin and underestimate D.

    Parameters
    ----------
    X, Y : array_like
        The two samples. Inputs of any shape are flattened, and the
        samples may have different sizes.

    Returns
    -------
    numpy.float64
        The statistic D, in the range [0, 1].

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    sample_x = np.sort(np.asarray(X, dtype=float).ravel())
    sample_y = np.sort(np.asarray(Y, dtype=float).ravel())
    if sample_x.size == 0 or sample_y.size == 0:
        raise ValueError('KSTest requires two non-empty samples')

    # Evaluate both ECDFs at every observed value.
    points = np.concatenate([sample_x, sample_y])
    # side='right' counts the observations <= t, i.e. the right-continuous
    # ECDF; dividing by the sample size turns the count into a probability.
    cdf_x = np.searchsorted(sample_x, points, side='right') / sample_x.size
    cdf_y = np.searchsorted(sample_y, points, side='right') / sample_y.size

    return np.max(np.abs(cdf_x - cdf_y))
| 36 | + |
| 37 | + |
if __name__ == '__main__':
    # Demo: compare a single Gaussian with a two-component Gaussian mixture.
    # X: 1000 draws from a normal distribution (mean 0, std 100).
    X = np.random.normal(loc=0, scale=100, size=1000)
    # Y: 500 draws from each of two normal distributions,
    # (mean 0, std 50) and (mean 200, std 70), joined into one sample.
    first_mode = np.random.normal(loc=0, scale=50, size=500)
    second_mode = np.random.normal(loc=200, scale=70, size=500)
    Y = np.concatenate((first_mode, second_mode))

    statistic = KSTest(X, Y)
    print('Kolmogorov–Smirnov test: D(X,Y) = {}'.format(statistic))
0 commit comments