Skip to content

Commit 26b9253

Browse files
authored
Create ks_test.py
1 parent 2e54c34 commit 26b9253

File tree

1 file changed

+45
-0
lines changed

1 file changed

+45
-0
lines changed

ks_test.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""
2+
Implement the two-sample Kolmogorov–Smirnov test, a non-parametric test for comparing the distributions of two samples
3+
https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
4+
5+
Phong D. Le - le.duc.phong@gmail.com
6+
"""
7+
8+
import numpy as np
9+
from scipy.stats import iqr, tstd
10+
def KSTest(X, Y):
    """
    Calculate the two-sample Kolmogorov–Smirnov statistic.

    Formula: D = max over t of abs(ECDF_X(t) - ECDF_Y(t))

    The empirical CDFs are evaluated exactly at every observed value instead
    of on a coarse histogram grid, so differences that would fall inside a
    single histogram bin are not lost.

    Parameters
    ----------
    X, Y : array_like
        The two samples. Multi-dimensional input is flattened. The samples
        may have different sizes.

    Returns
    -------
    numpy floating scalar
        The statistic D, in the range [0, 1].

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    x_sorted = np.sort(np.asarray(X).ravel())
    y_sorted = np.sort(np.asarray(Y).ravel())
    if x_sorted.size == 0 or y_sorted.size == 0:
        raise ValueError('KSTest requires two non-empty samples')

    # Both ECDFs are step functions that only jump at observed values, so the
    # largest gap between them is always attained at a pooled data point.
    pooled = np.concatenate([x_sorted, y_sorted])

    # side='right' counts the observations <= t, which is n * ECDF(t).
    cdf_x = np.searchsorted(x_sorted, pooled, side='right') / x_sorted.size
    cdf_y = np.searchsorted(y_sorted, pooled, side='right') / y_sorted.size

    return np.max(np.abs(cdf_x - cdf_y))
36+
37+
38+
if __name__ == '__main__':
    # Demo: compare a single Gaussian against a two-component mixture.

    # X: 1000 draws from one Gaussian (mean 0, standard deviation 100).
    X = np.random.normal(0, 100, 1000)

    # Y: 1000 draws from a mixture of two Gaussians, 500 from each
    # component (mean 0 / std 50 and mean 200 / std 70).
    first_component = np.random.normal(0, 50, 500)
    second_component = np.random.normal(200, 70, 500)
    Y = np.concatenate([first_component, second_component])

    statistic = KSTest(X, Y)
    print('Kolmogorov–Smirnov test: D(X,Y) = {}'.format(statistic))

0 commit comments

Comments
 (0)