-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdparetoll.py
167 lines (151 loc) · 5.69 KB
/
dparetoll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import numpy as np
import scipy.stats as st
import pandas as pd
import scipy.interpolate as interpolate
import math
import scipy.optimize as opt
from scipy.optimize import curve_fit
import scipy.odr
from statsmodels.formula.api import ols
#need dpareto.R
def length(x):
if type(x) == float or type(x) == int or type(x) == np.int32 or type(x) == np.float64 or type(x) == np.float32 or type(x) == np.int64:
return 1
return len(x)
def qdpareto(p, theta, lowertail = True, logp = False):
if length(p) > 1:
p = np.array(p)
if (theta <= 0 or theta>=1):
return "theta must be between 0 and 1!"
if(logp):
p = np.exp(p)
if not lowertail:
p = 1-p
allp = []
for i in range(length(p)):
if p < 1 and p > 0:
if length(p) == 1:
allp.append(max(np.floor(np.exp(np.log(1-p)/np.log(theta))-2),0))
break
allp.append(max(np.floor(np.exp(np.log(1-p[i])/np.log(theta))-2),0))
elif p == 1:
allp.append(np.inf)
elif p == 0:
allp.append(0)
else:
allp.append(np.nan)
allp = np.array(allp)
#print(np.where(p==1))
#allp[np.where(allp==1)] = np.inf
#allp[np.where(allp==0)] = 0
#allp[np.where(allp>1) or np.where(allp<0)] = np.nan
if any(allp == None):
print("NaNs produced")
return allp
def ddpareto(x,theta,log=False):
if theta <= 0 or theta >= 1:
if theta <=0:
theta = 0
else:
theta = 1
n = [np.where(y < 0) for y in x]
temp = []
for i in range(len(n)):
temp.extend(n[i][0])
n = temp
x1 = x
x = [max(y,0) for y in x]
x = np.array(x)
p = theta**(np.log(1+x))-theta**(np.log(2+x))
if len(n) > 0:
p[n[0]] = 0
if log:
p = np.log(p)
for i in range(len(p)):
if x1[i] < 0:
p[i] = 0
else:
p[i] = p[i]
if not log:
p = [min(max(y,0),1) for y in p]
return np.array(p)
def dparetoll(x,theta=None):
'''
Maximum Likelihood Estimation for the Discrete Pareto Distribution
Description
Performs maximum likelihood estimation for the parameter of the discrete
Pareto distribution.
Usage
dparetoll(x, theta = None)
Parameters
----------
x:
A vector of raw data which is distributed according to a
Poisson-Lindley distribution.
theta:
Optional starting value for the parameter. If None, then the method of
moments estimator is used.
Details
The discrete Pareto distribution is a discretized of the continuous Type
II Pareto distribution (also called the Lomax distribution).
Returns
-------
dparetoll returns a dataframe with coefficient theta
References
----------
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Krishna, H. and Pundir, P. S. (2009), Discrete Burr and Discrete Pareto
Distributions, Statistical Methodology, 6, 177–188.
Young, D. S., Naghizadeh Qomi, M., and Kiapour, A. (2019), Approximate
Discrete Pareto Tolerance Limits for Characterizing Extremes in Count
Data, Statistica Neerlandica, 73, 4–21.
Examples
## Maximum likelihood estimation for randomly generated data
## from the discrete Pareto distribution.
dparetoll(x=[1,4,2,5,6,2,4,7,3,2])
dparetoll(x=[1,4,2,5,6,2,4,7,3,2],theta = 0.2)
'''
if theta is not None:
if theta >= 1 or theta <= 0:
return 'theta must be between 0 and 1. '
thtable = pd.DataFrame({'0':[0,0.050,0.089,0.126,0.164,0.203,0.244,0.286,0.332,0.380,
0.431,0.486,0.545,0.609,0.678,0.753,0.835,0.925,1.025,1.135,
1.258,1.395,1.549,1.723,1.921,2.146,2.404,2.701,3.044,3.442,
3.904,4.442,5.071,5.807,6.669,7.682,8.870],'1':np.arange(0,.3601,0.01)})
x = np.array(x)
inftest = np.where(x==np.inf)[0]
if len(inftest)>0:
noinf = np.where(x != np.inf)[0]
for i in range(len(inftest)):
x[inftest[i]] = np.max(x[noinf])
print("Values of x equal to 'Inf' are set to the maximum finite value.")
xbar = np.mean(x)
if theta is None:
if xbar <= max(thtable['0']):
ind = np.max(np.where(thtable['0']<xbar))
y = interpolate.interp1d(np.array(thtable.iloc[ind:ind+2]['0']), np.array(thtable.iloc[ind:ind+2]['1']))
theta = y(xbar)
else:
Shat = np.array(pd.DataFrame([np.mean(x>=x[i]) for i in range(len(x))]))
try:
theta = math.prod([y**((sum(np.log(1+x)))**(-1)) for y in Shat])
except TypeError:
return 'an x[i] in the inputted list is too large, do not exceed 17,999,999,999,999,999,999'
tmp = np.where(ddpareto(x, theta=theta,log=True)==-np.inf)
tmp = tmp[0]
xtmp = np.where(ddpareto(x, theta=theta,log=True)!=-np.inf)
maxx = max(x[xtmp[0]])
if length(tmp) > 0:
x[tmp] = maxx
print("Numerical overflow problem when calculating log-density of some x values. The problematic values are set to the maximum finite value calculated.")
def llf(theta):
return -sum(ddpareto(x,theta,log=True))
fit = opt.minimize(llf, x0 = theta, method = 'BFGS')['x']
fit = pd.DataFrame(fit)
fit.index = ['Coefficient']
fit.columns = ['theta']
return fit
# print(dparetoll(x=[0,1,0,4,0,1,0,0,1,0,1]))
# print(dparetoll(x=[0,1,0,4,0,1,0,0,1,0,1],theta = 0.9))