-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmvregtolregion.py
345 lines (277 loc) · 11.5 KB
/
mvregtolregion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import scipy.stats as st
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import patsy
def length(x):
if type(x) == float or type(x) == int or type(x) == np.int32 or type(x) == np.float64 or type(x) == np.float32 or type(x) == np.int64:
return 1
return len(x)
def rwishart(df, p):
'''
Random Wishart Distributed Matrices
Usage
rwishart(df, p)
Parameters
----------
df : int
The degrees of freedom for the Wishart matrix to be generated.
p : int
The dimension of the random Wishart matrix.
Returns
-------
X : matrix
Random generation of Wishart matrices.
References
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Yee, T. (2010), The VGAM Package for Categorical Data Analysis, Journal of
Statistical Software, 32, 1–34.
Example:
## Generate a 4x4 wishart matrix with 10 degrees of freedom.
rwishart(df = 10, p = 4)
'''
X = np.zeros(p*p)
shape = (p,p)
X = X.reshape(shape)
#X = pd.DataFrame(X)
#print(X.iloc[0][1])
chi2rvs = []
for i in range(p):
chi2rvs.append(np.sqrt(st.chi2.rvs(size = 1,df = df-i)))
np.fill_diagonal(X,chi2rvs)
if p > 1:
a = []
for i in range(1,p):
pseq = list(range(1,p))
a.extend(np.repeat(4*pseq[i-1], i))
# a is equivalent to rep(p*pseq,pseq)
#X = pd.DataFrame(X)
for i in range(p-1):
for j in range((p-1)-i):
X[i][j+1+i] = st.norm.rvs(size = 1)
X = np.dot(X.T,X)
return X
#https://www.geeksforgeeks.org/python-dividing-two-lists/
def divide_lists(list1, list2):
if not list1 or not list2:
return []
return [list1[0] / list2[0]] + divide_lists(list1[1:], list2[1:])
def mvregtolregion(mvreg, formI, df, newx = None, alpha = 0.05, P = 0.99, B = 1000):
'''
Multivariate (Multiple) Linear Regression Tolerance Regions
Description
Determines the appropriate tolerance factor for computing multivariate
(multiple) linear regression tolerance regions based on Monte Carlo
simulation.
Usage
mvregtolregion(mvreg, formI, df, newx = None, alpha = 0.05, P = 0.99,
B = 1000)
Parameters
----------
mvreg : multiple linear model
A multivariate (multiple) linear regression fit, having class mlm.
formI : string
The formula of the right-hand side. y = x*.
df : DataFrame
The DataFrame that holds all data.
newx : DataFrame, optional
An optional data frame of new values for which to approximate
k-factors. This must be a data frame with named columns that match
those in the data frame used for the mvreg fitted object. This can
only be done for new x's, not new y's. The default is None.
alpha : float, optional
The level chosen such that 1-alpha is the confidence level. The
default is 0.05.
P : float, optional
The proportion of the population to be covered by this tolerance
region. The default is 0.99.
B : int, optional
The number of iterations used for the Monte Carlo algorithm which
determines the tolerance factor. The number of iterations should be at
least as large as the default value of 1000.
Returns
-------
mvregtolregion returns a matrix where the first column is the k-factor,
the next q columns are the estimated responses from the least squares fit,
and the final m columns are the predictor values. The first n rows of the
matrix pertain to the raw data as specified by y and x. If values for newx
are specified, then there are is one additional row appended to this output
for each row in the matrix newx.
Details
-------
A basic sketch of how the algorithm works is as follows:
1. Generate independent chi-square random variables and Wishart
random matrices.
2. Compute the eigenvalues of the randomly generated Wishart matrices.
3. Iterate the above steps to generate a set of B sample values such
that the 100(1-alpha)-th percentile is an approximate tolerance factor.
References
----------
Anderson, T. W. (2003) An Introduction to Multivariate Statistical
Analysis, Third Edition, Wiley.
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Krishnamoorthy, K. and Mathew, T. (2009), Statistical Tolerance Regions:
Theory, Applications, and Computation, Wiley.
Krishnamoorthy, K. and Mondal, S. (2008), Tolerance Factors in Multiple
and Multivariate Linear Regressions, Communications in Statistics -
Simulation and Computation, 37, 546–559.
Examples
--------
grain = pd.DataFrame([40, 17, 9, 15, 6, 12, 5, 9],columns = ['grain'])
straw = pd.DataFrame([53, 19, 10, 29, 13, 27, 19, 30], columns = ['straw'])
flower = pd.DataFrame([90,21,45,67,23,28,60,99],columns = ['flower'])
fert = pd.DataFrame([24, 11, 5, 12, 7, 14, 11, 18],columns = ['fert'])
## Must be in order, y's on left, x on right
df = pd.concat([grain,straw,flower,fert],axis = 1)
#optional argument
newx = pd.DataFrame({'fert':[10,15,20]})
#######no need to touch
dfy = []
for i in range(length(df.iloc[0]-1)):
dfy.append(df.iloc[:,i])
dfy = dfy[:length(dfy)-1]
dfy = pd.DataFrame(dfy).T
##############
#your right side (x) of the equation, this must be changed to fit your equation
formulaI = 'fert + I(fert**2)'
#######no need to touch
lm = []
for i in range(length(dfy.iloc[0])):
formula = formulaI
formula = 'dfy.iloc[:,i] ~ ' + formula
lm.append(ols(formula,data=df).fit())
##############
mvregtolregion(lm, formulaI, df, newx, alpha = 0.01, P = 0.95, B = 5000)
'''
try:
mvreg[0].params
except:
return "mvreg must be a mlm object"
else:
if length(df.iloc[0]) == 2:
return 'mvreg must be a mlm object'
names = df.columns
X = pd.DataFrame(patsy.dmatrix(formI))
xformulanames = formI.split("+")
n = length(X.iloc[:,0])
q = length(mvreg)
yvars = names[:-1:1]
if all(X.iloc[:,0] == 1):
m = length(X.iloc[0]) -1
loc = []
for i in range(length(X.iloc[0]) -1):
loc.append(i+1)
x = X.loc[:,loc]
xvars = x.columns
else:
m = length(X.iloc[0])
x = X
xvars = X.columns
xbar = []
for i in range(length(X.iloc[0])-1):
xbar.append(np.mean(X.iloc[:,i+1]))
fm = n-m-1
Pn = np.zeros(shape=(n,n)) + 1
Pa = np.zeros(shape=(n,n))
np.fill_diagonal(Pa,1)
Pn = Pa - (Pn/n)
xvars = x.columns
yhat = []
for i in range(q):
yhat.append(mvreg[i].predict())
yhat = pd.DataFrame(np.array(yhat).T,columns = yvars)
if type(newx) == pd.core.frame.DataFrame:
nn = length(newx.iloc[:,0])
newxy = np.array(range(1,q*nn+1))
newxy = pd.DataFrame(newxy.reshape([q,nn]).T)
newxy = pd.concat([newxy, newx],axis=1)
newxy.columns = names
yhatn = []
for i in range(length(yvars)):
yhatn.append(mvreg[i].predict(newx))
yhatn = pd.DataFrame(yhatn).T
yhatn.columns = yvars
yhat = pd.concat([yhat,yhatn])
yhat.index = range(length(yhat.iloc[:,0]))
newx = pd.DataFrame(patsy.dmatrix(formI,data=newxy)).iloc[:, 1:]
xall = pd.concat([x,newx])
xall.columns = xformulanames
xall.index = range(length(xall.iloc[:,0]))
N = length(xall.iloc[:,0])
A = np.linalg.inv(np.linalg.multi_dot([x.T,Pn,x]))
d2 = []
for i in range(N):
d2.append((1/n)+np.linalg.multi_dot([xall.iloc[i]-xbar,A,xall.iloc[i]-xbar]))
H2 = []
L = []
c1 = []
c2 = []
c3 = []
for i in range(B):
H2.append(np.array(st.chi2.rvs(size = int(N*q),df=1)).reshape([q,N]))
H2[i] = np.array([t*d2 for t in H2[i]])
for i in range(B):
L.append(np.array(sorted(np.linalg.eigvals(rwishart(fm,q)),reverse=True)).T)
L = np.array(L)
for i in range(B):
c1.append(sum((1+H2[i]**2)/L[:,0][i]))
c2.append(sum((1+2*H2[i]**2)/L[:,0][i]**2))
c3.append(sum((1+3*H2[i]**2)/L[:,0][i]**3))
a = np.array(c2)**3/np.array(c3)**2
c1 = np.array(c1)
c2 = np.array(c2)
Tall = []
for i in range(N):
Tall.append((fm * ((np.sqrt(c2[:,i])/a[:,i]) * (st.chi2.ppf(P, a[:,i]) - a[:,i]) + c1[:,i])))
k = list(map(lambda p: np.quantile(p,1-alpha),Tall))
k = pd.DataFrame(k)
tol = pd.concat([k,yhat,xall],axis =1)
names = [n + '.hat' for n in names[0:-1]]
formI = formI.split("+")
returncol = []
returncol.append('kfactor')
returncol.extend(names)
returncol.extend(formI)
tol.columns = returncol
print(f"These are the {(1-alpha)*100}%/{P*100}% tolerance factors")
if type(newx) == pd.core.frame.DataFrame:
indexnames = [list(range(length(X))),list(range(length(newxy)))]
indexnames = [item for sublist in indexnames for item in sublist]
indexnames[:length(X)] = ['X'+str(i+1) for i in indexnames[:length(X)]]
indexnames[length(X):] = ['X'+str(i+1)+'.1' for i in indexnames[length(X):]]
#print(indexnames)
else:
indexnames = range(length(X))
indexnames = ['X'+str(i+1) for i in indexnames]
tol.index = indexnames#range(length(X)),range(length(newxy))
return tol
# example:
# grain = pd.DataFrame([40, 17, 9, 15, 6, 12, 5, 9],columns = ['grain'])
# straw = pd.DataFrame([53, 19, 10, 29, 13, 27, 19, 30], columns = ['straw'])
# flower = pd.DataFrame([90,21,45,67,23,28,60,99],columns = ['flower'])
# fert = pd.DataFrame([24, 11, 5, 12, 7, 14, 11, 18],columns = ['fert'])
# ## Must be in order, y's on left, x on right
# df = pd.concat([grain,straw,flower,fert],axis = 1)
# #optional argument
# newx = pd.DataFrame({'fert':[10,15,20]})
# #######no need to touch
# dfy = []
# for i in range(length(df.iloc[0]-1)):
# dfy.append(df.iloc[:,i])
# dfy = dfy[:length(dfy)-1]
# dfy = pd.DataFrame(dfy).T
# ##############
# #your right side (x) of the equation, this must be changed to fit your equation
# formulaI = 'fert + I(fert**2)'
# #######no need to touch
# lm = []
# for i in range(length(dfy.iloc[0])):
# formula = formulaI
# formula = 'dfy.iloc[:,i] ~ ' + formula
# lm.append(ols(formula,data=df).fit())
# ##############
# print(mvregtolregion(lm, formulaI, df, newx, alpha = 0.01, P = 0.95, B = 5000))