# normtolint.py

import scipy.stats 
import scipy.integrate as integrate
import numpy as np
import scipy.optimize as opt
import pandas as pd
import warnings
import statistics
warnings.filterwarnings('ignore')

def Kfactor(n, f = None, alpha = 0.05, P = 0.99, side = 1, method = 'HE', m=50):
    '''
    Compute the k-factor used to build normal tolerance limits.

Parameters
----------
    n: int
        Sample size.

    f: int, optional
        Degrees of freedom associated with the variance estimate.
        The default is None, in which case n-1 is used.

    alpha: float, optional
        The level chosen such that 1-alpha is the confidence level.
        The default is 0.05.

    P: float, optional
        The proportion of the population to be covered.
        The default is 0.99.

    side: 1 or 2, optional
        1 for a one-sided factor, 2 for a two-sided factor.
        The default is 1.

    method: string, optional
        One of "AFIT", "HE", "HE2", "WBE", "ELL", "KM", "OCT", "EXACT".
        For side = 1 only "AFIT" selects a dedicated closed-form
        approximation; every other value uses the noncentral-t quantile.
        The default is "HE".

    m: int, optional
        Maximum number of subintervals handed to scipy.integrate.quad.
        Only used when side = 2 and method is "OCT" or "EXACT".
        The default is 50.

Returns
-------
    The k-factor as a number. For compatibility with existing callers two
    non-numeric results are possible:
        - a string with an error message when side is neither 1 nor 2;
        - None when side = 2 and method is not one of the names above.
    With side = 2 and method = "HE" the value 0 is returned when the
    small-f formula evaluates to NaN.
    '''
    if f is None:
        f = n - 1
    # NOTE: an earlier revision carried a length comparison between n and f
    # here; it compared two 1-tuples and could never trigger, so it was
    # dropped. Scalar n and f are assumed by the two-sided methods below.
    if side != 1 and side != 2:
        return 'Must specify one sided or two sided procedure'
    if side == 1:
        return _kfactor_one_sided(n, f, alpha, P, method)
    return _kfactor_two_sided(n, f, alpha, P, method, m)


def _kfactor_one_sided(n, f, alpha, P, method):
    # One-sided factor: AFIT closed form, otherwise noncentral-t quantile.
    zp = scipy.stats.norm.ppf(P)
    if method == 'AFIT':
        zp2 = zp**2
        za2 = scipy.stats.norm.ppf(alpha)**2
        # This branch uses n-1 directly rather than the supplied f.
        a = 1 - za2 / (2 * (n - 1))
        b = zp2 - (za2 / n)
        return (zp + np.sqrt(zp2 - a * b)) / a
    ncp = np.sqrt(n) * zp
    ta = scipy.stats.nct.ppf(1 - alpha, df=f, nc=ncp)
    return ta / np.sqrt(n)


def _kfactor_two_sided(n, f, alpha, P, method, m):
    # Two-sided factor: dispatch on the requested approximation.
    if method == 'AFIT':
        zomp = -scipy.stats.norm.ppf((1 - P) / 2)
        # NOTE(review): this branch takes the 1-alpha chi-square quantile,
        # whereas the other methods take the alpha quantile -- confirm
        # against the AFIT reference.
        chi2_q = scipy.stats.chi2.ppf(1 - alpha, f)
        return zomp * np.sqrt((f * (1 + (1 / n))) / chi2_q)
    if method == 'HE':
        return _howe_kfactor(n, f, P, alpha)

    chia = scipy.stats.chi2.ppf(alpha, df=f)
    if method == 'HE2':
        zp = scipy.stats.norm.ppf((1 + P) / 2)
        return zp * np.sqrt((1 + 1 / n) * f / chia)
    if method == 'WBE':
        r = _newton_half_width(P, 1 / np.sqrt(n))
        return r * np.sqrt(f / chia)
    if method == 'ELL':
        if f < n**2:
            print("Warning Message:\nThe ellison method should only be used for f appreciably larger than n^2")
        zp = scipy.stats.norm.ppf((1 + P) / 2)
        r = _newton_half_width(P, zp / np.sqrt(n))
        return r * np.sqrt(f / chia)
    if method in ('KM', 'OCT', 'EXACT'):
        # Noncentral chi-square based factor; returned as-is for KM and used
        # as the starting point / bracket for OCT and EXACT.
        k2 = np.sqrt(f * scipy.stats.ncx2.ppf(P, df=1, nc=(1 / n)) / chia)
        if method == 'KM':
            return k2
        if method == 'OCT':
            return _owen_kfactor(n, f, alpha, P, m, k2)
        return _exact_kfactor(n, f, alpha, P, m, k2)
    # Unrecognised method: keep the historical behaviour of returning None.
    return None


def _howe_kfactor(n, f, P, alpha):
    # "HE" factor: one formula when f exceeds dfcut, another otherwise.
    chia = scipy.stats.chi2.ppf(alpha, df=f)
    zp = scipy.stats.norm.ppf((1 + P) / 2)
    za = scipy.stats.norm.ppf((2 - alpha) / 2)
    dfcut = n**2 * (1 + (1 / za**2))
    if f > dfcut:
        V = 1 + (za**2) / n + ((3 - zp**2) * za**4) / (6 * n**2)
        return zp * np.sqrt(V * (1 + (n * V / (2 * f)) * (1 + 1 / za**2)))
    G = (f - 2 - chia) / (2 * (n + 1)**2)
    K = zp * np.sqrt(((f * (1 + 1 / n)) / chia) * (1 + G))
    # NaN never compares equal to anything, so it has to be detected with
    # np.isnan; a NaN here (negative radicand) is mapped to 0.
    if np.isnan(K):
        K = 0
    return K


def _newton_half_width(P, shift):
    # Newton iteration for r with Phi(shift + r) - Phi(shift - r) = P.
    r = 0.5
    delta = 1
    while abs(delta) > 0.00000001:
        delta = scipy.stats.norm.cdf(shift + r) - scipy.stats.norm.cdf(shift - r) - P
        slope = scipy.stats.norm.pdf(shift + r) + scipy.stats.norm.pdf(shift - r)
        r = r - delta / slope
    return r


def _owen_kfactor(n, f, alpha, P, m, k_start):
    # "OCT" factor: minimise |integral - (1 - alpha)| starting from k_start.
    delta = np.sqrt(n) * scipy.stats.norm.ppf((1 + P) / 2)
    # The integration range is truncated at n*1000 once n reaches 75.
    upper = np.inf if n < 75 else n * 1000

    def integrand(z, ke):
        return (2 * scipy.stats.norm.cdf(-delta + (ke * np.sqrt(n * z)) / (np.sqrt(f))) - 1) * scipy.stats.chi2.pdf(z, f)

    def objective(ke):
        # The optimiser passes a 1-element array; work with a plain scalar.
        ke = float(np.ravel(ke)[0])
        lower = f * delta**2 / (ke**2 * n)
        value = integrate.quad(integrand, a=lower, b=upper, args=(ke,), limit=m)[0]
        return abs(value - (1 - alpha))

    solution = opt.minimize(fun=objective, x0=k_start, method='L-BFGS-B')['x']
    # Index explicitly: float() on an ndim > 0 array is deprecated in NumPy.
    return float(np.ravel(solution)[0])


def _exact_kfactor(n, f, alpha, P, m, k_start):
    # "EXACT" factor: root of the coverage equation on [0, k_start + 1000/n].
    def integrand(z, X):
        return (scipy.stats.chi2.sf(f * scipy.stats.ncx2.ppf(P, 1, z**2) / X**2, df=f) * np.exp(-0.5 * n * z**2))

    def coverage_gap(X):
        integral = integrate.quad(integrand, a=0, b=5, args=(X,), limit=m)[0]
        return np.sqrt(2 * n / np.pi) * integral - (1 - alpha)

    return opt.brentq(f=coverage_gap, a=0, b=k_start + (1000) / n)

def normtolint(x, alpha = 0.05, P = 0.99, side = 1, method = 'HE', m = 50, lognorm = False):
    '''
    normtolint(x, alpha = 0.05, P = 0.99, side = 1, method = ["AFIT", "HE", "HE2", "WBE", "ELL", "KM", "EXACT", "OCT"], m = 50, lognorm = False):

    Compute 1-sided or 2-sided tolerance limits for normal (or log-normal) data.

Parameters
----------
    x: list
        A vector of data which is distributed according to either a normal 
        distribution or a log-normal distribution. At least two observations
        are needed to estimate the standard deviation.
    
    alpha: float, optional
        The level chosen such that 1-alpha is the confidence level.
        The default is 0.05.
    
    P: float, optional
        The proportion of the population to be covered by this tolerance 
        interval. The default is 0.99.
    
    side: 1 or 2, optional
        Whether a 1-sided or 2-sided tolerance interval is required 
        (determined by side = 1 or side = 2, respectively). The default is 1.
    
    method: string, optional
        The method for calculating the k-factors. For 1-sided tolerance 
        intervals every method except "AFIT" uses the same exact k-factor.
            "AFIT" is the Air Force Institute of Technology's Howe method. 
            
            "HE" is the general Howe method and is often viewed as being 
            extremely accurate, even for small sample sizes. 
        
            "HE2" is a second method due to Howe, which performs similarly to the 
            Weissberg-Beatty method, but is computationally simpler. 
        
            "WBE" is the Weissberg-Beatty method 
            (also called the Wald-Wolfowitz method), which performs similarly to 
            the first Howe method for larger sample sizes. 
            
            "ELL" is the Ellison correction to the Weissberg-Beatty method when f 
            is appreciably larger than n^2. A warning message is displayed if f is
            not larger than n^2. 
            
            "KM" is the Krishnamoorthy-Mathew approximation 
            to the exact solution, which works well for larger sample sizes. 
            
            "EXACT" computes the k-factor exactly by finding the integral solution 
            to the problem via the integrate function. Note the computation time 
            of this method is largely determined by m. 
            
            "OCT" is the Owen approach to compute the k-factor when controlling 
            the tails so that there is not more than (1-P)/2 of the data in each 
            tail of the distribution.
            
        The default is "HE"
    
    m: int, optional 
        The maximum number of subintervals to be used in the integrate 
        function. This is necessary only for method = "EXACT" and method = 
        "OCT". The larger the number, the more accurate the solution. Too low 
        of a value can result in an error. A large value can also cause the 
        function to be slow for method = "EXACT". The default is m = 50.

    lognorm: bool, optional
        If True, then the data is considered to be from a log-normal 
        distribution, in which case the output gives tolerance intervals for 
        the log-normal distribution. The default is False.
    
Details
    Recall that if the random variable X is distributed according to a 
    log-normal distribution, then the random variable Y = ln(X) is distributed 
    according to a normal distribution.
    
Returns
-------
  normtolint returns a data frame with items:
        
    alpha: 
        The specified significance level.
    P: 
        The proportion of the population covered by this tolerance interval.
    mean:
        The sample mean. When lognorm is True this is exp() of the mean of 
        the logged data.
    1-sided.lower: 
        The 1-sided lower tolerance bound. This is given only if side = 1.
    1-sided.upper: 
        The 1-sided upper tolerance bound. This is given only if side = 1.
    2-sided.lower: 
        The 2-sided lower tolerance bound. This is given only if side = 2.
    2-sided.upper: 
        The 2-sided upper tolerance bound. This is given only if side = 2.

Raises
------
    ValueError
        If side is neither 1 nor 2, or if no k-factor could be computed for 
        the requested method.

References
----------
    Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance 
        Intervals. Journal of Statistical Software, 36(5), 1-39. 
        URL http://www.jstatsoft.org/v36/i05/.
    
    Howe, W. G. (1969), Two-Sided Tolerance Limits for Normal Populations - 
        Some Improvements, Journal of the American Statistical Association, 
        64, 610–620.

    Wald, A. and Wolfowitz, J. (1946), Tolerance Limits for a Normal 
        Distribution, Annals of Mathematical Statistics, 17, 208–215.

    Weissberg, A. and Beatty, G. (1969), Tables of Tolerance Limit Factors 
        for Normal Distributions, Technometrics, 2, 483–500.
        
Examples
--------
    ## 95%/95% 2-sided normal tolerance intervals for a sample of size 100. 
    
        x = np.random.normal(size=100)
        
        # normtolint(x, alpha = 0.05, P = 0.95, side = 2, 
        #             method = "HE", lognorm = False)
    '''
    # Reject a bad `side` up front instead of failing later with an obscure
    # TypeError when the k-factor turns out not to be a number.
    if side != 1 and side != 2:
        raise ValueError('Must specify one sided or two sided procedure')

    if lognorm:
        x = np.log(x)
    xbar = np.mean(x)
    s = statistics.stdev(x)
    n = len(x)

    K = Kfactor(n, alpha=alpha, P=P, side=side, method=method, m=m)
    # Kfactor reports problems by returning a message string or None.
    if isinstance(K, str):
        raise ValueError(K)
    if K is None:
        raise ValueError('No k-factor available for method {!r}'.format(method))

    lower = xbar - s * K
    upper = xbar + s * K
    if lognorm:
        # Map everything back to the original (log-normal) scale.
        lower = np.exp(lower)
        upper = np.exp(upper)
        xbar = np.exp(xbar)

    prefix = '1-sided' if side == 1 else '2-sided'
    columns = ['alpha', 'P', 'mean', prefix + '.lower', prefix + '.upper']
    return pd.DataFrame([[alpha, P, xbar, lower, upper]], columns=columns)

# x =  [6, 2, 1, 4, 8, 3, 3, 14, 2, 1, 21, 5, 18, 2, 3, 10, 8, 2, 
#                   11, 4, 16, 13, 17, 1, 7, 1, 1, 8, 19, 27, 2, 7, 7, 3, 1,
#                   15, 1, 16, 9, 9, 7, 29, 3, 10, 3, 1, 20, 8, 12, 6, 1, 5, 1,
#                   5, 23, 3, 3, 14, 6, 9, 1, 4, 5, 11, 5, 1, 5, 5, 4, 10, 1,
#                   12, 1, 3, 4, 2, 9, 2, 1, 5, 6, 8, 2, 1, 1, 1, 4, 6, 7, 26, 
#                   10, 2, 1, 2, 17, 4, 3, 2, 8, 2]
#x = scipy.stats.norm.rvs(size = 100000)

# F = False
# T = True
#print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "AFIT", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "HE", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "HE2", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "HE2", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "WBE", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "WBE", lognorm = T))

# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "ELL", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "ELL", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "KM", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "KM", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "EXACT", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "EXACT", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "OCT", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 2, method = "OCT", lognorm = T))

#print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "HE", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "HE", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "HE2", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "HE2", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "WBE", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "WBE", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "ELL", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "ELL", lognorm = T))

# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "KM", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "KM", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "EXACT", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "EXACT", lognorm = T))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "OCT", lognorm = F))
# print(normtolint(x, alpha = 0.05, P = 0.95, side = 1, method = "OCT", lognorm = T))