ATLAS Offline Software
python.test_statistics Namespace Reference

Functions

def ks_2samp (data1, data2, binned=False)
 
def chi2_2samp (data1, data2, normed=True, binned=True)
 
def BDM_2samp (data1, data2, normed=True, binned=True)
 
def CVM_2samp (data1, data2, normed=True, binned=False)
 
def _anderson_ksamp_midrank_binned (data, Z, Zstar, k, n, N)
 
def _anderson_ksamp_midrank (samples, Z, Zstar, k, n, N)
 
def _anderson_ksamp_right (samples, Z, Zstar, k, n, N)
 
def anderson_ksamp (data1, data2, binned=False, midrank=True)
 
def _log_fac (m, n)
 
def _log_binomial (n, k)
 
def _loglikelihood (data1, data2, N_data1, N_data2, ratio, H="H0")
 
def _zloglikelihood (data1, data2, N_data1, N_data2, H="H0")
 
def likelihoodratio_ksamp (data1, data2, normed=True, binned=True)
 
def likelihoodvalue_ksamp (data1, data2, normed=True, binned=True)
 

Variables

bool DEBUG = False
 
list statistic_seq = []
 
 Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue','ndf'))
 
 Chi2_2sampResult = namedtuple('Chi2_2sampResult', ('statistic', 'pvalue','ndf'))
 
 BDM_2sampResult = namedtuple('BDM_2sampResult', ('statistic', 'pvalue', 'ndf'))
 
 CVM_2sampResult = namedtuple('CVM_2sampResult', ('statistic', 'pvalue', 'ndf'))
 
 Anderson_ksampResult = namedtuple('Anderson_ksampResult', ('statistic', 'pvalue', 'ndf'))
 
 LikelihoodRatio_ksampResult = namedtuple('LikelihoodRatio_ksampResult', ('statistic', 'pvalue', 'ndf'))
 
 LikelihoodValue_ksampResult = namedtuple('LikelihoodValue_ksampResult', ('statistic', 'pvalue', 'ndf'))
 

Function Documentation

◆ _anderson_ksamp_midrank()

def python.test_statistics._anderson_ksamp_midrank (   samples,
  Z,
  Zstar,
  k,
  n,
  N 
)
private
Compute A2akN, equation 7 of Scholz and Stephens.

Parameters
----------
samples : sequence of 1-D array_like
    Array of sample arrays.
Z : array_like
    Sorted array of all observations.
Zstar : array_like
    Sorted array of unique observations.
k : int
    Number of samples.
n : array_like
    Number of observations in each sample.
N : int
    Total number of observations.

Returns
-------
A2akN : float
    The A2akN statistic of Scholz and Stephens 1987.

Definition at line 309 of file test_statistics.py.

309 def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
310     """
311     Compute A2akN, equation 7 of Scholz and Stephens.
312 
313     Parameters
314     ----------
315     samples : sequence of 1-D array_like
316         Array of sample arrays.
317     Z : array_like
318         Sorted array of all observations.
319     Zstar : array_like
320         Sorted array of unique observations.
321     k : int
322         Number of samples.
323     n : array_like
324         Number of observations in each sample.
325     N : int
326         Total number of observations.
327 
328     Returns
329     -------
330     A2akN : float
331         The A2akN statistic of Scholz and Stephens 1987.
332     """
333 
334     if DEBUG:
335         global statistic_seq
336         statistic_seq = []
337 
338     A2akN = 0.
339     Z_ssorted_left = Z.searchsorted(Zstar, 'left')
340     if N == Zstar.size:
341         lj = 1.
342     else:
343         lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
344     Bj = Z_ssorted_left + lj / 2.
345     for i in np.arange(0, k):
346         s = np.sort(samples[i])
347         s_ssorted_right = s.searchsorted(Zstar, side='right')
348         Mij = s_ssorted_right.astype(float)
349         fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
350         Mij -= fij / 2.
351         inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
352         A2akN += inner.sum() / n[i]
353 
354     if DEBUG:
355         statistic_seq.append(A2akN*(N-1.)/N)
356 
357     A2akN *= (N - 1.) / N
358     return A2akN
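For orientation, here is a minimal self-contained sketch (plain NumPy, toy data; every name below is mine, not part of the module) of how this helper's inputs Z, Zstar, k, n and N are assembled from two raw samples and how the equation-7 sum is then accumulated:

import numpy as np

x = np.array([1., 2., 2., 4.])               # hypothetical sample 1
y = np.array([2., 3., 5.])                   # hypothetical sample 2
samples = [x, y]
Z = np.sort(np.hstack(samples))              # all observations, sorted
Zstar = np.unique(Z)                         # distinct observations, sorted
k = len(samples)                             # number of samples
n = np.array([s.size for s in samples])      # observations per sample
N = Z.size                                   # total number of observations

A2akN = 0.
left = Z.searchsorted(Zstar, 'left')
lj = Z.searchsorted(Zstar, 'right') - left   # multiplicity of each Z*_j
Bj = left + lj / 2.                          # midrank of Z*_j in the pooled sample
for i in range(k):
    s = np.sort(samples[i])
    right = s.searchsorted(Zstar, side='right')
    Mij = right - (right - s.searchsorted(Zstar, 'left')) / 2.  # midrank counts
    A2akN += (lj / float(N) * (N*Mij - Bj*n[i])**2
              / (Bj*(N - Bj) - N*lj/4.)).sum() / n[i]
A2akN *= (N - 1.) / N                        # the statistic returned above
print(A2akN)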

◆ _anderson_ksamp_midrank_binned()

def python.test_statistics._anderson_ksamp_midrank_binned (   data,
  Z,
  Zstar,
  k,
  n,
  N 
)
private

Definition at line 288 of file test_statistics.py.

288 def _anderson_ksamp_midrank_binned(data, Z, Zstar, k, n, N):
289 
290     if DEBUG:
291         global statistic_seq
292         statistic_seq = []
293 
294     A2akN = 0.
295     lj = Z
296     Bj = Z.cumsum() - lj/2
297     for i in np.arange(0, k):
298         d = data[i]
299         Mij = d.cumsum() - d/2
300         inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
301         A2akN += np.nan_to_num(inner).sum() / n[i]
302 
303     if DEBUG:
304         statistic_seq.append(A2akN*(N-1.)/N)
305 
306     A2akN *= (N - 1.) / N
307     return A2akN

◆ _anderson_ksamp_right()

def python.test_statistics._anderson_ksamp_right (   samples,
  Z,
  Zstar,
  k,
  n,
  N 
)
private
Compute A2kN, equation 6 of Scholz & Stephens.

Parameters
----------
samples : sequence of 1-D array_like
    Array of sample arrays.
Z : array_like
    Sorted array of all observations.
Zstar : array_like
    Sorted array of unique observations.
k : int
    Number of samples.
n : array_like
    Number of observations in each sample.
N : int
    Total number of observations.

Returns
-------
A2kN : float
    The A2kN statistic of Scholz and Stephens 1987.

Definition at line 361 of file test_statistics.py.

361 def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
362     """
363     Compute A2kN, equation 6 of Scholz & Stephens.
364 
365     Parameters
366     ----------
367     samples : sequence of 1-D array_like
368         Array of sample arrays.
369     Z : array_like
370         Sorted array of all observations.
371     Zstar : array_like
372         Sorted array of unique observations.
373     k : int
374         Number of samples.
375     n : array_like
376         Number of observations in each sample.
377     N : int
378         Total number of observations.
379 
380     Returns
381     -------
382     A2kN : float
383         The A2kN statistic of Scholz and Stephens 1987.
384     """
385 
386     A2kN = 0.
387     lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], 'left')
388     Bj = lj.cumsum()
389     for i in range(0, k):
390         s = np.sort(samples[i])
391         Mij = s.searchsorted(Zstar[:-1], side='right')
392         inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
393         A2kN += inner.sum() / n[i]
394     return A2kN

◆ _log_binomial()

def python.test_statistics._log_binomial (   n,
  k 
)
private

Definition at line 542 of file test_statistics.py.

542 def _log_binomial(n, k):
543     if k > (n-k):
544         return (_log_fac(n-k+1, n) - _log_fac(2, k))
545     else:
546         return (_log_fac(k+1, n) - _log_fac(2, n-k))
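As a sanity check on these two helpers, a standalone sketch (my code; scipy.special.gammaln serves as an independent reference) of the identity they implement, log C(n, k) = log n! - log k! - log (n-k)!:

import numpy as np
from scipy.special import gammaln

def log_fac(m, n):
    # log(m * (m+1) * ... * n); log_fac(2, k) == log(k!)
    return np.log(np.arange(m, n + 1)).sum()

def log_binomial(n, k):
    # cancel the larger factorial in n! / (k! (n-k)!) before summing logs
    if k > (n - k):
        return log_fac(n - k + 1, n) - log_fac(2, k)
    return log_fac(k + 1, n) - log_fac(2, n - k)

for n, k in [(10, 3), (52, 5)]:
    ref = gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1)
    assert np.isclose(log_binomial(n, k), ref)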

◆ _log_fac()

def python.test_statistics._log_fac (   m,
  n 
)
private

Definition at line 539 of file test_statistics.py.

539 def _log_fac(m, n):
540     return np.log(np.arange(m, n+1)).sum()
541 @np.vectorize

_log_fac(m, n) returns the sum of logs from m to n, so _log_fac(2, k) equals log(k!). The @np.vectorize decorator on source line 541 belongs to the next definition in the file, _log_binomial.

◆ _loglikelihood()

def python.test_statistics._loglikelihood (   data1,
  data2,
  N_data1,
  N_data2,
  ratio,
  H = "H0" 
)
private

Definition at line 548 of file test_statistics.py.

548 def _loglikelihood(data1, data2, N_data1, N_data2, ratio, H="H0"):
549     t = data1 + data2
550     logC = 0.
551     if not ratio:
552         logC = float(sum(_log_binomial(t[i], data2[i]) for i in range(t.shape[0])))
553     if H == "H1":
554         a = data2/data1
555     if H == "H0":
556         a = N_data2 / N_data1
557 
558     if DEBUG:
559         global statistic_seq
560         statistic_seq = -2 * (logC - scipy.special.xlogy(t, 1+a) + scipy.special.xlogy(data2, a))
561 
562     return -2 * (logC - scipy.special.xlogy(t, 1+a).sum() + scipy.special.xlogy(data2, a).sum())
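Read off the listing above (notation mine; t_i = d1_i + d2_i denotes the summed bin contents), the helper evaluates minus twice the log of a product of binomial likelihoods with success probability a/(1+a):

\[ -2\ln L = -2\sum_i \Big[ \ln\binom{t_i}{d_{2,i}} - t_i \ln(1+a) + d_{2,i}\ln a \Big], \]

where a = N_data2/N_data1 under H0 and a_i = d_{2,i}/d_{1,i} under H1. The binomial-coefficient term (logC) is included only when ratio=False, which is why likelihoodvalue_ksamp calls this with ratio=False while likelihoodratio_ksamp uses ratio=True.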

◆ _zloglikelihood()

def python.test_statistics._zloglikelihood (   data1,
  data2,
  N_data1,
  N_data2,
  H = "H0" 
)
private

Definition at line 564 of file test_statistics.py.

564 def _zloglikelihood(data1, data2, N_data1, N_data2, H="H0"):
565     z_not_in_both = (data1 == 0) ^ (data2 == 0)
566     data1 = data1[z_not_in_both]
567     data2 = data2[z_not_in_both]
568     t = data1 + data2
569     if H == "H0":
570         p = N_data1 / (N_data1 + N_data2)
571 
572         if DEBUG:
573             global statistic_seq
574             statistic_seq = np.concatenate([statistic_seq, -2 * scipy.special.xlogy(t[data2 == 0], p), scipy.special.xlogy(t[data1 == 0], 1-p)])
575 
576         return -2 * (scipy.special.xlogy(t[data2 == 0], p).sum() + scipy.special.xlogy(t[data1 == 0], 1-p).sum())
577     if H == "H1":
578         return 0.
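A hedged reading of the listing: this helper covers the bins where exactly one of the two histograms is empty (the XOR mask), for which the binomial coefficient equals 1. Under H0 it returns

\[ -2\Big[ \sum_{i:\,d_{2,i}=0} t_i \ln p \;+\; \sum_{i:\,d_{1,i}=0} t_i \ln(1-p) \Big], \qquad p = \frac{N_{\mathrm{data1}}}{N_{\mathrm{data1}}+N_{\mathrm{data2}}}, \]

while under H1 such bins are fitted exactly and contribute 0.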

◆ anderson_ksamp()

def python.test_statistics.anderson_ksamp (   data1,
  data2,
  binned = False,
  midrank = True 
)
The Anderson-Darling test for 2 samples.

It tests the null hypothesis
that the 2 samples are drawn from the same population without
having to specify the distribution function of that population.
The critical values depend on the number of samples.

Parameters
----------
data1, data2 : 1-D array_like
    The two samples of data to compare.
binned : bool, optional, default: False
    If True, treat the input data as observed frequencies; if False, as
    arrays of sample observations.
midrank : bool, optional, default: True
    If True, use the midrank empirical distribution function; if False, use
    the right-side empirical distribution function (for discrete data).

Returns
-------
statistic : float
    AD statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value.
    Always NaN in this case.

Raises
------
ValueError
If fewer than 2 samples are provided, a sample is empty, or no
    distinct observations are in the samples.

See Also
--------
ks_2samp : 2 sample Kolmogorov-Smirnov test

Notes
-----
This code is modified from scipy.stats.anderson and extended to support
frequency data. See [1]_ for further information.

[2]_ defines three versions of the k-sample Anderson-Darling test:
one for continuous distributions and two for discrete
distributions, in which ties between samples may occur. The
default of this routine is to compute the version based on the
midrank empirical distribution function. This test is applicable
to continuous and discrete data. If midrank is set to False, the
right-side empirical distribution function is used for a test on
discrete data. According to [1]_, the two discrete test statistics
differ only slightly if a few collisions due to round-off errors
occur in the test not adjusted for ties between samples.

References
----------
.. [1] scipy.stats.anderson — SciPy Reference Guide
       http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html
.. [2] Scholz, F. W and Stephens, M. A. (1987), K-Sample
       Anderson-Darling Tests, Journal of the American Statistical
       Association, Vol. 82, pp. 918-924.

Examples
--------
>>> from scipy import stats
>>> np.random.seed(314159)

The null hypothesis that the two random samples come from the same
distribution can be rejected at the 5% level because the returned
test value is greater than the critical value for 5% (1.961) but
not at the 2.5% level. The interpolation gives an approximate
significance level of 3.1%:

>>> stats.anderson_ksamp([np.random.normal(size=50),
... np.random.normal(loc=0.5, size=30)])
(2.4615796189876105,
  array([ 0.325,  1.226,  1.961,  2.718,  3.752]),
  0.03134990135800783)


The null hypothesis cannot be rejected for three samples from an
identical distribution. The approximate p-value (87%) has to be
computed by extrapolation and may not be very accurate:

>>> stats.anderson_ksamp([np.random.normal(size=50),
... np.random.normal(size=30), np.random.normal(size=20)])
(-0.73091722665244196,
  array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856]),
  0.8789283903979661)

Definition at line 398 of file test_statistics.py.

398 def anderson_ksamp(data1, data2, binned=False, midrank=True):
399     """The Anderson-Darling test for 2 samples.
400 
401     It tests the null hypothesis
402     that the 2 samples are drawn from the same population without
403     having to specify the distribution function of that population.
404     The critical values depend on the number of samples.
405 
406     Parameters
407     ----------
408     samples : sequence of 1-D array_like
409         Array of sample data in arrays.
410 
411     Returns
412     -------
413     statistic : float
414         AD statistic
415     pvalue : float
416         two-tailed p-value
417     ndf : NaN
418         "Degrees of freedom": the degrees of freedom for the p-value.
419         Always NaN in this case.
420 
421     Raises
422     ------
423     ValueError
424         If fewer than 2 samples are provided, a sample is empty, or no
425         distinct observations are in the samples.
426 
427     See Also
428     --------
429     ks_2samp : 2 sample Kolmogorov-Smirnov test
430 
431     Notes
432     -----
433     This code is modified from scipy.stats.anderson and extended to
434     support frequency data. See [1]_ for further information.
435 
436     [2]_ defines three versions of the k-sample Anderson-Darling test:
437     one for continuous distributions and two for discrete
438     distributions, in which ties between samples may occur. The
439     default of this routine is to compute the version based on the
440     midrank empirical distribution function. This test is applicable
441     to continuous and discrete data. If midrank is set to False, the
442     right-side empirical distribution function is used for a test on
443     discrete data. According to [1]_, the two discrete test statistics
444     differ only slightly if a few collisions due to round-off errors
445     occur in the test not adjusted for ties between samples.
446 
447     References
448     ----------
449     .. [1] scipy.stats.anderson — SciPy Reference Guide
450            http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html
451     .. [2] Scholz, F. W. and Stephens, M. A. (1987), K-Sample
452            Anderson-Darling Tests, Journal of the American Statistical
453            Association, Vol. 82, pp. 918-924.
454 
455     Examples
456     --------
457     >>> from scipy import stats
458     >>> np.random.seed(314159)
459 
460     The null hypothesis that the two random samples come from the same
461     distribution can be rejected at the 5% level because the returned
462     test value is greater than the critical value for 5% (1.961) but
463     not at the 2.5% level. The interpolation gives an approximate
464     significance level of 3.1%:
465 
466     >>> stats.anderson_ksamp([np.random.normal(size=50),
467     ... np.random.normal(loc=0.5, size=30)])
468     (2.4615796189876105,
469       array([ 0.325,  1.226,  1.961,  2.718,  3.752]),
470       0.03134990135800783)
471 
472 
473     The null hypothesis cannot be rejected for three samples from an
474     identical distribution. The approximate p-value (87%) has to be
475     computed by extrapolation and may not be very accurate:
476 
477     >>> stats.anderson_ksamp([np.random.normal(size=50),
478     ... np.random.normal(size=30), np.random.normal(size=20)])
479     (-0.73091722665244196,
480       array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856]),
481       0.8789283903979661)
482 
483     """
484     samples = [data1, data2]
485     k = len(samples)
486     if (k < 2):
487         raise ValueError("anderson_ksamp needs at least two samples")
488 
489     samples = list(map(np.asarray, samples))
490     if binned:
491         Z = Zstar = sum(samples)
492         n = np.array([sample.sum() for sample in samples])
493         N = n.sum()
494         A2kN = _anderson_ksamp_midrank_binned(samples, Z, Zstar, k, n, N)
495     else:
496         Z = np.sort(np.hstack(samples))
497         N = Z.size
498         Zstar = np.unique(Z)
499         if Zstar.size < 2:
500             raise ValueError("anderson_ksamp needs more than one distinct "
501                              "observation")
502 
503         n = np.array([sample.size for sample in samples])
504         if any(n == 0):
505             raise ValueError("anderson_ksamp encountered sample without "
506                              "observations")
507 
508         if midrank:
509             A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
510         else:
511             A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
512 
513     H = (1. / n).sum()
514     hs_cs = (1. / np.arange(N - 1, 1, -1)).cumsum()
515     h = hs_cs[-1] + 1
516     g = (hs_cs / np.arange(2, N)).sum()
517 
518     a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
519     b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
520     c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
521     d = (2*h + 6)*k**2 - 4*h*k
522     sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
523     m = k - 1
524     A2 = (A2kN - m) / np.sqrt(sigmasq)
525 
526     # The b_i values are the interpolation coefficients from Table 2
527     # of Scholz and Stephens 1987
528     b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326])
529     b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822])
530     b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396])
531     critical = b0 + b1 / np.sqrt(m) + b2 / m
532     pf = np.polyfit(critical, np.log(np.array([0.25, 0.1, 0.05, 0.025, 0.01])), 2)
533     if A2 < critical.min() or A2 > critical.max():
534         warnings.warn("approximate p-value will be computed by extrapolation")  # noqa: B028 (no stacklevel needed)
535 
536     p = np.exp(np.polyval(pf, A2))
537     return Anderson_ksampResult(A2, p, float('nan'))
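A hypothetical usage sketch (the import path and the toy data are assumptions, not part of this page):

import numpy as np
from python import test_statistics as ts   # assumed Athena-style import path

rng = np.random.default_rng(0)
a = rng.normal(size=200)                   # toy sample 1
b = rng.normal(loc=0.3, size=150)          # toy sample 2

res = ts.anderson_ksamp(a, b)              # unbinned, midrank version
print(res.statistic, res.pvalue, res.ndf)  # ndf is always NaN for this test

# Binned variant: two histograms filled on the same binning.
h1, edges = np.histogram(a, bins=20, range=(-4, 4))
h2, _ = np.histogram(b, bins=edges)
res_binned = ts.anderson_ksamp(h1.astype(float), h2.astype(float), binned=True)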

◆ BDM_2samp()

def python.test_statistics.BDM_2samp (   data1,
  data2,
  normed = True,
  binned = True 
)
The Bhattacharyya test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The Bhattacharyya distance measure.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape comparison)
    or k (value comparison) degrees of freedom, where k is the number of bins
    with nonzero counts in at least one histogram.

Definition at line 196 of file test_statistics.py.

196 def BDM_2samp(data1, data2, normed=True, binned=True):
197     """
198     The Bhattacharyya test tests the null hypothesis that 2 given sets of
199     observed frequencies are drawn from the same distribution.
200 
201     Parameters
202     ----------
203     data1, data2 : sequence of 1-D ndarrays
204         Input data. Observed frequencies in each category.
205     normed : bool, optional
206         If True, shape comparison test; if False, value comparison test.
207     binned : bool, optional
208         If True, treat the input data as observed frequencies; if False, as samples.
209 
210     Returns
211     -------
212     statistic : float
213         The Bhattacharyya distance measure.
214     pvalue : float
215         Corresponding p-value.
216     ndf : int
217         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
218         is computed using a chi-squared distribution with k - 1 (shape) or k
219         (value) degrees of freedom, where k is the number of bins with nonzero
220         counts in at least one histogram.
221     """
222     if binned:
223         filter = ~((data1 == 0.) & (data2 == 0.))
224         data1 = data1[filter]
225         data2 = data2[filter]
226     ndf = data1.shape[0]
227     N_data1 = data1.sum()
228     N_data2 = data2.sum()
229     bdm = np.sqrt(data1 * data2).sum()
230     approx_chi2 = 2*N_data1 + 2*N_data2 - 4*bdm
231     if normed:
232         ndf -= 1
233         bdm /= np.sqrt(N_data1 * N_data2)
234     if DEBUG:
235         global statistic_seq
236         statistic_seq = 2*data1 + 2*data2 - 4*np.sqrt(data1 * data2)
237     return BDM_2sampResult(bdm, scipy.stats.chi2.sf(approx_chi2, ndf), ndf)
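For orientation, a minimal standalone sketch (toy histograms, plain NumPy/SciPy; the names are mine, not part of the module) of the quantities the listing builds:

import numpy as np
import scipy.stats

h1 = np.array([10., 20., 30., 25.])     # hypothetical observed frequencies
h2 = np.array([12., 18., 33., 21.])
N1, N2 = h1.sum(), h2.sum()

bdm = np.sqrt(h1 * h2).sum()            # Bhattacharyya distance measure
approx_chi2 = 2*N1 + 2*N2 - 4*bdm       # approximately chi2-distributed under H0
ndf = h1.size - 1                       # shape comparison (normed=True)
pvalue = scipy.stats.chi2.sf(approx_chi2, ndf)
print(bdm / np.sqrt(N1 * N2), pvalue)   # normalized measure and p-value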

◆ chi2_2samp()

def python.test_statistics.chi2_2samp (   data1,
  data2,
  normed = True,
  binned = True 
)
The chi-square test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The chi-squared test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape) or k (value)
    degrees of freedom, where k is the number of bins with nonzero counts in at
    least one histogram.

Note
----
This code is modified from scipy.stats.chisquare and extended to support the
2-sample case and shape comparison tests.

Definition at line 142 of file test_statistics.py.

142 def chi2_2samp(data1, data2, normed=True, binned=True):
143     """
144     The chi-square test tests the null hypothesis that 2 given sets of observed
145     frequencies are drawn from the same distribution.
146 
147     Parameters
148     ----------
149     data1, data2 : sequence of 1-D ndarrays
150         Input data. Observed frequencies in each category.
151     normed : bool, optional, default: True
152         If True, shape comparison test; if False, value comparison test.
153     binned : bool, optional, default: True
154         If True, treat the input data as observed frequencies; if False, as samples.
155 
156     Returns
157     -------
158     statistic : float
159         The chi-squared test statistic.
160     pvalue : float
161         Corresponding p-value.
162     ndf : int
163         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
164         is computed using a chi-squared distribution with k - 1 (shape) or k
165         (value) degrees of freedom, where k is the number of bins with nonzero
166         counts in at least one histogram.
167 
168     Note
169     ----
170     This code is modified from scipy.stats.chisquare and extended to support
171     the 2-sample case and shape comparison tests.
172     """
173     if binned is True:
174         filter = ~((data1 == 0.) & (data2 == 0.))
175         data1 = data1[filter]
176         data2 = data2[filter]
177     ndf = data1.shape[0]
178     if not normed:
179         chi2 = np.power(data1 - data2, 2).dot(np.power(data1 + data2, -1))
180     else:
181         ndf -= 1
182         N_data1 = data1.sum()
183         N_data2 = data2.sum()
184         data1 /= N_data1
185         data2 /= N_data2
186         chi2 = np.power(data1 - data2, 2).dot(np.power(data1/N_data1 + data2/N_data2, -1))
187     if DEBUG:
188         global statistic_seq
189         if not normed:
190             statistic_seq = np.power(data1 - data2, 2) * (np.power(data1 + data2, -1))
191         else:
192             statistic_seq = np.power(data1 - data2, 2) * np.power(data1/N_data1 + data2/N_data2, -1)
193     return Chi2_2sampResult(chi2, scipy.stats.chi2.sf(chi2, ndf), ndf)
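In formula form (notation mine, read off the listing): the value comparison (normed=False) uses

\[ \chi^2 = \sum_i \frac{(d_{1,i}-d_{2,i})^2}{d_{1,i}+d_{2,i}}, \]

while the shape comparison (normed=True) first normalizes u_i = d_{1,i}/N_1 and v_i = d_{2,i}/N_2 and uses

\[ \chi^2 = \sum_i \frac{(u_i-v_i)^2}{u_i/N_1 + v_i/N_2}. \]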

◆ CVM_2samp()

def python.test_statistics.CVM_2samp (   data1,
  data2,
  normed = True,
  binned = False 
)
Computes the Cramér-von Mises statistic on 2 samples/frequencies.

This is a two-sided test for the null hypothesis that 2 independent samples
are drawn from the same continuous distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Can be either observed frequencies in each category or arrays
    of sample observations.
normed : bool, optional, default: True
    Present for interface consistency; not used by this test.
binned : bool, optional, default: False
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    CVM statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.

Definition at line 240 of file test_statistics.py.

240 def CVM_2samp(data1, data2, normed=True, binned=False):
241     """
242     Computes the Cramér-von Mises statistic on 2 samples/frequencies.
243 
244     This is a two-sided test for the null hypothesis that 2 independent samples
245     are drawn from the same continuous distribution.
246 
247     Parameters
248     ----------
249     data1, data2 : sequence of 1-D ndarrays
250         Input data. Can be either observed frequencies in each category or arrays
251         of sample observations.
252     binned : bool, optional, default: False
253         If True, treat the input data as observed frequencies; if False, as samples.
254 
255     Returns
256     -------
257     statistic : float
258         CVM statistic
259     pvalue : float
260         two-tailed p-value
261     ndf : NaN
262         "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.
263     """
264     N1 = data1.shape[0]
265     N2 = data2.shape[0]
266     N = N1 + N2
267     if binned:
268         E1 = data1.sum()
269         E2 = data2.sum()
270         cdf1 = np.cumsum(data1) / E1
271         cdf2 = np.cumsum(data2) / E2
272         weights = data1 + data2
273     else:
274         E1 = N1
275         E2 = N2
276         data_all = np.concatenate([data1, data2])
277         cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*N1)
278         cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*N2)
279         weights = 1.
280     cvm = (weights * (cdf1 - cdf2)**2).sum() * E1 * E2 / (E1 + E2)**2
281     mean = 1./6 + 1./6/N
282     var = (N + 1.) * (4.*N1*N2*N - 3*(N1**2 + N2**2) - 2*N1*N2) / (4 * N1 * N2) / N**2 / 45
283     if DEBUG:
284         global statistic_seq
285         statistic_seq = (weights * (cdf1 - cdf2)**2) * E1 * E2 / (E1 + E2)**2
286     return CVM_2sampResult(cvm, 1 - distr.CVM(cvm, mean, var), float("nan"))
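What the listing computes, in my notation (a hedged summary, not from the page): with empirical CDFs F_1, F_2 evaluated at the pooled points z_j and weights w_j (the combined bin contents when binned=True, 1 otherwise),

\[ T = \frac{E_1 E_2}{(E_1+E_2)^2} \sum_j w_j \big( F_1(z_j) - F_2(z_j) \big)^2 , \]

and the p-value is obtained from an asymptotic Cramér-von Mises distribution matched to the mean 1/6 + 1/(6N) and the variance computed at source line 282 (via the internal distr.CVM helper).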

◆ ks_2samp()

def python.test_statistics.ks_2samp (   data1,
  data2,
  binned = False 
)
Computes the Kolmogorov-Smirnov statistic on 2 samples/frequencies.

This is a two-sided test for the null hypothesis that 2 independent samples
are drawn from the same continuous distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Can be either observed frequencies in each category or arrays
    of sample observations.
binned : bool or str, optional, default: False
    If True, treat the input data as observed frequencies; if False, as arrays
    of sample observations. The value 'kde' compares CDFs built from kernel
    density estimates (via statsmodels).

Returns
-------
statistic : float
    KS statistic
pvalue : float
    two-tailed p-value
ndf : float
    "Degrees of freedom": NaN for unbinned input; the number of bins when binned=True.

Notes
-----
This code is modified from scipy.stats.ks_2samp and extended to support
frequency data. See [1]_ for further information.

This tests whether 2 samples are drawn from the same distribution. Note
that, like in the case of the one-sample K-S test, the distribution is
assumed to be continuous.

This is the two-sided test, one-sided tests are not implemented.
The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

If the K-S statistic is small or the p-value is high, then we cannot
reject the hypothesis that the distributions of the two samples
are the same.

References
----------
.. [1] scipy.stats.mstats.ks_2samp — SciPy Reference Guide
       http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.ks_2samp.html
.. [2] Frank C. Porter. (2008). Testing Consistency of Two Histograms, arXiv:0804.0380 [physics.data-an].

Examples
--------
>>> from scipy import stats
>>> np.random.seed(12345678)  #fix random seed to get the same result
>>> n1 = 200  # size of first sample
>>> n2 = 300  # size of second sample

For a different distribution, we can reject the null hypothesis since the
pvalue is below 1%:

>>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
>>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
>>> stats.ks_2samp(rvs1, rvs2)
(0.20833333333333337, 4.6674975515806989e-005)

For a slightly different distribution, we cannot reject the null hypothesis
at a 10% or lower alpha, since the p-value of 0.144 is higher than 10%:

>>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
>>> stats.ks_2samp(rvs1, rvs3)
(0.10333333333333333, 0.14498781825751686)

For an identical distribution, we cannot reject the null hypothesis since
the p-value is high, 41%:

>>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
>>> stats.ks_2samp(rvs1, rvs4)
(0.07999999999999996, 0.41126949729859719)

Definition at line 22 of file test_statistics.py.

22 def ks_2samp(data1, data2, binned=False):
23     """
24     Computes the Kolmogorov-Smirnov statistic on 2 samples/frequencies.
25 
26     This is a two-sided test for the null hypothesis that 2 independent samples
27     are drawn from the same continuous distribution.
28 
29     Parameters
30     ----------
31     data1, data2 : sequence of 1-D ndarrays
32         Input data. Can be either observed frequencies in each category or arrays
33         of sample observations.
34     binned : bool, optional, default: False
35         If True, treat the input data as observed frequencies; if False, as samples.
36 
37     Returns
38     -------
39     statistic : float
40         KS statistic
41     pvalue : float
42         two-tailed p-value
43     ndf : float
44         "Degrees of freedom": NaN for unbinned input, the number of bins otherwise.
45 
46     Notes
47     -----
48     This code is modified from scipy.stats.ks_2samp and extended to support
49     frequency data. See [1]_ for further information.
50 
51     This tests whether 2 samples are drawn from the same distribution. Note
52     that, like in the case of the one-sample K-S test, the distribution is
53     assumed to be continuous.
54 
55     This is the two-sided test, one-sided tests are not implemented.
56     The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.
57 
58     If the K-S statistic is small or the p-value is high, then we cannot
59     reject the hypothesis that the distributions of the two samples
60     are the same.
61 
62     References
63     ----------
64     .. [1] scipy.stats.mstats.ks_2samp — SciPy Reference Guide
65            http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.ks_2samp.html
66     .. [2] Frank C. Porter. (2008). Testing Consistency of Two Histograms, arXiv:0804.0380 [physics.data-an].
67 
68     Examples
69     --------
70     >>> from scipy import stats
71     >>> np.random.seed(12345678)  # fix random seed to get the same result
72     >>> n1 = 200  # size of first sample
73     >>> n2 = 300  # size of second sample
74 
75     For a different distribution, we can reject the null hypothesis since the
76     pvalue is below 1%:
77 
78     >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
79     >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
80     >>> stats.ks_2samp(rvs1, rvs2)
81     (0.20833333333333337, 4.6674975515806989e-005)
82 
83     For a slightly different distribution, we cannot reject the null hypothesis
84     at a 10% or lower alpha, since the p-value of 0.144 is higher than 10%:
85 
86     >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
87     >>> stats.ks_2samp(rvs1, rvs3)
88     (0.10333333333333333, 0.14498781825751686)
89 
90     For an identical distribution, we cannot reject the null hypothesis since
91     the p-value is high, 41%:
92 
93     >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
94     >>> stats.ks_2samp(rvs1, rvs4)
95     (0.07999999999999996, 0.41126949729859719)
96 
97     """
98     if binned is True:
99         cdf1 = np.cumsum(data1)
100         cdf2 = np.cumsum(data2)
101         n1 = cdf1[-1]
102         n2 = cdf2[-1]
103         cdf1 /= n1
104         cdf2 /= n2
105         ndf = data1.shape[0]
106     else:
107         n1 = data1.shape[0]
108         n2 = data2.shape[0]
109         ndf = float("nan")
110     if binned is False:
111         data1 = np.sort(data1)
112         data2 = np.sort(data2)
113         data_all = np.concatenate([data1, data2])
114         cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*n1)
115         cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*n2)
116     if binned == 'kde':
117         data1_kde = sm.nonparametric.KDEUnivariate(data1)
118         data2_kde = sm.nonparametric.KDEUnivariate(data2)
119         data1_kde.fit()
120         data2_kde.fit()
121         i, min_sup = min(enumerate([data1_kde.support, data2_kde.support]), key=lambda data: data[1].shape[0])
122         if i == 0:
123             cdf1 = data1_kde.cdf
124             cdf2 = scipy.interpolate.interp1d(data2_kde.support, data2_kde.cdf)(min_sup)
125         else:
126             cdf1 = scipy.interpolate.interp1d(data1_kde.support, data1_kde.cdf)(min_sup)
127             cdf2 = data2_kde.cdf
128     d = np.max(np.absolute(cdf1 - cdf2))
129     # Note: d is the absolute, not signed, distance
130     en = np.sqrt(n1 * n2 / float(n1 + n2))
131     try:
132         prob = scipy.stats.distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d)
133     except Exception:
134         prob = 1.0
135     if DEBUG:
136         global statistic_seq
137         statistic_seq = np.absolute(cdf1 - cdf2)
138     return Ks_2sampResult(d, prob, ndf)
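A hypothetical usage sketch of the binned mode (toy histograms; the import path is an assumption):

import numpy as np
from python import test_statistics as ts       # assumed Athena-style import path

h1 = np.array([4., 9., 20., 12., 5.])          # toy observed frequencies
h2 = np.array([6., 11., 16., 13., 4.])
d, p, ndf = ts.ks_2samp(h1, h2, binned=True)   # here ndf == number of bins
print(d, p, ndf)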

◆ likelihoodratio_ksamp()

def python.test_statistics.likelihoodratio_ksamp (   data1,
  data2,
  normed = True,
  binned = True 
)
The likelihood-ratio test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The log-likelihood-ratio test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape) or k (value)
    degrees of freedom, where k is the number of bins with nonzero counts in at
    least one histogram.

Definition at line 581 of file test_statistics.py.

581 def likelihoodratio_ksamp(data1, data2, normed=True, binned=True):
582     """
583     The likelihood-ratio test tests the null hypothesis that 2 given sets of
584     observed frequencies are drawn from the same distribution.
585 
586     Parameters
587     ----------
588     data1, data2 : sequence of 1-D ndarrays
589         Input data. Observed frequencies in each category.
590     normed : bool, optional, default: True
591         If True, shape comparison test; if False, value comparison test.
592     binned : bool, optional, default: True
593         If True, treat the input data as observed frequencies; if False, as samples.
594 
595     Returns
596     -------
597     statistic : float
598         The log-likelihood-ratio test statistic.
599     pvalue : float
600         Corresponding p-value.
601     ndf : int
602         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
603         is computed using a chi-squared distribution with k - 1 (shape) or k
604         (value) degrees of freedom, where k is the number of bins with nonzero
605         counts in at least one histogram.
606     """
607     not_all_zeros = (data1 != 0) & (data2 != 0)
608     ndf = sum((data1 != 0) | (data2 != 0))
609     N_data1 = data1.sum()
610     N_data2 = data2.sum()
611     if normed:
612         ndf -= 1
613     llike_H0 = _loglikelihood(data1[not_all_zeros], data2[not_all_zeros], N_data1, N_data2, ratio=True, H="H0") + _zloglikelihood(data1[~not_all_zeros], data2[~not_all_zeros], N_data1, N_data2, H="H0")
614     if DEBUG:
615         global statistic_seq
616         STATISTIC_SEQ = statistic_seq
617     llike_H1 = _loglikelihood(data1[not_all_zeros], data2[not_all_zeros], N_data1, N_data2, ratio=True, H="H1")
618     if DEBUG:
619         STATISTIC_SEQ[:statistic_seq.shape[0]] -= statistic_seq
620         statistic_seq = STATISTIC_SEQ
621     llike_ratio = llike_H0 - llike_H1
622     return LikelihoodRatio_ksampResult(llike_ratio, scipy.stats.chi2.sf(llike_ratio, ndf), ndf)
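Hedged summary of the listing: the returned statistic is

\[ \lambda = \big(-2\ln L_{H_0}\big) - \big(-2\ln L_{H_1}\big) = -2\ln\frac{L_{H_0}}{L_{H_1}}, \]

which, by Wilks' theorem, is approximately chi-squared distributed with ndf degrees of freedom under the null hypothesis; bins that are empty in both histograms are excluded from ndf.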

◆ likelihoodvalue_ksamp()

def python.test_statistics.likelihoodvalue_ksamp (   data1,
  data2,
  normed = True,
  binned = True 
)
The likelihood-value test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The log-likelihood-value test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape) or k (value)
    degrees of freedom, where k is the number of bins with nonzero counts in at
    least one histogram.

Definition at line 625 of file test_statistics.py.

625 def likelihoodvalue_ksamp(data1, data2, normed=True, binned=True):
626     """
627     The likelihood-value test tests the null hypothesis that 2 given sets of
628     observed frequencies are drawn from the same distribution.
629 
630     Parameters
631     ----------
632     data1, data2 : sequence of 1-D ndarrays
633         Input data. Observed frequencies in each category.
634     normed : bool, optional, default: True
635         If True, shape comparison test; if False, value comparison test.
636     binned : bool, optional, default: True
637         If True, treat the input data as observed frequencies; if False, as samples.
638 
639     Returns
640     -------
641     statistic : float
642         The log-likelihood-value test statistic.
643     pvalue : float
644         Corresponding p-value.
645     ndf : int
646         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
647         is computed using a chi-squared distribution with k - 1 (shape) or k
648         (value) degrees of freedom, where k is the number of bins with nonzero
649         counts in at least one histogram.
650     """
651     no_zeros = (data1 != 0) | (data2 != 0)
652     ndf = no_zeros.sum()
653     if normed:
654         ndf -= 1
655     llike = _loglikelihood(data1[no_zeros], data2[no_zeros], data1.sum(), data2.sum(), ratio=False, H="H0")
656     return LikelihoodValue_ksampResult(llike, scipy.stats.chi2.sf(llike, ndf), ndf)
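A hypothetical usage sketch covering both likelihood tests (toy histograms; the import path is an assumption):

import numpy as np
from python import test_statistics as ts   # assumed Athena-style import path

h1 = np.array([25., 40., 31., 4.])         # toy observed frequencies
h2 = np.array([22., 44., 30., 2.])
print(ts.likelihoodratio_ksamp(h1, h2))    # shape comparison by default
print(ts.likelihoodvalue_ksamp(h1, h2))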

Variable Documentation

◆ Anderson_ksampResult

python.test_statistics.Anderson_ksampResult = namedtuple('Anderson_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 397 of file test_statistics.py.

◆ BDM_2sampResult

python.test_statistics.BDM_2sampResult = namedtuple('BDM_2sampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 195 of file test_statistics.py.

◆ Chi2_2sampResult

python.test_statistics.Chi2_2sampResult = namedtuple('Chi2_2sampResult', ('statistic', 'pvalue','ndf'))

Definition at line 141 of file test_statistics.py.

◆ CVM_2sampResult

python.test_statistics.CVM_2sampResult = namedtuple('CVM_2sampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 239 of file test_statistics.py.

◆ DEBUG

bool python.test_statistics.DEBUG = False

Definition at line 17 of file test_statistics.py.

◆ Ks_2sampResult

python.test_statistics.Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue','ndf'))

Definition at line 21 of file test_statistics.py.

◆ LikelihoodRatio_ksampResult

python.test_statistics.LikelihoodRatio_ksampResult = namedtuple('LikelihoodRatio_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 580 of file test_statistics.py.

◆ LikelihoodValue_ksampResult

python.test_statistics.LikelihoodValue_ksampResult = namedtuple('LikelihoodValue_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 624 of file test_statistics.py.

◆ statistic_seq

list python.test_statistics.statistic_seq = []

Definition at line 19 of file test_statistics.py.
