ATLAS Offline Software
python.test_statistics Namespace Reference

Functions

def ks_2samp (data1, data2, binned=False)
 
def chi2_2samp (data1, data2, normed=True, binned=True)
 
def BDM_2samp (data1, data2, normed=True, binned=True)
 
def CVM_2samp (data1, data2, normed=True, binned=False)
 
def _anderson_ksamp_midrank_binned (data, Z, Zstar, k, n, N)
 
def _anderson_ksamp_midrank (samples, Z, Zstar, k, n, N)
 
def _anderson_ksamp_right (samples, Z, Zstar, k, n, N)
 
def anderson_ksamp (data1, data2, binned=False, midrank=True)
 
def _log_fac (m, n)
 
def _log_binomial (n, k)
 
def _loglikelihood (data1, data2, N_data1, N_data2, ratio, H="H0")
 
def _zloglikelihood (data1, data2, N_data1, N_data2, H="H0")
 
def likelihoodratio_ksamp (data1, data2, normed=True, binned=True)
 
def likelihoodvalue_ksamp (data1, data2, normed=True, binned=True)
 

Variables

bool DEBUG = False
 
list statistic_seq = []
 
 Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue','ndf'))
 
 Chi2_2sampResult = namedtuple('Chi2_2sampResult', ('statistic', 'pvalue','ndf'))
 
 BDM_2sampResult = namedtuple('BDM_2sampResult', ('statistic', 'pvalue', 'ndf'))
 
 CVM_2sampResult = namedtuple('CVM_2sampResult', ('statistic', 'pvalue', 'ndf'))
 
 Anderson_ksampResult = namedtuple('Anderson_ksampResult', ('statistic', 'pvalue', 'ndf'))
 
 LikelihoodRatio_ksampResult = namedtuple('LikelihoodRatio_ksampResult', ('statistic', 'pvalue', 'ndf'))
 
 LikelihoodValue_ksampResult = namedtuple('LikelihoodValue_ksampResult', ('statistic', 'pvalue', 'ndf'))
 

Function Documentation

◆ _anderson_ksamp_midrank()

def python.test_statistics._anderson_ksamp_midrank (   samples,
  Z,
  Zstar,
  k,
  n,
  N 
)
private
Compute A2akN, equation 7 of Scholz and Stephens.

Parameters
----------
samples : sequence of 1-D array_like
    Array of sample arrays.
Z : array_like
    Sorted array of all observations.
Zstar : array_like
    Sorted array of unique observations.
k : int
    Number of samples.
n : array_like
    Number of observations in each sample.
N : int
    Total number of observations.

Returns
-------
A2akN : float
    The A2akN statistic of Scholz and Stephens 1987.

Definition at line 309 of file test_statistics.py.

309 def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
310     """
311     Compute A2akN, equation 7 of Scholz and Stephens.
312 
313     Parameters
314     ----------
315     samples : sequence of 1-D array_like
316         Array of sample arrays.
317     Z : array_like
318         Sorted array of all observations.
319     Zstar : array_like
320         Sorted array of unique observations.
321     k : int
322         Number of samples.
323     n : array_like
324         Number of observations in each sample.
325     N : int
326         Total number of observations.
327 
328     Returns
329     -------
330     A2akN : float
331         The A2akN statistic of Scholz and Stephens 1987.
332     """
333 
334     if DEBUG:
335         global statistic_seq
336         statistic_seq = []
337 
338     A2akN = 0.
339     Z_ssorted_left = Z.searchsorted(Zstar, 'left')
340     if N == Zstar.size:
341         lj = 1.
342     else:
343         lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
344     Bj = Z_ssorted_left + lj / 2.
345     for i in np.arange(0, k):
346         s = np.sort(samples[i])
347         s_ssorted_right = s.searchsorted(Zstar, side='right')
348         Mij = s_ssorted_right.astype(float)
349         fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
350         Mij -= fij / 2.
351         inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
352         A2akN += inner.sum() / n[i]
353 
354     if DEBUG:
355         statistic_seq.append(A2akN*(N-1.)/N)
356 
357     A2akN *= (N - 1.) / N
358     return A2akN
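For orientation, here is a minimal self-contained sketch (plain NumPy, toy data; every name below is mine, not part of the module) of how this helper's inputs Z, Zstar, k, n and N are assembled from two raw samples and how the equation-7 sum is then accumulated:

import numpy as np

x = np.array([1., 2., 2., 4.])               # hypothetical sample 1
y = np.array([2., 3., 5.])                   # hypothetical sample 2
samples = [x, y]
Z = np.sort(np.hstack(samples))              # all observations, sorted
Zstar = np.unique(Z)                         # distinct observations, sorted
k = len(samples)                             # number of samples
n = np.array([s.size for s in samples])      # observations per sample
N = Z.size                                   # total number of observations

A2akN = 0.
left = Z.searchsorted(Zstar, 'left')
lj = Z.searchsorted(Zstar, 'right') - left   # multiplicity of each Z*_j
Bj = left + lj / 2.                          # midrank of Z*_j in the pooled sample
for i in range(k):
    s = np.sort(samples[i])
    right = s.searchsorted(Zstar, side='right')
    Mij = right - (right - s.searchsorted(Zstar, 'left')) / 2.  # midrank counts
    A2akN += (lj / float(N) * (N*Mij - Bj*n[i])**2
              / (Bj*(N - Bj) - N*lj/4.)).sum() / n[i]
A2akN *= (N - 1.) / N                        # the statistic returned above
print(A2akN)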

◆ _anderson_ksamp_midrank_binned()

def python.test_statistics._anderson_ksamp_midrank_binned (   data,
  Z,
  Zstar,
  k,
  n,
  N 
)
private

Definition at line 288 of file test_statistics.py.

288 def _anderson_ksamp_midrank_binned(data, Z, Zstar, k, n, N):
289 
290     if DEBUG:
291         global statistic_seq
292         statistic_seq = []
293 
294     A2akN = 0.
295     lj = Z
296     Bj = Z.cumsum() - lj/2
297     for i in np.arange(0, k):
298         d = data[i]
299         Mij = d.cumsum() - d/2
300         inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
301         A2akN += np.nan_to_num(inner).sum() / n[i]
302 
303     if DEBUG:
304         statistic_seq.append(A2akN*(N-1.)/N)
305 
306     A2akN *= (N - 1.) / N
307     return A2akN

◆ _anderson_ksamp_right()

def python.test_statistics._anderson_ksamp_right (   samples,
  Z,
  Zstar,
  k,
  n,
  N 
)
private
Compute A2kN, equation 6 of Scholz & Stephens.

Parameters
----------
samples : sequence of 1-D array_like
    Array of sample arrays.
Z : array_like
    Sorted array of all observations.
Zstar : array_like
    Sorted array of unique observations.
k : int
    Number of samples.
n : array_like
    Number of observations in each sample.
N : int
    Total number of observations.

Returns
-------
A2kN : float
    The A2kN statistic of Scholz and Stephens 1987.

Definition at line 361 of file test_statistics.py.

361 def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
362     """
363     Compute A2kN, equation 6 of Scholz & Stephens.
364 
365     Parameters
366     ----------
367     samples : sequence of 1-D array_like
368         Array of sample arrays.
369     Z : array_like
370         Sorted array of all observations.
371     Zstar : array_like
372         Sorted array of unique observations.
373     k : int
374         Number of samples.
375     n : array_like
376         Number of observations in each sample.
377     N : int
378         Total number of observations.
379 
380     Returns
381     -------
382     A2kN : float
383         The A2kN statistic of Scholz and Stephens 1987.
384     """
385 
386     A2kN = 0.
387     lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], 'left')
388     Bj = lj.cumsum()
389     for i in range(0, k):
390         s = np.sort(samples[i])
391         Mij = s.searchsorted(Zstar[:-1], side='right')
392         inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
393         A2kN += inner.sum() / n[i]
394     return A2kN

◆ _log_binomial()

def python.test_statistics._log_binomial (   n,
  k 
)
private

Definition at line 542 of file test_statistics.py.

542 def _log_binomial(n, k):
543     if k > (n-k):
544         return (_log_fac(n-k+1, n) - _log_fac(2, k))
545     else:
546         return (_log_fac(k+1, n) - _log_fac(2, n-k))
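As a sanity check on these two helpers, a standalone sketch (my code; scipy.special.gammaln serves as an independent reference) of the identity they implement, log C(n, k) = log n! - log k! - log (n-k)!:

import numpy as np
from scipy.special import gammaln

def log_fac(m, n):
    # log(m * (m+1) * ... * n); log_fac(2, k) == log(k!)
    return np.log(np.arange(m, n + 1)).sum()

def log_binomial(n, k):
    # cancel the larger factorial in n! / (k! (n-k)!) before summing logs
    if k > (n - k):
        return log_fac(n - k + 1, n) - log_fac(2, k)
    return log_fac(k + 1, n) - log_fac(2, n - k)

for n, k in [(10, 3), (52, 5)]:
    ref = gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1)
    assert np.isclose(log_binomial(n, k), ref)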

◆ _log_fac()

def python.test_statistics._log_fac (   m,
  n 
)
private

Definition at line 539 of file test_statistics.py.

539 def _log_fac(m, n):
540     return np.log(np.arange(m, n+1)).sum()
541 @np.vectorize

_log_fac(m, n) returns the sum of logs from m to n, so _log_fac(2, k) equals log(k!). The @np.vectorize decorator on source line 541 belongs to the next definition in the file, _log_binomial.

◆ _loglikelihood()

def python.test_statistics._loglikelihood (   data1,
  data2,
  N_data1,
  N_data2,
  ratio,
  H = "H0" 
)
private

Definition at line 548 of file test_statistics.py.

548 def _loglikelihood(data1, data2, N_data1, N_data2, ratio, H="H0"):
549     t = data1 + data2
550     logC = 0.
551     if not ratio:
552         logC = float(sum(_log_binomial(t[i], data2[i]) for i in range(t.shape[0])))
553     if H == "H1":
554         a = data2/data1
555     if H == "H0":
556         a = N_data2 / N_data1
557 
558     if DEBUG:
559         global statistic_seq
560         statistic_seq = -2 * (logC - scipy.special.xlogy(t, 1+a) + scipy.special.xlogy(data2, a))
561 
562     return -2 * (logC - scipy.special.xlogy(t, 1+a).sum() + scipy.special.xlogy(data2, a).sum())
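Read off the listing above (notation mine; t_i = d1_i + d2_i denotes the summed bin contents), the helper evaluates minus twice the log of a product of binomial likelihoods with success probability a/(1+a):

\[ -2\ln L = -2\sum_i \Big[ \ln\binom{t_i}{d_{2,i}} - t_i \ln(1+a) + d_{2,i}\ln a \Big], \]

where a = N_data2/N_data1 under H0 and a_i = d_{2,i}/d_{1,i} under H1. The binomial-coefficient term (logC) is included only when ratio=False, which is why likelihoodvalue_ksamp calls this with ratio=False while likelihoodratio_ksamp uses ratio=True.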

◆ _zloglikelihood()

def python.test_statistics._zloglikelihood (   data1,
  data2,
  N_data1,
  N_data2,
  H = "H0" 
)
private

Definition at line 564 of file test_statistics.py.

564 def _zloglikelihood(data1, data2, N_data1, N_data2, H="H0"):
565     z_not_in_both = (data1 == 0) ^ (data2 == 0)
566     data1 = data1[z_not_in_both]
567     data2 = data2[z_not_in_both]
568     t = data1 + data2
569     if H == "H0":
570         p = N_data1 / (N_data1 + N_data2)
571 
572         if DEBUG:
573             global statistic_seq
574             statistic_seq = np.concatenate([statistic_seq, -2 * scipy.special.xlogy(t[data2 == 0], p), scipy.special.xlogy(t[data1 == 0], 1-p)])
575 
576         return -2 * (scipy.special.xlogy(t[data2 == 0], p).sum() + scipy.special.xlogy(t[data1 == 0], 1-p).sum())
577     if H == "H1":
578         return 0.
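A hedged reading of the listing: this helper covers the bins where exactly one of the two histograms is empty (the XOR mask), for which the binomial coefficient equals 1. Under H0 it returns

\[ -2\Big[ \sum_{i:\,d_{2,i}=0} t_i \ln p \;+\; \sum_{i:\,d_{1,i}=0} t_i \ln(1-p) \Big], \qquad p = \frac{N_{\mathrm{data1}}}{N_{\mathrm{data1}}+N_{\mathrm{data2}}}, \]

while under H1 such bins are fitted exactly and contribute 0.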

◆ anderson_ksamp()

def python.test_statistics.anderson_ksamp (   data1,
  data2,
  binned = False,
  midrank = True 
)
The Anderson-Darling test for 2 samples.

It tests the null hypothesis
that the 2 samples are drawn from the same population without
having to specify the distribution function of that population.
The critical values depend on the number of samples.

Parameters
----------
data1, data2 : 1-D array_like
    The two samples of data to compare.
binned : bool, optional, default: False
    If True, treat the input data as observed frequencies; if False, as
    arrays of sample observations.
midrank : bool, optional, default: True
    If True, use the midrank empirical distribution function; if False, use
    the right-side empirical distribution function (for discrete data).

Returns
-------
statistic : float
    AD statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value.
    Always NaN in this case.

Raises
------
ValueError
If fewer than 2 samples are provided, a sample is empty, or no
    distinct observations are in the samples.

See Also
--------
ks_2samp : 2 sample Kolmogorov-Smirnov test

Notes
-----
This code is modified from scipy.stats.anderson and extended to support
frequency data. See [1]_ for further information.

[2]_ defines three versions of the k-sample Anderson-Darling test:
one for continuous distributions and two for discrete
distributions, in which ties between samples may occur. The
default of this routine is to compute the version based on the
midrank empirical distribution function. This test is applicable
to continuous and discrete data. If midrank is set to False, the
right-side empirical distribution function is used for a test on
discrete data. According to [1]_, the two discrete test statistics
differ only slightly if a few collisions due to round-off errors
occur in the test not adjusted for ties between samples.

References
----------
.. [1] scipy.stats.anderson — SciPy Reference Guide
       http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html
.. [2] Scholz, F. W and Stephens, M. A. (1987), K-Sample
       Anderson-Darling Tests, Journal of the American Statistical
       Association, Vol. 82, pp. 918-924.

Examples
--------
>>> from scipy import stats
>>> np.random.seed(314159)

The null hypothesis that the two random samples come from the same
distribution can be rejected at the 5% level because the returned
test value is greater than the critical value for 5% (1.961) but
not at the 2.5% level. The interpolation gives an approximate
significance level of 3.1%:

>>> stats.anderson_ksamp([np.random.normal(size=50),
... np.random.normal(loc=0.5, size=30)])
(2.4615796189876105,
  array([ 0.325,  1.226,  1.961,  2.718,  3.752]),
  0.03134990135800783)


The null hypothesis cannot be rejected for three samples from an
identical distribution. The approximate p-value (87%) has to be
computed by extrapolation and may not be very accurate:

>>> stats.anderson_ksamp([np.random.normal(size=50),
... np.random.normal(size=30), np.random.normal(size=20)])
(-0.73091722665244196,
  array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856]),
  0.8789283903979661)

Definition at line 398 of file test_statistics.py.

398 def anderson_ksamp(data1, data2, binned=False, midrank=True):
399     """The Anderson-Darling test for 2 samples.
400 
401     It tests the null hypothesis
402     that the 2 samples are drawn from the same population without
403     having to specify the distribution function of that population.
404     The critical values depend on the number of samples.
405 
406     Parameters
407     ----------
408     samples : sequence of 1-D array_like
409         Array of sample data in arrays.
410 
411     Returns
412     -------
413     statistic : float
414         AD statistic
415     pvalue : float
416         two-tailed p-value
417     ndf : NaN
418         "Degrees of freedom": the degrees of freedom for the p-value.
419         Always NaN in this case.
420 
421     Raises
422     ------
423     ValueError
424         If fewer than 2 samples are provided, a sample is empty, or no
425         distinct observations are in the samples.
426 
427     See Also
428     --------
429     ks_2samp : 2 sample Kolmogorov-Smirnov test
430 
431     Notes
432     -----
433     This code is modified from scipy.stats.anderson and extended to
434     support frequency data. See [1]_ for further information.
435 
436     [2]_ defines three versions of the k-sample Anderson-Darling test:
437     one for continuous distributions and two for discrete
438     distributions, in which ties between samples may occur. The
439     default of this routine is to compute the version based on the
440     midrank empirical distribution function. This test is applicable
441     to continuous and discrete data. If midrank is set to False, the
442     right-side empirical distribution function is used for a test on
443     discrete data. According to [1]_, the two discrete test statistics
444     differ only slightly if a few collisions due to round-off errors
445     occur in the test not adjusted for ties between samples.
446 
447     References
448     ----------
449     .. [1] scipy.stats.anderson — SciPy Reference Guide
450            http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html
451     .. [2] Scholz, F. W. and Stephens, M. A. (1987), K-Sample
452            Anderson-Darling Tests, Journal of the American Statistical
453            Association, Vol. 82, pp. 918-924.
454 
455     Examples
456     --------
457     >>> from scipy import stats
458     >>> np.random.seed(314159)
459 
460     The null hypothesis that the two random samples come from the same
461     distribution can be rejected at the 5% level because the returned
462     test value is greater than the critical value for 5% (1.961) but
463     not at the 2.5% level. The interpolation gives an approximate
464     significance level of 3.1%:
465 
466     >>> stats.anderson_ksamp([np.random.normal(size=50),
467     ... np.random.normal(loc=0.5, size=30)])
468     (2.4615796189876105,
469       array([ 0.325,  1.226,  1.961,  2.718,  3.752]),
470       0.03134990135800783)
471 
472 
473     The null hypothesis cannot be rejected for three samples from an
474     identical distribution. The approximate p-value (87%) has to be
475     computed by extrapolation and may not be very accurate:
476 
477     >>> stats.anderson_ksamp([np.random.normal(size=50),
478     ... np.random.normal(size=30), np.random.normal(size=20)])
479     (-0.73091722665244196,
480       array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856]),
481       0.8789283903979661)
482 
483     """
484     samples = [data1, data2]
485     k = len(samples)
486     if (k < 2):
487         raise ValueError("anderson_ksamp needs at least two samples")
488 
489     samples = list(map(np.asarray, samples))
490     if binned:
491         Z = Zstar = sum(samples)
492         n = np.array([sample.sum() for sample in samples])
493         N = n.sum()
494         A2kN = _anderson_ksamp_midrank_binned(samples, Z, Zstar, k, n, N)
495     else:
496         Z = np.sort(np.hstack(samples))
497         N = Z.size
498         Zstar = np.unique(Z)
499         if Zstar.size < 2:
500             raise ValueError("anderson_ksamp needs more than one distinct "
501                              "observation")
502 
503         n = np.array([sample.size for sample in samples])
504         if any(n == 0):
505             raise ValueError("anderson_ksamp encountered sample without "
506                              "observations")
507 
508         if midrank:
509             A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
510         else:
511             A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
512 
513     H = (1. / n).sum()
514     hs_cs = (1. / np.arange(N - 1, 1, -1)).cumsum()
515     h = hs_cs[-1] + 1
516     g = (hs_cs / np.arange(2, N)).sum()
517 
518     a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
519     b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
520     c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
521     d = (2*h + 6)*k**2 - 4*h*k
522     sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
523     m = k - 1
524     A2 = (A2kN - m) / np.sqrt(sigmasq)
525 
526     # The b_i values are the interpolation coefficients from Table 2
527     # of Scholz and Stephens 1987
528     b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326])
529     b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822])
530     b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396])
531     critical = b0 + b1 / np.sqrt(m) + b2 / m
532     pf = np.polyfit(critical, np.log(np.array([0.25, 0.1, 0.05, 0.025, 0.01])), 2)
533     if A2 < critical.min() or A2 > critical.max():
534         warnings.warn("approximate p-value will be computed by extrapolation")  # noqa: B028 (no stacklevel needed)
535 
536     p = np.exp(np.polyval(pf, A2))
537     return Anderson_ksampResult(A2, p, float('nan'))
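A hypothetical usage sketch (the import path and the toy data are assumptions, not part of this page):

import numpy as np
from python import test_statistics as ts   # assumed Athena-style import path

rng = np.random.default_rng(0)
a = rng.normal(size=200)                   # toy sample 1
b = rng.normal(loc=0.3, size=150)          # toy sample 2

res = ts.anderson_ksamp(a, b)              # unbinned, midrank version
print(res.statistic, res.pvalue, res.ndf)  # ndf is always NaN for this test

# Binned variant: two histograms filled on the same binning.
h1, edges = np.histogram(a, bins=20, range=(-4, 4))
h2, _ = np.histogram(b, bins=edges)
res_binned = ts.anderson_ksamp(h1.astype(float), h2.astype(float), binned=True)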

◆ BDM_2samp()

def python.test_statistics.BDM_2samp (   data1,
  data2,
  normed = True,
  binned = True 
)
The Bhattacharyya test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The Bhattacharyya distance measure.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape comparison)
    or k (value comparison) degrees of freedom, where k is the number of bins
    with nonzero counts in at least one histogram.

Definition at line 196 of file test_statistics.py.

196 def BDM_2samp(data1, data2, normed=True, binned=True):
197     """
198     The Bhattacharyya test tests the null hypothesis that 2 given sets of
199     observed frequencies are drawn from the same distribution.
200 
201     Parameters
202     ----------
203     data1, data2 : sequence of 1-D ndarrays
204         Input data. Observed frequencies in each category.
205     normed : bool, optional
206         If True, shape comparison test; if False, value comparison test.
207     binned : bool, optional
208         If True, treat the input data as observed frequencies; if False, as samples.
209 
210     Returns
211     -------
212     statistic : float
213         The Bhattacharyya distance measure.
214     pvalue : float
215         Corresponding p-value.
216     ndf : int
217         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
218         is computed using a chi-squared distribution with k - 1 (shape) or k
219         (value) degrees of freedom, where k is the number of bins with nonzero
220         counts in at least one histogram.
221     """
222     if binned:
223         filter = ~((data1 == 0.) & (data2 == 0.))
224         data1 = data1[filter]
225         data2 = data2[filter]
226     ndf = data1.shape[0]
227     N_data1 = data1.sum()
228     N_data2 = data2.sum()
229     bdm = np.sqrt(data1 * data2).sum()
230     approx_chi2 = 2*N_data1 + 2*N_data2 - 4*bdm
231     if normed:
232         ndf -= 1
233         bdm /= np.sqrt(N_data1 * N_data2)
234     if DEBUG:
235         global statistic_seq
236         statistic_seq = 2*data1 + 2*data2 - 4*np.sqrt(data1 * data2)
237     return BDM_2sampResult(bdm, scipy.stats.chi2.sf(approx_chi2, ndf), ndf)
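For orientation, a minimal standalone sketch (toy histograms, plain NumPy/SciPy; the names are mine, not part of the module) of the quantities the listing builds:

import numpy as np
import scipy.stats

h1 = np.array([10., 20., 30., 25.])     # hypothetical observed frequencies
h2 = np.array([12., 18., 33., 21.])
N1, N2 = h1.sum(), h2.sum()

bdm = np.sqrt(h1 * h2).sum()            # Bhattacharyya distance measure
approx_chi2 = 2*N1 + 2*N2 - 4*bdm       # approximately chi2-distributed under H0
ndf = h1.size - 1                       # shape comparison (normed=True)
pvalue = scipy.stats.chi2.sf(approx_chi2, ndf)
print(bdm / np.sqrt(N1 * N2), pvalue)   # normalized measure and p-value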

◆ chi2_2samp()

def python.test_statistics.chi2_2samp (   data1,
  data2,
  normed = True,
  binned = True 
)
The chi-square test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The chi-squared test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape) or k (value)
    degrees of freedom, where k is the number of bins with nonzero counts in at
    least one histogram.

Note
----
This code is modified from scipy.stats.chisquare and extended to support the
2-sample case and shape comparison tests.

Definition at line 142 of file test_statistics.py.

142 def chi2_2samp(data1, data2, normed=True, binned=True):
143     """
144     The chi-square test tests the null hypothesis that 2 given sets of observed
145     frequencies are drawn from the same distribution.
146 
147     Parameters
148     ----------
149     data1, data2 : sequence of 1-D ndarrays
150         Input data. Observed frequencies in each category.
151     normed : bool, optional, default: True
152         If True, shape comparison test; if False, value comparison test.
153     binned : bool, optional, default: True
154         If True, treat the input data as observed frequencies; if False, as samples.
155 
156     Returns
157     -------
158     statistic : float
159         The chi-squared test statistic.
160     pvalue : float
161         Corresponding p-value.
162     ndf : int
163         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
164         is computed using a chi-squared distribution with k - 1 (shape) or k
165         (value) degrees of freedom, where k is the number of bins with nonzero
166         counts in at least one histogram.
167 
168     Note
169     ----
170     This code is modified from scipy.stats.chisquare and extended to support
171     the 2-sample case and shape comparison tests.
172     """
173     if binned is True:
174         filter = ~((data1 == 0.) & (data2 == 0.))
175         data1 = data1[filter]
176         data2 = data2[filter]
177     ndf = data1.shape[0]
178     if not normed:
179         chi2 = np.power(data1 - data2, 2).dot(np.power(data1 + data2, -1))
180     else:
181         ndf -= 1
182         N_data1 = data1.sum()
183         N_data2 = data2.sum()
184         data1 /= N_data1
185         data2 /= N_data2
186         chi2 = np.power(data1 - data2, 2).dot(np.power(data1/N_data1 + data2/N_data2, -1))
187     if DEBUG:
188         global statistic_seq
189         if not normed:
190             statistic_seq = np.power(data1 - data2, 2) * (np.power(data1 + data2, -1))
191         else:
192             statistic_seq = np.power(data1 - data2, 2) * np.power(data1/N_data1 + data2/N_data2, -1)
193     return Chi2_2sampResult(chi2, scipy.stats.chi2.sf(chi2, ndf), ndf)
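In formula form (notation mine, read off the listing): the value comparison (normed=False) uses

\[ \chi^2 = \sum_i \frac{(d_{1,i}-d_{2,i})^2}{d_{1,i}+d_{2,i}}, \]

while the shape comparison (normed=True) first normalizes u_i = d_{1,i}/N_1 and v_i = d_{2,i}/N_2 and uses

\[ \chi^2 = \sum_i \frac{(u_i-v_i)^2}{u_i/N_1 + v_i/N_2}. \]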

◆ CVM_2samp()

def python.test_statistics.CVM_2samp (   data1,
  data2,
  normed = True,
  binned = False 
)
Computes the Cramér-von Mises statistic on 2 samples/frequencies.

This is a two-sided test for the null hypothesis that 2 independent samples
are drawn from the same continuous distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Can be either observed frequencies in each category or arrays
    of sample observations.
normed : bool, optional, default: True
    Present for interface consistency; not used by this test.
binned : bool, optional, default: False
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    CVM statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.

Definition at line 240 of file test_statistics.py.

240 def CVM_2samp(data1, data2, normed=True, binned=False):
241     """
242     Computes the Cramér-von Mises statistic on 2 samples/frequencies.
243 
244     This is a two-sided test for the null hypothesis that 2 independent samples
245     are drawn from the same continuous distribution.
246 
247     Parameters
248     ----------
249     data1, data2 : sequence of 1-D ndarrays
250         Input data. Can be either observed frequencies in each category or arrays
251         of sample observations.
252     binned : bool, optional, default: False
253         If True, treat the input data as observed frequencies; if False, as samples.
254 
255     Returns
256     -------
257     statistic : float
258         CVM statistic
259     pvalue : float
260         two-tailed p-value
261     ndf : NaN
262         "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.
263     """
264     N1 = data1.shape[0]
265     N2 = data2.shape[0]
266     N = N1 + N2
267     if binned:
268         E1 = data1.sum()
269         E2 = data2.sum()
270         cdf1 = np.cumsum(data1) / E1
271         cdf2 = np.cumsum(data2) / E2
272         weights = data1 + data2
273     else:
274         E1 = N1
275         E2 = N2
276         data_all = np.concatenate([data1, data2])
277         cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*N1)
278         cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*N2)
279         weights = 1.
280     cvm = (weights * (cdf1 - cdf2)**2).sum() * E1 * E2 / (E1 + E2)**2
281     mean = 1./6 + 1./6/N
282     var = (N + 1.) * (4.*N1*N2*N - 3*(N1**2 + N2**2) - 2*N1*N2) / (4 * N1 * N2) / N**2 / 45
283     if DEBUG:
284         global statistic_seq
285         statistic_seq = (weights * (cdf1 - cdf2)**2) * E1 * E2 / (E1 + E2)**2
286     return CVM_2sampResult(cvm, 1 - distr.CVM(cvm, mean, var), float("nan"))
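What the listing computes, in my notation (a hedged summary, not from the page): with empirical CDFs F_1, F_2 evaluated at the pooled points z_j and weights w_j (the combined bin contents when binned=True, 1 otherwise),

\[ T = \frac{E_1 E_2}{(E_1+E_2)^2} \sum_j w_j \big( F_1(z_j) - F_2(z_j) \big)^2 , \]

and the p-value is obtained from an asymptotic Cramér-von Mises distribution matched to the mean 1/6 + 1/(6N) and the variance computed at source line 282 (via the internal distr.CVM helper).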

◆ ks_2samp()

def python.test_statistics.ks_2samp (   data1,
  data2,
  binned = False 
)
Computes the Kolmogorov-Smirnov statistic on 2 samples/frequencies.

This is a two-sided test for the null hypothesis that 2 independent samples
are drawn from the same continuous distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Can be either observed frequencies in each category or arrays
    of sample observations.
binned : bool or str, optional, default: False
    If True, treat the input data as observed frequencies; if False, as arrays
    of sample observations. The value 'kde' compares CDFs built from kernel
    density estimates (via statsmodels).

Returns
-------
statistic : float
    KS statistic
pvalue : float
    two-tailed p-value
ndf : float
    "Degrees of freedom": NaN for unbinned input; the number of bins when binned=True.

Notes
-----
This code is modified from scipy.stats.ks_2samp and extended to support
frequency data. See [1]_ for further information.

This tests whether 2 samples are drawn from the same distribution. Note
that, like in the case of the one-sample K-S test, the distribution is
assumed to be continuous.

This is the two-sided test, one-sided tests are not implemented.
The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

If the K-S statistic is small or the p-value is high, then we cannot
reject the hypothesis that the distributions of the two samples
are the same.

References
----------
.. [1] scipy.stats.mstats.ks_2samp — SciPy Reference Guide
       http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.ks_2samp.html
.. [2] Frank C. Porter. (2008). Testing Consistency of Two Histograms, arXiv:0804.0380 [physics.data-an].

Examples
--------
>>> from scipy import stats
>>> np.random.seed(12345678)  #fix random seed to get the same result
>>> n1 = 200  # size of first sample
>>> n2 = 300  # size of second sample

For a different distribution, we can reject the null hypothesis since the
pvalue is below 1%:

>>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
>>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
>>> stats.ks_2samp(rvs1, rvs2)
(0.20833333333333337, 4.6674975515806989e-005)

For a slightly different distribution, we cannot reject the null hypothesis
at a 10% or lower alpha, since the p-value of 0.144 is higher than 10%:

>>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
>>> stats.ks_2samp(rvs1, rvs3)
(0.10333333333333333, 0.14498781825751686)

For an identical distribution, we cannot reject the null hypothesis since
the p-value is high, 41%:

>>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
>>> stats.ks_2samp(rvs1, rvs4)
(0.07999999999999996, 0.41126949729859719)

Definition at line 22 of file test_statistics.py.

22 def ks_2samp(data1, data2, binned=False):
23     """
24     Computes the Kolmogorov-Smirnov statistic on 2 samples/frequencies.
25 
26     This is a two-sided test for the null hypothesis that 2 independent samples
27     are drawn from the same continuous distribution.
28 
29     Parameters
30     ----------
31     data1, data2 : sequence of 1-D ndarrays
32         Input data. Can be either observed frequencies in each category or arrays
33         of sample observations.
34     binned : bool, optional, default: False
35         If True, treat the input data as observed frequencies; if False, as samples.
36 
37     Returns
38     -------
39     statistic : float
40         KS statistic
41     pvalue : float
42         two-tailed p-value
43     ndf : float
44         "Degrees of freedom": NaN for unbinned input, the number of bins otherwise.
45 
46     Notes
47     -----
48     This code is modified from scipy.stats.ks_2samp and extended to support
49     frequency data. See [1]_ for further information.
50 
51     This tests whether 2 samples are drawn from the same distribution. Note
52     that, like in the case of the one-sample K-S test, the distribution is
53     assumed to be continuous.
54 
55     This is the two-sided test, one-sided tests are not implemented.
56     The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.
57 
58     If the K-S statistic is small or the p-value is high, then we cannot
59     reject the hypothesis that the distributions of the two samples
60     are the same.
61 
62     References
63     ----------
64     .. [1] scipy.stats.mstats.ks_2samp — SciPy Reference Guide
65            http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.ks_2samp.html
66     .. [2] Frank C. Porter. (2008). Testing Consistency of Two Histograms, arXiv:0804.0380 [physics.data-an].
67 
68     Examples
69     --------
70     >>> from scipy import stats
71     >>> np.random.seed(12345678)  # fix random seed to get the same result
72     >>> n1 = 200  # size of first sample
73     >>> n2 = 300  # size of second sample
74 
75     For a different distribution, we can reject the null hypothesis since the
76     pvalue is below 1%:
77 
78     >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
79     >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
80     >>> stats.ks_2samp(rvs1, rvs2)
81     (0.20833333333333337, 4.6674975515806989e-005)
82 
83     For a slightly different distribution, we cannot reject the null hypothesis
84     at a 10% or lower alpha, since the p-value of 0.144 is higher than 10%:
85 
86     >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
87     >>> stats.ks_2samp(rvs1, rvs3)
88     (0.10333333333333333, 0.14498781825751686)
89 
90     For an identical distribution, we cannot reject the null hypothesis since
91     the p-value is high, 41%:
92 
93     >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
94     >>> stats.ks_2samp(rvs1, rvs4)
95     (0.07999999999999996, 0.41126949729859719)
96 
97     """
98     if binned is True:
99         cdf1 = np.cumsum(data1)
100         cdf2 = np.cumsum(data2)
101         n1 = cdf1[-1]
102         n2 = cdf2[-1]
103         cdf1 /= n1
104         cdf2 /= n2
105         ndf = data1.shape[0]
106     else:
107         n1 = data1.shape[0]
108         n2 = data2.shape[0]
109         ndf = float("nan")
110     if binned is False:
111         data1 = np.sort(data1)
112         data2 = np.sort(data2)
113         data_all = np.concatenate([data1, data2])
114         cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*n1)
115         cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*n2)
116     if binned == 'kde':
117         data1_kde = sm.nonparametric.KDEUnivariate(data1)
118         data2_kde = sm.nonparametric.KDEUnivariate(data2)
119         data1_kde.fit()
120         data2_kde.fit()
121         i, min_sup = min(enumerate([data1_kde.support, data2_kde.support]), key=lambda data: data[1].shape[0])
122         if i == 0:
123             cdf1 = data1_kde.cdf
124             cdf2 = scipy.interpolate.interp1d(data2_kde.support, data2_kde.cdf)(min_sup)
125         else:
126             cdf1 = scipy.interpolate.interp1d(data1_kde.support, data1_kde.cdf)(min_sup)
127             cdf2 = data2_kde.cdf
128     d = np.max(np.absolute(cdf1 - cdf2))
129     # Note: d is the absolute, not signed, distance
130     en = np.sqrt(n1 * n2 / float(n1 + n2))
131     try:
132         prob = scipy.stats.distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d)
133     except Exception:
134         prob = 1.0
135     if DEBUG:
136         global statistic_seq
137         statistic_seq = np.absolute(cdf1 - cdf2)
138     return Ks_2sampResult(d, prob, ndf)
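A hypothetical usage sketch of the binned mode (toy histograms; the import path is an assumption):

import numpy as np
from python import test_statistics as ts       # assumed Athena-style import path

h1 = np.array([4., 9., 20., 12., 5.])          # toy observed frequencies
h2 = np.array([6., 11., 16., 13., 4.])
d, p, ndf = ts.ks_2samp(h1, h2, binned=True)   # here ndf == number of bins
print(d, p, ndf)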

◆ likelihoodratio_ksamp()

def python.test_statistics.likelihoodratio_ksamp (   data1,
  data2,
  normed = True,
  binned = True 
)
The likelihood-ratio test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The log-likelihood-ratio test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape) or k (value)
    degrees of freedom, where k is the number of bins with nonzero counts in at
    least one histogram.

Definition at line 581 of file test_statistics.py.

581 def likelihoodratio_ksamp(data1, data2, normed=True, binned=True):
582     """
583     The likelihood-ratio test tests the null hypothesis that 2 given sets of
584     observed frequencies are drawn from the same distribution.
585 
586     Parameters
587     ----------
588     data1, data2 : sequence of 1-D ndarrays
589         Input data. Observed frequencies in each category.
590     normed : bool, optional, default: True
591         If True, shape comparison test; if False, value comparison test.
592     binned : bool, optional, default: True
593         If True, treat the input data as observed frequencies; if False, as samples.
594 
595     Returns
596     -------
597     statistic : float
598         The log-likelihood-ratio test statistic.
599     pvalue : float
600         Corresponding p-value.
601     ndf : int
602         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
603         is computed using a chi-squared distribution with k - 1 (shape) or k
604         (value) degrees of freedom, where k is the number of bins with nonzero
605         counts in at least one histogram.
606     """
607     not_all_zeros = (data1 != 0) & (data2 != 0)
608     ndf = sum((data1 != 0) | (data2 != 0))
609     N_data1 = data1.sum()
610     N_data2 = data2.sum()
611     if normed:
612         ndf -= 1
613     llike_H0 = _loglikelihood(data1[not_all_zeros], data2[not_all_zeros], N_data1, N_data2, ratio=True, H="H0") + _zloglikelihood(data1[~not_all_zeros], data2[~not_all_zeros], N_data1, N_data2, H="H0")
614     if DEBUG:
615         global statistic_seq
616         STATISTIC_SEQ = statistic_seq
617     llike_H1 = _loglikelihood(data1[not_all_zeros], data2[not_all_zeros], N_data1, N_data2, ratio=True, H="H1")
618     if DEBUG:
619         STATISTIC_SEQ[:statistic_seq.shape[0]] -= statistic_seq
620         statistic_seq = STATISTIC_SEQ
621     llike_ratio = llike_H0 - llike_H1
622     return LikelihoodRatio_ksampResult(llike_ratio, scipy.stats.chi2.sf(llike_ratio, ndf), ndf)
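Hedged summary of the listing: the returned statistic is

\[ \lambda = \big(-2\ln L_{H_0}\big) - \big(-2\ln L_{H_1}\big) = -2\ln\frac{L_{H_0}}{L_{H_1}}, \]

which, by Wilks' theorem, is approximately chi-squared distributed with ndf degrees of freedom under the null hypothesis; bins that are empty in both histograms are excluded from ndf.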

◆ likelihoodvalue_ksamp()

def python.test_statistics.likelihoodvalue_ksamp (   data1,
  data2,
  normed = True,
  binned = True 
)
The likelihood-value test tests the null hypothesis that 2 given sets of observed
frequencies are drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, perform a shape comparison test; if False, a value comparison test.
binned : bool, optional, default: True
    If True, treat the input data as observed frequencies; if False, as samples.

Returns
-------
statistic : float
    The log-likelihood-value test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 (shape) or k (value)
    degrees of freedom, where k is the number of bins with nonzero counts in at
    least one histogram.

Definition at line 625 of file test_statistics.py.

625 def likelihoodvalue_ksamp(data1, data2, normed=True, binned=True):
626     """
627     The likelihood-value test tests the null hypothesis that 2 given sets of
628     observed frequencies are drawn from the same distribution.
629 
630     Parameters
631     ----------
632     data1, data2 : sequence of 1-D ndarrays
633         Input data. Observed frequencies in each category.
634     normed : bool, optional, default: True
635         If True, shape comparison test; if False, value comparison test.
636     binned : bool, optional, default: True
637         If True, treat the input data as observed frequencies; if False, as samples.
638 
639     Returns
640     -------
641     statistic : float
642         The log-likelihood-value test statistic.
643     pvalue : float
644         Corresponding p-value.
645     ndf : int
646         "Degrees of freedom": the degrees of freedom for the p-value. The p-value
647         is computed using a chi-squared distribution with k - 1 (shape) or k
648         (value) degrees of freedom, where k is the number of bins with nonzero
649         counts in at least one histogram.
650     """
651     no_zeros = (data1 != 0) | (data2 != 0)
652     ndf = no_zeros.sum()
653     if normed:
654         ndf -= 1
655     llike = _loglikelihood(data1[no_zeros], data2[no_zeros], data1.sum(), data2.sum(), ratio=False, H="H0")
656     return LikelihoodValue_ksampResult(llike, scipy.stats.chi2.sf(llike, ndf), ndf)
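A hypothetical usage sketch covering both likelihood tests (toy histograms; the import path is an assumption):

import numpy as np
from python import test_statistics as ts   # assumed Athena-style import path

h1 = np.array([25., 40., 31., 4.])         # toy observed frequencies
h2 = np.array([22., 44., 30., 2.])
print(ts.likelihoodratio_ksamp(h1, h2))    # shape comparison by default
print(ts.likelihoodvalue_ksamp(h1, h2))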

Variable Documentation

◆ Anderson_ksampResult

python.test_statistics.Anderson_ksampResult = namedtuple('Anderson_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 397 of file test_statistics.py.

◆ BDM_2sampResult

python.test_statistics.BDM_2sampResult = namedtuple('BDM_2sampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 195 of file test_statistics.py.

◆ Chi2_2sampResult

python.test_statistics.Chi2_2sampResult = namedtuple('Chi2_2sampResult', ('statistic', 'pvalue','ndf'))

Definition at line 141 of file test_statistics.py.

◆ CVM_2sampResult

python.test_statistics.CVM_2sampResult = namedtuple('CVM_2sampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 239 of file test_statistics.py.

◆ DEBUG

bool python.test_statistics.DEBUG = False

Definition at line 17 of file test_statistics.py.

◆ Ks_2sampResult

python.test_statistics.Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue','ndf'))

Definition at line 21 of file test_statistics.py.

◆ LikelihoodRatio_ksampResult

python.test_statistics.LikelihoodRatio_ksampResult = namedtuple('LikelihoodRatio_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 580 of file test_statistics.py.

◆ LikelihoodValue_ksampResult

python.test_statistics.LikelihoodValue_ksampResult = namedtuple('LikelihoodValue_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 624 of file test_statistics.py.

◆ statistic_seq

list python.test_statistics.statistic_seq = []

Definition at line 19 of file test_statistics.py.
