ATLAS Offline Software
python.test_statistics Namespace Reference

Functions

 ks_2samp (data1, data2, binned=False)
 chi2_2samp (data1, data2, normed=True, binned=True)
 BDM_2samp (data1, data2, normed=True, binned=True)
 CVM_2samp (data1, data2, normed=True, binned=False)
 _anderson_ksamp_midrank_binned (data, Z, Zstar, k, n, N)
 _anderson_ksamp_midrank (samples, Z, Zstar, k, n, N)
 _anderson_ksamp_right (samples, Z, Zstar, k, n, N)
 anderson_ksamp (data1, data2, binned=False, midrank=True)
 _log_fac (m, n)
 _log_binomial (n, k)
 _loglikelihood (data1, data2, N_data1, N_data2, ratio, H="H0")
 _zloglikelihood (data1, data2, N_data1, N_data2, H="H0")
 likelihoodratio_ksamp (data1, data2, normed=True, binned=True)
 likelihoodvalue_ksamp (data1, data2, normed=True, binned=True)

Variables

bool DEBUG = False
list statistic_seq = []
 Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue','ndf'))
 Chi2_2sampResult = namedtuple('Chi2_2sampResult', ('statistic', 'pvalue','ndf'))
 BDM_2sampResult = namedtuple('BDM_2sampResult', ('statistic', 'pvalue', 'ndf'))
 CVM_2sampResult = namedtuple('CVM_2sampResult', ('statistic', 'pvalue', 'ndf'))
 Anderson_ksampResult = namedtuple('Anderson_ksampResult', ('statistic', 'pvalue', 'ndf'))
 LikelihoodRatio_ksampResult = namedtuple('LikelihoodRatio_ksampResult', ('statistic', 'pvalue', 'ndf'))
 LikelihoodValue_ksampResult = namedtuple('LikelihoodValue_ksampResult', ('statistic', 'pvalue', 'ndf'))

Detailed Description

Test statistic functions.

Function Documentation

◆ _anderson_ksamp_midrank()

python.test_statistics._anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
protected
Compute the A2akN statistic (equation 7 of Scholz and Stephens).

Parameters
----------
samples : sequence of 1-D array_like
    Array of sample arrays.
Z : array_like
    Sorted array of all observations.
Zstar : array_like
    Sorted array of unique observations.
k : int
    Number of samples.
n : array_like
    Number of observations in each sample.
N : int
    Total number of observations.

Returns
-------
A2aKN : float
    The A2aKN statistic of Scholz and Stephens 1987.

Definition at line 309 of file test_statistics.py.

def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
    """
    Compute the A2akN statistic (equation 7 of Scholz and Stephens).

    Parameters
    ----------
    samples : sequence of 1-D array_like
        Array of sample arrays.
    Z : array_like
        Sorted array of all observations.
    Zstar : array_like
        Sorted array of unique observations.
    k : int
        Number of samples.
    n : array_like
        Number of observations in each sample.
    N : int
        Total number of observations.

    Returns
    -------
    A2aKN : float
        The A2aKN statistic of Scholz and Stephens 1987.
    """

    if DEBUG:
        global statistic_seq
        statistic_seq = []

    A2akN = 0.
    Z_ssorted_left = Z.searchsorted(Zstar, 'left')
    if N == Zstar.size:
        lj = 1.
    else:
        lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
    Bj = Z_ssorted_left + lj / 2.
    for i in np.arange(0, k):
        s = np.sort(samples[i])
        s_ssorted_right = s.searchsorted(Zstar, side='right')
        Mij = s_ssorted_right.astype(float)
        fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
        Mij -= fij / 2.
        inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
        A2akN += inner.sum() / n[i]

    if DEBUG:
        statistic_seq.append(A2akN*(N-1.)/N)

    A2akN *= (N - 1.) / N
    return A2akN
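This helper expects its bookkeeping arguments to be prepared exactly as anderson_ksamp() does below. A minimal sketch of driving it directly (the flat import path is an assumption; adjust it to wherever test_statistics sits in your checkout):

import numpy as np
from test_statistics import _anderson_ksamp_midrank  # assumed import path

rng = np.random.RandomState(0)
samples = [rng.normal(size=50), rng.normal(loc=0.3, size=40)]

Z = np.sort(np.hstack(samples))           # sorted pooled observations
Zstar = np.unique(Z)                      # sorted unique observations
k = len(samples)                          # number of samples
n = np.array([s.size for s in samples])   # observations per sample
N = Z.size                                # total number of observations

A2akN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
print(A2akN)  # ties-adjusted statistic, eq. 7 of Scholz & Stephens (1987)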

◆ _anderson_ksamp_midrank_binned()

python.test_statistics._anderson_ksamp_midrank_binned(data, Z, Zstar, k, n, N)
protected

Definition at line 288 of file test_statistics.py.

def _anderson_ksamp_midrank_binned(data, Z, Zstar, k, n, N):
    # Binned analogue of _anderson_ksamp_midrank: Z holds the summed bin
    # contents of all histograms, lj the number of tied observations per bin,
    # and the midrank counts Mij/Bj come from cumulative sums instead of
    # searchsorted.

    if DEBUG:
        global statistic_seq
        statistic_seq = []

    A2akN = 0.
    lj = Z
    Bj = Z.cumsum() - lj/2
    for i in np.arange(0, k):
        d = data[i]
        Mij = d.cumsum() - d/2
        inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
        A2akN += np.nan_to_num(inner).sum() / n[i]

    if DEBUG:
        statistic_seq.append(A2akN*(N-1.)/N)

    A2akN *= (N - 1.) / N
    return A2akN

◆ _anderson_ksamp_right()

python.test_statistics._anderson_ksamp_right(samples, Z, Zstar, k, n, N)
protected
Compute the A2kN statistic (equation 6 of Scholz & Stephens).

Parameters
----------
samples : sequence of 1-D array_like
    Array of sample arrays.
Z : array_like
    Sorted array of all observations.
Zstar : array_like
    Sorted array of unique observations.
k : int
    Number of samples.
n : array_like
    Number of observations in each sample.
N : int
    Total number of observations.

Returns
-------
A2kN : float
    The A2kN statistic of Scholz and Stephens 1987.

Definition at line 361 of file test_statistics.py.

def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
    """
    Compute the A2kN statistic (equation 6 of Scholz & Stephens).

    Parameters
    ----------
    samples : sequence of 1-D array_like
        Array of sample arrays.
    Z : array_like
        Sorted array of all observations.
    Zstar : array_like
        Sorted array of unique observations.
    k : int
        Number of samples.
    n : array_like
        Number of observations in each sample.
    N : int
        Total number of observations.

    Returns
    -------
    A2kN : float
        The A2kN statistic of Scholz and Stephens 1987.
    """

    A2kN = 0.
    lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], 'left')
    Bj = lj.cumsum()
    for i in range(0, k):
        s = np.sort(samples[i])
        Mij = s.searchsorted(Zstar[:-1], side='right')
        inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
        A2kN += inner.sum() / n[i]
    return A2kN

◆ _log_binomial()

python.test_statistics._log_binomial(n, k)
protected

Definition at line 542 of file test_statistics.py.

@np.vectorize
def _log_binomial(n, k):
    # log C(n, k) as a difference of log-factorial sums (see _log_fac below);
    # the branch evaluates the cheaper of the two equivalent forms.
    if k > (n-k):
        return (_log_fac(n-k+1, n) - _log_fac(2, k))
    else:
        return (_log_fac(k+1, n) - _log_fac(2, n-k))

◆ _log_fac()

python.test_statistics._log_fac(m, n)
protected

Definition at line 539 of file test_statistics.py.

def _log_fac(m, n):
    # sum of log(i) for i = m..n, i.e. log(n! / (m-1)!)
    return np.log(np.arange(m, n+1)).sum()

(The @np.vectorize decorator at line 541 of the source decorates _log_binomial and is shown with that function above.)
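Together these implement log C(n, k) as a difference of log-factorial sums, which avoids overflowing the factorials for large n. A quick sanity check (a sketch; the import path is an assumption) against the log-gamma form of the binomial coefficient:

import numpy as np
import scipy.special
from test_statistics import _log_binomial  # assumed import path

n, k = 20, 7
# log C(n, k) = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1)
via_gammaln = (scipy.special.gammaln(n+1) - scipy.special.gammaln(k+1)
               - scipy.special.gammaln(n-k+1))
print(np.allclose(_log_binomial(n, k), via_gammaln))  # expect True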

◆ _loglikelihood()

python.test_statistics._loglikelihood(data1, data2, N_data1, N_data2, ratio, H="H0")
protected

Definition at line 548 of file test_statistics.py.

def _loglikelihood(data1, data2, N_data1, N_data2, ratio, H="H0"):
    t = data1 + data2
    logC = 0.
    if not ratio:
        # absolute likelihood: keep the binomial normalisation term
        logC = float(sum(log(binomial(t[i], data2[i])) for i in range(t.shape[0])))
    if H == "H1":
        a = data2/data1
    if H == "H0":
        a = N_data2 / N_data1

    if DEBUG:
        global statistic_seq
        statistic_seq = -2 * (logC - scipy.special.xlogy(t, 1+a) + scipy.special.xlogy(data2, a))

    return -2 * (logC - scipy.special.xlogy(t, 1+a).sum() + scipy.special.xlogy(data2, a).sum())
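The return value is -2 log L for two histograms under the chosen hypothesis: under H0 both histograms share one underlying distribution (a is the scalar ratio of the totals), while under H1 each bin gets its own ratio. A hedged sketch of calling it directly (import path assumed):

import numpy as np
from test_statistics import _loglikelihood  # assumed import path

data1 = np.array([12., 30., 8.])   # bin counts, histogram 1
data2 = np.array([10., 28., 12.])  # bin counts, histogram 2
m2ll_H0 = _loglikelihood(data1, data2, data1.sum(), data2.sum(),
                         ratio=True, H="H0")  # ratio=True drops the binomial constant
print(m2ll_H0)  # -2 log L under the common-distribution hypothesis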

◆ _zloglikelihood()

python.test_statistics._zloglikelihood(data1, data2, N_data1, N_data2, H="H0")
protected

Definition at line 564 of file test_statistics.py.

def _zloglikelihood(data1, data2, N_data1, N_data2, H="H0"):
    # contribution of bins that are empty in exactly one of the two histograms
    z_not_in_both = (data1 == 0) ^ (data2 == 0)
    data1 = data1[z_not_in_both]
    data2 = data2[z_not_in_both]
    t = data1 + data2
    if H == "H0":
        p = N_data1 / (N_data1 + N_data2)

        if DEBUG:
            global statistic_seq
            statistic_seq = np.concatenate([statistic_seq, -2 * scipy.special.xlogy(t[data2 == 0], p), scipy.special.xlogy(t[data1 == 0], 1-p)])

        return -2 * (scipy.special.xlogy(t[data2 == 0], p).sum() + scipy.special.xlogy(t[data1 == 0], 1-p).sum())
    if H == "H1":
        return 0.

◆ anderson_ksamp()

python.test_statistics.anderson_ksamp(data1, data2, binned=False, midrank=True)
The Anderson-Darling test for 2 samples.

It tests the null hypothesis
that the 2 samples are drawn from the same population without
having to specify the distribution function of that population.
The critical values depend on the number of samples.

Parameters
----------
data1, data2 : 1-D array_like
    Input data. Either arrays of sample observations or, when binned is
    True, observed frequencies in each bin.
binned : bool, optional, default: False
    If True, treat the inputs as observed frequencies; if False, as samples.
midrank : bool, optional, default: True
    If True, use the midrank empirical distribution function (ties-adjusted);
    if False, the right-side empirical distribution function.

Returns
-------
statistic : float
    AD statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value.
    Always NaN in this case.

Raises
------
ValueError
    If fewer than 2 samples are provided, a sample is empty, or the
    samples contain no distinct observations.

See Also
--------
ks_2samp : 2 sample Kolmogorov-Smirnov test

Notes
-----
This code is modified from scipy.stats.anderson and extended to
support frequency (binned) data. See [1]_ for further information.

Reference [2]_ defines three versions of the k-sample Anderson-Darling
test: one for continuous distributions and two for discrete
distributions, in which ties between samples may occur. The
default of this routine is to compute the version based on the
midrank empirical distribution function. This test is applicable
to continuous and discrete data. If midrank is set to False, the
right-side empirical distribution is used for a test for discrete
data. According to [1]_, the two discrete test statistics differ
only slightly if a few collisions due to round-off errors occur in
the test not adjusted for ties between samples.

References
----------
.. [1] scipy.stats.anderson — SciPy Reference Guide
       http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html
.. [2] Scholz, F. W and Stephens, M. A. (1987), K-Sample
       Anderson-Darling Tests, Journal of the American Statistical
       Association, Vol. 82, pp. 918-924.

Examples
--------
>>> from scipy import stats
>>> np.random.seed(314159)

The null hypothesis that the two random samples come from the same
distribution can be rejected at the 5% level because the returned
test value is greater than the critical value for 5% (1.961) but
not at the 2.5% level. The interpolation gives an approximate
significance level of 3.1%:

>>> stats.anderson_ksamp([np.random.normal(size=50),
... np.random.normal(loc=0.5, size=30)])
(2.4615796189876105,
  array([ 0.325,  1.226,  1.961,  2.718,  3.752]),
  0.03134990135800783)


The null hypothesis cannot be rejected for three samples from an
identical distribution. The approximate p-value (87%) has to be
computed by extrapolation and may not be very accurate:

>>> stats.anderson_ksamp([np.random.normal(size=50),
... np.random.normal(size=30), np.random.normal(size=20)])
(-0.73091722665244196,
  array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856]),
  0.8789283903979661)

Definition at line 398 of file test_statistics.py.

def anderson_ksamp(data1, data2, binned=False, midrank=True):
    """The Anderson-Darling test for 2 samples.

    It tests the null hypothesis
    that the 2 samples are drawn from the same population without
    having to specify the distribution function of that population.
    The critical values depend on the number of samples.

    Parameters
    ----------
    data1, data2 : 1-D array_like
        Input data. Either arrays of sample observations or, when binned is
        True, observed frequencies in each bin.
    binned : bool, optional, default: False
        If True, treat the inputs as observed frequencies; if False, as samples.
    midrank : bool, optional, default: True
        If True, use the midrank empirical distribution function (ties-adjusted);
        if False, the right-side empirical distribution function.

    Returns
    -------
    statistic : float
        AD statistic
    pvalue : float
        two-tailed p-value
    ndf : NaN
        "Degrees of freedom": the degrees of freedom for the p-value.
        Always NaN in this case.

    Raises
    ------
    ValueError
        If fewer than 2 samples are provided, a sample is empty, or the
        samples contain no distinct observations.

    See Also
    --------
    ks_2samp : 2 sample Kolmogorov-Smirnov test

    Notes
    -----
    This code is modified from scipy.stats.anderson and extended to
    support frequency (binned) data. See [1]_ for further information.

    Reference [2]_ defines three versions of the k-sample Anderson-Darling
    test: one for continuous distributions and two for discrete
    distributions, in which ties between samples may occur. The
    default of this routine is to compute the version based on the
    midrank empirical distribution function. This test is applicable
    to continuous and discrete data. If midrank is set to False, the
    right-side empirical distribution is used for a test for discrete
    data. According to [1]_, the two discrete test statistics differ
    only slightly if a few collisions due to round-off errors occur in
    the test not adjusted for ties between samples.

    References
    ----------
    .. [1] scipy.stats.anderson — SciPy Reference Guide
           http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html
    .. [2] Scholz, F. W and Stephens, M. A. (1987), K-Sample
           Anderson-Darling Tests, Journal of the American Statistical
           Association, Vol. 82, pp. 918-924.

    Examples
    --------
    >>> from scipy import stats
    >>> np.random.seed(314159)

    The null hypothesis that the two random samples come from the same
    distribution can be rejected at the 5% level because the returned
    test value is greater than the critical value for 5% (1.961) but
    not at the 2.5% level. The interpolation gives an approximate
    significance level of 3.1%:

    >>> stats.anderson_ksamp([np.random.normal(size=50),
    ... np.random.normal(loc=0.5, size=30)])
    (2.4615796189876105,
      array([ 0.325,  1.226,  1.961,  2.718,  3.752]),
      0.03134990135800783)

    The null hypothesis cannot be rejected for three samples from an
    identical distribution. The approximate p-value (87%) has to be
    computed by extrapolation and may not be very accurate:

    >>> stats.anderson_ksamp([np.random.normal(size=50),
    ... np.random.normal(size=30), np.random.normal(size=20)])
    (-0.73091722665244196,
      array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856]),
      0.8789283903979661)
    """
    samples = [data1, data2]
    k = len(samples)
    if (k < 2):
        raise ValueError("anderson_ksamp needs at least two samples")

    samples = list(map(np.asarray, samples))
    if binned:
        Z = Zstar = sum(samples)
        n = np.array([sample.sum() for sample in samples])
        N = n.sum()
        A2kN = _anderson_ksamp_midrank_binned(samples, Z, Zstar, k, n, N)
    else:
        Z = np.sort(np.hstack(samples))
        N = Z.size
        Zstar = np.unique(Z)
        if Zstar.size < 2:
            raise ValueError("anderson_ksamp needs more than one distinct "
                             "observation")

        n = np.array([sample.size for sample in samples])
        if any(n == 0):
            raise ValueError("anderson_ksamp encountered sample without "
                             "observations")

        if midrank:
            A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
        else:
            A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)

    H = (1. / n).sum()
    hs_cs = (1. / np.arange(N - 1, 1, -1)).cumsum()
    h = hs_cs[-1] + 1
    g = (hs_cs / np.arange(2, N)).sum()

    a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
    b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
    c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
    d = (2*h + 6)*k**2 - 4*h*k
    sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
    m = k - 1
    A2 = (A2kN - m) / np.sqrt(sigmasq)

    # The b_i values are the interpolation coefficients from Table 2
    # of Scholz and Stephens 1987
    b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326])
    b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822])
    b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396])
    critical = b0 + b1 / np.sqrt(m) + b2 / m
    pf = np.polyfit(critical, np.log(np.array([0.25, 0.1, 0.05, 0.025, 0.01])), 2)
    if A2 < critical.min() or A2 > critical.max():
        warnings.warn("approximate p-value will be computed by extrapolation")  # noqa: B028 (no stacklevel needed)

    p = np.exp(np.polyval(pf, A2))
    return Anderson_ksampResult(A2, p, float('nan'))
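Unlike scipy.stats.anderson_ksamp, this wrapper takes the two samples as separate arguments and returns a (statistic, pvalue, ndf) namedtuple. A usage sketch (import path assumed):

import numpy as np
from test_statistics import anderson_ksamp  # assumed import path

rng = np.random.RandomState(314159)
res = anderson_ksamp(rng.normal(size=50), rng.normal(loc=0.5, size=30))
print(res.statistic, res.pvalue)  # res.ndf is always NaN for this test

# binned mode: pass the bin contents of two histograms instead of samples
hist1 = np.array([5., 20., 40., 20., 5.])
hist2 = np.array([6., 18., 42., 19., 7.])
print(anderson_ksamp(hist1, hist2, binned=True))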

◆ BDM_2samp()

python.test_statistics.BDM_2samp(data1, data2, normed=True, binned=True)
The Bhattacharyya test tests the null hypothesis that 2 given frequencies are drawn
from the same distribution.    

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional
    If True, shape comparison test; if False, value comparison test.
binned : bool, optional
    If True, the inputs are observed frequencies; if False, samples.

Returns
-------
statistic : float
    The Bhattacharyya distance measure.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 degrees of freedom,
    where k is the number of bins with a nonzero count in at least one
    histogram.

Definition at line 196 of file test_statistics.py.

def BDM_2samp(data1, data2, normed=True, binned=True):
    """
    The Bhattacharyya test tests the null hypothesis that 2 given frequencies are
    drawn from the same distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Observed frequencies in each category.
    normed : bool, optional
        If True, shape comparison test; if False, value comparison test.
    binned : bool, optional
        If True, the inputs are observed frequencies; if False, samples.

    Returns
    -------
    statistic : float
        The Bhattacharyya distance measure.
    pvalue : float
        Corresponding p-value.
    ndf : int
        "Degrees of freedom": the degrees of freedom for the p-value. The p-value
        is computed using a chi-squared distribution with k - 1 degrees of freedom,
        where k is the number of bins with a nonzero count in at least one
        histogram.
    """
    if binned:
        filter = ~((data1 == 0.) & (data2 == 0.))
        data1 = data1[filter]
        data2 = data2[filter]
    ndf = data1.shape[0]
    N_data1 = data1.sum()
    N_data2 = data2.sum()
    bdm = np.sqrt(data1 * data2).sum()
    approx_chi2 = 2*N_data1 + 2*N_data2 - 4*bdm
    if normed:
        ndf -= 1
        bdm /= np.sqrt(N_data1 * N_data2)
    if DEBUG:
        global statistic_seq
        statistic_seq = 2*data1 + 2*data2 - 4*np.sqrt(data1 * data2)
    return BDM_2sampResult(bdm, scipy.stats.chi2.sf(approx_chi2, ndf), ndf)
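A usage sketch on two histograms (import path assumed). With normed=True the distance is divided by sqrt(N1*N2) so only the shapes are compared; the p-value comes from the chi-squared approximation 2*N1 + 2*N2 - 4*BDM:

import numpy as np
from test_statistics import BDM_2samp  # assumed import path

hist1 = np.array([5., 20., 40., 20., 5.])
hist2 = np.array([6., 18., 42., 19., 7.])
stat, pvalue, ndf = BDM_2samp(hist1, hist2, normed=True, binned=True)
print(stat, pvalue, ndf)  # ndf = (bins nonzero in either histogram) - 1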

◆ chi2_2samp()

python.test_statistics.chi2_2samp(data1, data2, normed=True, binned=True)
The chi-square test tests the null hypothesis that 2 given frequencies are drawn
from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, shape comparison test; if False, value comparison test.
binned : bool, optional, default: True
    If True, the inputs are observed frequencies; if False, samples.

Returns
-------
statistic : float
    The chi-squared test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 degrees of freedom,
    where k is the number of bins with a nonzero count in at least one
    histogram.

Note
----
This code is modified from scipy.stats.chisquare and extended to support the
two-sample case and shape comparison tests.

Definition at line 142 of file test_statistics.py.

def chi2_2samp(data1, data2, normed=True, binned=True):
    """
    The chi-square test tests the null hypothesis that 2 given frequencies are drawn
    from the same distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Observed frequencies in each category.
    normed : bool, optional, default: True
        If True, shape comparison test; if False, value comparison test.
    binned : bool, optional, default: True
        If True, the inputs are observed frequencies; if False, samples.

    Returns
    -------
    statistic : float
        The chi-squared test statistic.
    pvalue : float
        Corresponding p-value.
    ndf : int
        "Degrees of freedom": the degrees of freedom for the p-value. The p-value
        is computed using a chi-squared distribution with k - 1 degrees of freedom,
        where k is the number of bins with a nonzero count in at least one
        histogram.

    Note
    ----
    This code is modified from scipy.stats.chisquare and extended to support the
    two-sample case and shape comparison tests.
    """
    if binned is True:
        filter = ~((data1 == 0.) & (data2 == 0.))
        data1 = data1[filter]
        data2 = data2[filter]
    ndf = data1.shape[0]
    if not normed:
        chi2 = np.power(data1 - data2, 2).dot(np.power(data1 + data2, -1))
    else:
        ndf -= 1
        N_data1 = data1.sum()
        N_data2 = data2.sum()
        data1 /= N_data1
        data2 /= N_data2
        chi2 = np.power(data1 - data2, 2).dot(np.power(data1/N_data1 + data2/N_data2, -1))
    if DEBUG:
        global statistic_seq
        if not normed:
            statistic_seq = np.power(data1 - data2, 2) * (np.power(data1 + data2, -1))
        else:
            statistic_seq = np.power(data1 - data2, 2) * np.power(data1/N_data1 + data2/N_data2, -1)
    return Chi2_2sampResult(chi2, scipy.stats.chi2.sf(chi2, ndf), ndf)
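A usage sketch (import path assumed). Note the normed branch divides data1 and data2 by their totals with in-place operators, so pass float arrays (integer counts would raise a casting error):

import numpy as np
from test_statistics import chi2_2samp  # assumed import path

hist1 = np.array([5., 20., 40., 20., 5.])
hist2 = np.array([8., 19., 38., 22., 6.])
res = chi2_2samp(hist1, hist2, normed=True, binned=True)
print(res.statistic, res.pvalue, res.ndf)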

◆ CVM_2samp()

python.test_statistics.CVM_2samp(data1, data2, normed=True, binned=False)
Computes the Cramér-von Mises statistic on 2 samples/frequencies.

This is a two-sided test for the null hypothesis that 2 independent samples
are drawn from the same continuous distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Can be either observed frequencies in each category or arrays
    of sample observations.
binned : bool, optional, default: False
    If True, the inputs are observed frequencies; if False, samples.

Returns
-------
statistic : float
    CVM statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.

Definition at line 240 of file test_statistics.py.

def CVM_2samp(data1, data2, normed=True, binned=False):
    """
    Computes the Cramér-von Mises statistic on 2 samples/frequencies.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Can be either observed frequencies in each category or arrays
        of sample observations.
    binned : bool, optional, default: False
        If True, the inputs are observed frequencies; if False, samples.

    Returns
    -------
    statistic : float
        CVM statistic
    pvalue : float
        two-tailed p-value
    ndf : NaN
        "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.
    """
    N1 = data1.shape[0]
    N2 = data2.shape[0]
    N = N1 + N2
    if binned:
        E1 = data1.sum()
        E2 = data2.sum()
        cdf1 = np.cumsum(data1) / E1
        cdf2 = np.cumsum(data2) / E2
        weights = data1 + data2
    else:
        E1 = N1
        E2 = N2
        data_all = np.concatenate([data1, data2])
        cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*N1)
        cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*N2)
        weights = 1.
    cvm = (weights * (cdf1 - cdf2)**2).sum() * E1 * E2 / (E1 + E2)**2
    mean = 1./6 + 1./6/N
    var = (N + 1.) * (4.*N1*N2*N - 3*(N1**2 + N2**2) - 2*N1*N2) / (4 * N1 * N2) / N**2 / 45
    if DEBUG:
        global statistic_seq
        statistic_seq = (weights * (cdf1 - cdf2)**2) * E1 * E2 / (E1 + E2)**2
    return CVM_2sampResult(cvm, 1 - distr.CVM(cvm, mean, var), float("nan"))
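A usage sketch (import path assumed). The unbinned branch builds the empirical CDFs with searchsorted but, unlike ks_2samp, does not sort its inputs first, so pass sorted samples:

import numpy as np
from test_statistics import CVM_2samp  # assumed import path

rng = np.random.RandomState(1)
x = np.sort(rng.normal(size=200))
y = np.sort(rng.normal(loc=0.1, size=150))
stat, pvalue, ndf = CVM_2samp(x, y, binned=False)
print(stat, pvalue)  # ndf is NaN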

◆ ks_2samp()

python.test_statistics.ks_2samp(data1, data2, binned=False)
Computes the Kolmogorov-Smirnov statistic on 2 samples/frequencies.

This is a two-sided test for the null hypothesis that 2 independent samples
are drawn from the same continuous distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Can be either observed frequencies in each category or arrays
    of sample observations.
binned : bool, optional, default: False
    If True, the inputs are observed frequencies; if False, samples.

Returns
-------
statistic : float
    KS statistic
pvalue : float
    two-tailed p-value
ndf : NaN
    "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.

Notes
-----
This code is modified from scipy.stats.ks_2samp and extended to support
frequency data. See [1]_ for further information.

This tests whether 2 samples are drawn from the same distribution. Note
that, like in the case of the one-sample K-S test, the distribution is
assumed to be continuous.

This is the two-sided test; one-sided tests are not implemented.
The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

If the K-S statistic is small or the p-value is high, then we cannot
reject the hypothesis that the distributions of the two samples
are the same.

References
----------
.. [1] scipy.stats.mstats.ks_2samp — SciPy Reference Guide
       http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.ks_2samp.html
.. [2] Frank C. Porter. (2008). Testing Consistency of Two Histograms, arXiv:0804.0380 [physics.data-an].

Examples
--------
>>> from scipy import stats
>>> np.random.seed(12345678)  #fix random seed to get the same result
>>> n1 = 200  # size of first sample
>>> n2 = 300  # size of second sample

For a different distribution, we can reject the null hypothesis since the
pvalue is below 1%:

>>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
>>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
>>> stats.ks_2samp(rvs1, rvs2)
(0.20833333333333337, 4.6674975515806989e-005)

For a slightly different distribution, we cannot reject the null hypothesis
at a 10% or lower alpha since the p-value, 0.144, is higher than 10%:

>>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
>>> stats.ks_2samp(rvs1, rvs3)
(0.10333333333333333, 0.14498781825751686)

For an identical distribution, we cannot reject the null hypothesis since
the p-value is high, 41%:

>>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
>>> stats.ks_2samp(rvs1, rvs4)
(0.07999999999999996, 0.41126949729859719)

Definition at line 22 of file test_statistics.py.

def ks_2samp(data1, data2, binned=False):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples/frequencies.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Can be either observed frequencies in each category or arrays
        of sample observations.
    binned : bool, optional, default: False
        If True, the inputs are observed frequencies; if False, samples.

    Returns
    -------
    statistic : float
        KS statistic
    pvalue : float
        two-tailed p-value
    ndf : NaN
        "Degrees of freedom": the degrees of freedom for the p-value. Always NaN in this case.

    Notes
    -----
    This code is modified from scipy.stats.ks_2samp and extended to support
    frequency data. See [1]_ for further information.

    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample K-S test, the distribution is
    assumed to be continuous.

    This is the two-sided test; one-sided tests are not implemented.
    The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

    If the K-S statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples
    are the same.

    References
    ----------
    .. [1] scipy.stats.mstats.ks_2samp — SciPy Reference Guide
           http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.ks_2samp.html
    .. [2] Frank C. Porter. (2008). Testing Consistency of Two Histograms, arXiv:0804.0380 [physics.data-an].

    Examples
    --------
    >>> from scipy import stats
    >>> np.random.seed(12345678)  # fix random seed to get the same result
    >>> n1 = 200  # size of first sample
    >>> n2 = 300  # size of second sample

    For a different distribution, we can reject the null hypothesis since the
    pvalue is below 1%:

    >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
    >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    >>> stats.ks_2samp(rvs1, rvs2)
    (0.20833333333333337, 4.6674975515806989e-005)

    For a slightly different distribution, we cannot reject the null hypothesis
    at a 10% or lower alpha since the p-value, 0.144, is higher than 10%:

    >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
    >>> stats.ks_2samp(rvs1, rvs3)
    (0.10333333333333333, 0.14498781825751686)

    For an identical distribution, we cannot reject the null hypothesis since
    the p-value is high, 41%:

    >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
    >>> stats.ks_2samp(rvs1, rvs4)
    (0.07999999999999996, 0.41126949729859719)
    """
    if binned is True:
        cdf1 = np.cumsum(data1)
        cdf2 = np.cumsum(data2)
        n1 = cdf1[-1]
        n2 = cdf2[-1]
        cdf1 /= n1
        cdf2 /= n2
        ndf = data1.shape[0]
    else:
        n1 = data1.shape[0]
        n2 = data2.shape[0]
        ndf = float("nan")
    if binned is False:
        data1 = np.sort(data1)
        data2 = np.sort(data2)
        data_all = np.concatenate([data1, data2])
        cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*n1)
        cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*n2)
    if binned == 'kde':
        data1_kde = sm.nonparametric.KDEUnivariate(data1)
        data2_kde = sm.nonparametric.KDEUnivariate(data2)
        data1_kde.fit()
        data2_kde.fit()
        # pick the KDE with the smaller support grid and interpolate the other
        # onto it (the original listing compared data1_kde.support with itself,
        # which looks like a typo)
        i, min_sup = min(enumerate([data1_kde.support, data2_kde.support]),
                         key=lambda data: data[1].shape[0])
        if i == 0:
            cdf1 = data1_kde.cdf
            cdf2 = scipy.interpolate.interp1d(data2_kde.support, data2_kde.cdf)(min_sup)
        else:
            cdf1 = scipy.interpolate.interp1d(data1_kde.support, data1_kde.cdf)(min_sup)
            cdf2 = data2_kde.cdf
    d = np.max(np.absolute(cdf1 - cdf2))
    # Note: d absolute not signed distance
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = scipy.stats.distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except Exception:
        prob = 1.0
    if DEBUG:
        global statistic_seq
        statistic_seq = np.absolute(cdf1 - cdf2)
    return Ks_2sampResult(d, prob, ndf)
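A usage sketch covering both modes (import path assumed; the binned branch divides the cumulative sums in place, so use float bin contents):

import numpy as np
from test_statistics import ks_2samp  # assumed import path

rng = np.random.RandomState(12345678)
res = ks_2samp(rng.normal(size=200), rng.normal(loc=0.5, size=300))
print(res.statistic, res.pvalue)            # unbinned: ndf is NaN

hist1 = np.array([5., 20., 40., 20., 5.])
hist2 = np.array([8., 19., 38., 22., 6.])
print(ks_2samp(hist1, hist2, binned=True))  # binned: ndf = number of bins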

◆ likelihoodratio_ksamp()

python.test_statistics.likelihoodratio_ksamp(data1, data2, normed=True, binned=True)
The likelihood-ratio test tests the null hypothesis that 2 given frequencies are
drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, shape comparison test; if False, value comparison test.
binned : bool, optional, default: True
    If True, the inputs are observed frequencies; if False, samples.

Returns
-------
statistic : float
    The log-likelihood-ratio test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 degrees of freedom,
    where k is the number of bins with a nonzero count in at least one
    histogram.

Definition at line 581 of file test_statistics.py.

def likelihoodratio_ksamp(data1, data2, normed=True, binned=True):
    """
    The likelihood-ratio test tests the null hypothesis that 2 given frequencies are
    drawn from the same distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Observed frequencies in each category.
    normed : bool, optional, default: True
        If True, shape comparison test; if False, value comparison test.
    binned : bool, optional, default: True
        If True, the inputs are observed frequencies; if False, samples.

    Returns
    -------
    statistic : float
        The log-likelihood-ratio test statistic.
    pvalue : float
        Corresponding p-value.
    ndf : int
        "Degrees of freedom": the degrees of freedom for the p-value. The p-value
        is computed using a chi-squared distribution with k - 1 degrees of freedom,
        where k is the number of bins with a nonzero count in at least one
        histogram.
    """
    not_all_zeros = (data1 != 0) & (data2 != 0)
    ndf = sum((data1 != 0) | (data2 != 0))
    N_data1 = data1.sum()
    N_data2 = data2.sum()
    if normed:
        ndf -= 1
    llike_H0 = _loglikelihood(data1[not_all_zeros], data2[not_all_zeros], N_data1, N_data2, ratio=True, H="H0") + _zloglikelihood(data1[~not_all_zeros], data2[~not_all_zeros], N_data1, N_data2, H="H0")
    if DEBUG:
        global statistic_seq
        STATISTIC_SEQ = statistic_seq
    llike_H1 = _loglikelihood(data1[not_all_zeros], data2[not_all_zeros], N_data1, N_data2, ratio=True, H="H1")
    if DEBUG:
        STATISTIC_SEQ[:statistic_seq.shape[0]] -= statistic_seq
        statistic_seq = STATISTIC_SEQ
    llike_ratio = llike_H0 - llike_H1
    return LikelihoodRatio_ksampResult(llike_ratio, scipy.stats.chi2.sf(llike_ratio, ndf), ndf)
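A usage sketch (import path assumed). The statistic is -2 log(L_H0/L_H1), assembled from _loglikelihood for bins populated in both histograms and _zloglikelihood for bins empty in exactly one, and is referred to a chi-squared distribution with ndf degrees of freedom:

import numpy as np
from test_statistics import likelihoodratio_ksamp  # assumed import path

hist1 = np.array([5., 20., 40., 20., 5.])
hist2 = np.array([8., 19., 38., 22., 6.])
stat, pvalue, ndf = likelihoodratio_ksamp(hist1, hist2)
print(stat, pvalue, ndf)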

◆ likelihoodvalue_ksamp()

python.test_statistics.likelihoodvalue_ksamp(data1, data2, normed=True, binned=True)
The likelihood-value test tests the null hypothesis that 2 given frequencies are
drawn from the same distribution.

Parameters
----------
data1, data2 : sequence of 1-D ndarrays
    Input data. Observed frequencies in each category.
normed : bool, optional, default: True
    If True, shape comparison test; if False, value comparison test.
binned : bool, optional, default: True
    If True, the inputs are observed frequencies; if False, samples.

Returns
-------
statistic : float
    The log-likelihood-value test statistic.
pvalue : float
    Corresponding p-value.
ndf : int
    "Degrees of freedom": the degrees of freedom for the p-value. The p-value
    is computed using a chi-squared distribution with k - 1 degrees of freedom,
    where k is the number of bins with a nonzero count in at least one
    histogram.

Definition at line 625 of file test_statistics.py.

def likelihoodvalue_ksamp(data1, data2, normed=True, binned=True):
    """
    The likelihood-value test tests the null hypothesis that 2 given frequencies are
    drawn from the same distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Input data. Observed frequencies in each category.
    normed : bool, optional, default: True
        If True, shape comparison test; if False, value comparison test.
    binned : bool, optional, default: True
        If True, the inputs are observed frequencies; if False, samples.

    Returns
    -------
    statistic : float
        The log-likelihood-value test statistic.
    pvalue : float
        Corresponding p-value.
    ndf : int
        "Degrees of freedom": the degrees of freedom for the p-value. The p-value
        is computed using a chi-squared distribution with k - 1 degrees of freedom,
        where k is the number of bins with a nonzero count in at least one
        histogram.
    """
    no_zeros = (data1 != 0) | (data2 != 0)
    ndf = no_zeros.sum()
    if normed:
        ndf -= 1
    llike = _loglikelihood(data1[no_zeros], data2[no_zeros], data1.sum(), data2.sum(), ratio=False, H="H0")
    return LikelihoodValue_ksampResult(llike, scipy.stats.chi2.sf(llike, ndf), ndf)
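A usage sketch (import path assumed). Unlike the ratio test, this evaluates the absolute H0 likelihood including the binomial normalisation term (ratio=False in _loglikelihood):

import numpy as np
from test_statistics import likelihoodvalue_ksamp  # assumed import path

hist1 = np.array([5., 20., 40., 20., 5.])
hist2 = np.array([8., 19., 38., 22., 6.])
stat, pvalue, ndf = likelihoodvalue_ksamp(hist1, hist2)
print(stat, pvalue, ndf)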

Variable Documentation

◆ Anderson_ksampResult

python.test_statistics.Anderson_ksampResult = namedtuple('Anderson_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 397 of file test_statistics.py.

◆ BDM_2sampResult

python.test_statistics.BDM_2sampResult = namedtuple('BDM_2sampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 195 of file test_statistics.py.

◆ Chi2_2sampResult

python.test_statistics.Chi2_2sampResult = namedtuple('Chi2_2sampResult', ('statistic', 'pvalue','ndf'))

Definition at line 141 of file test_statistics.py.

◆ CVM_2sampResult

python.test_statistics.CVM_2sampResult = namedtuple('CVM_2sampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 239 of file test_statistics.py.

◆ DEBUG

bool python.test_statistics.DEBUG = False

Definition at line 17 of file test_statistics.py.

◆ Ks_2sampResult

python.test_statistics.Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue','ndf'))

Definition at line 21 of file test_statistics.py.

◆ LikelihoodRatio_ksampResult

python.test_statistics.LikelihoodRatio_ksampResult = namedtuple('LikelihoodRatio_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 580 of file test_statistics.py.

◆ LikelihoodValue_ksampResult

python.test_statistics.LikelihoodValue_ksampResult = namedtuple('LikelihoodValue_ksampResult', ('statistic', 'pvalue', 'ndf'))

Definition at line 624 of file test_statistics.py.

◆ statistic_seq

list python.test_statistics.statistic_seq = []

Definition at line 19 of file test_statistics.py.