ATLAS Offline Software
Loading...
Searching...
No Matches
power_of_test.py
Go to the documentation of this file.
1# Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
2
3from scipy.stats import rv_discrete
4import test_statistics as TS
5import numpy as np
6
7# If True, import IPyParallel package to do the parallel computation.
8# And if IPyParallel is not installed, automatically turns it False.
9PARALLEL = True
10if PARALLEL:
11 try:
12 from ipyparallel import Client
13 except ImportError:
14 PARALLEL = False
15# IPyParallel configurations.
16# For example, to activate SSH mode:
17# ipp_profile = {
18# 'sshserver':'<username>@<host>',
19# 'profile':'<IPyParallel profile directory>'
20# }
21# Leave it empty if you don't want to make changes.
22ipp_profile = {'sshserver':'kingman@140.114.94.171',
23 'profile':'ssh@nthu'}
24
25def _hist_fill(data, bins, entries = 0, freeze = None):
26 if type(bins)==int:
27 if bins==0:
28 bins = np.arange(data.size + 1, dtype = float)
29 else:
30 bins = np.linspace(data.min(), data.max(), bins)
31 binc = (bins[1:] + bins[:-1]) / 2
32 if freeze:
33 return lambda : data
34 else:
35 rv = rv_discrete(values = (binc, data/data.sum()))
36 return lambda : np.histogram(rv.rvs(size=data.sum() if entries==0 else entries), bins)[0].astype(float)
37
38def rvs_pairs(data1, data2, bins = 0, kind = 'binbybin', bin_mean = 1, size = 1, entries = 0, freeze = False):
39 """
40 Compute the histogram pairs from the random variates of the given 2 samples/frequencies for size_times.
41
42 Parameters
43 ----------
44 data1, data2 : sequence of 1-D ndarrays
45 Input data. Observed samples or frequencies.
46 bins : [scalar|1-D array], optional, default : 0
47 If bins is an int, it defines the number of equal-width bins in the given
48 range. If bins is zero, the given data will be assumed to be frequency data,
49 and it defines the bin index. If bins is an 1-D array, it defines the bin
50 edges, including the rightmost edge, allowing for non-uniform bin widths.
51 kind : [str|(str,str)], optional, default : "binbybin"
52 Bin-by-bin test
53 bin_mean : [scalar|(scalar,scalar)], optional, default : 1
54 The desired mean count in each bin of the output histograms. For example,
55 if a histogram has 10000 entries and 100 bins, the bin_mean is 100. It is
56 ignored when entries is given.
57 size : int, optional, default : 1
58 The number of repetitions for computing the histograms.
59 entries : [scalar|(scalar,scalar)], optional, default : 0
60 The desired entries of the output histograms. If entries is 0, the entries
61 size of the input data will be used instead.
62 freeze : [bool,(bool,bool)], optional. default : False
63 if True, the output histogram will be the frozen distribution of the input
64 data
65
66 Yields
67 ------
68 hist1, hist2 : (1-D array, 1-D array)
69 Histogram pairs
70 """
71 data1 = data1.astype(float)
72 data2 = data2.astype(float)
73 entries = np.resize(entries, 2)
74 freeze = np.resize(freeze, 2)
75 hist1 = _hist_fill(data1, bins, entries[0], freeze[0])
76 hist2 = _hist_fill(data2, bins, entries[1], freeze[1])
77 for _ in range(size):
78 yield hist1(), hist2()
79
80def power_of_test(data1, data2, rvs_func = 'rvs_pairs', tests = ['chi2_2samp'], rvs_key = {}, test_key = {}, parallel=None, sync=True):
81 """Compute the corresponding p-values for each histrogram pairs from the random variates of the given 2 samples/frequencies for size_times.
82
83 Parameters
84 ----------
85 data1, data2 : sequence of 1-D ndarrays
86 Input data. Observed samples or frequencies.
87 rvs_func : [callable|str], optional, default : "rvs_pairs"
88 The random variates function. The rvs_func can be either a callable or one of the following strings::
89
90 String Description
91 "rvs_pairs" Compute the histogram pairs from the random
92 variates of the given 2 samples/frequencies
93 for size_times.
94
95 tests : ([callable|str],...), optional, default : ["chi2_2samp"]
96 A list of *test* statistical functions. The *test* can be either a callable or one of the following strings::
97
98 String Description
99 "chi2_2samp" Read TS.chi2_2samp for further information.
100 "BDM_2samp" Read TS.BDM_2samp for further information.
101 "likelihoodratio_ksamp" Read TS.likelihoodratio_ksamp for further information.
102 "likelihoodvalue_ksamp" Read TS.likelihoodvalue_ksamp for further information.
103 "ks_2samp" Read TS.ks_2samp for further information.
104 "anderson_ksamp" Read TS.anderson_ksamp for further information.
105 "CVM_2samp" Read TS.CVM_2samp for further information.
106
107 rvs_key : dict, optional, default : {}
108 Keyword arguments for the rvs function, rvs_func.
109 test_key : dict, optional
110 Keyword arguments for the test statistical function, test.
111 parallel : bool, optional, default : None
112 If True, import IPyParallel package to do the parallel computation. If parallel is None,
113 the global variable PARALLEL will be used instead.
114 sync : bool, optional, default : True
115 When sync is False, an IPyParallel AsyncResult Object will be returned instead. Onyl affect
116 when parallel is True.
117
118 Returns
119 -------
120 [p1, p2, ...] : 1-D array
121 The corresponding p-values for each histogram pairs.
122 """
123 if parallel is None: parallel = PARALLEL
124 if parallel:
125 try:
126 global client
127 client=Client(**ipp_profile)
128 size = rvs_key['size']
129 N = len(client)
130 jobs = []
131 for i in range(N):
132 rvs_key['size'] = (size//N + 1) if (i < size % N) else size//N
133 jobs.append(client[client.ids[i]].apply_async(power_of_test, data1, data2, rvs_func, tests, rvs_key, test_key, False))
134 ars = client._asyncresult_from_jobs(jobs)
135 if sync:
136 ars.wait_interactive()
137 ret = {}
138 for key, val in ars.get():
139 ret.setdefault(key, []).extend(val)
140 else:
141 return ars
142 finally:
143 client.close()
144 return ret
145 if type(rvs_func) is str:
146 rvs_func = globals()[rvs_func]
147 if type(tests) not in (list, tuple):
148 tests = [tests]
149 tests = [(t, getattr(TS, t)) if type(t)==str else (str(t), t) for t in tests]
150 ret = {}
151 for rvs1, rvs2 in rvs_func(data1,data2,**rvs_key):
152 for tname, test in tests:
153 ret.setdefault(tname, []).append(test(rvs1, rvs2, binned=True, **test_key).pvalue)
154 return ret
rvs_pairs(data1, data2, bins=0, kind='binbybin', bin_mean=1, size=1, entries=0, freeze=False)
_hist_fill(data, bins, entries=0, freeze=None)