ATLAS Offline Software
power_of_test.py
Go to the documentation of this file.
1 # Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
2 
3 from scipy.stats import rv_discrete
4 import test_statistics as TS
5 import numpy as np
6 
7 # If True, import IPyParallel package to do the parallel computation.
8 # And if IPyParallel is not installed, automatically turns it False.
9 PARALLEL = True
10 if PARALLEL:
11  try:
12  from ipyparallel import Client
13  except ImportError:
14  PARALLEL = False
15 # IPyParallel configurations.
16 # For example, to activate SSH mode:
17 # ipp_profile = {
18 # 'sshserver':'<username>@<host>',
19 # 'profile':'<IPyParallel profile directory>'
20 # }
21 # Leave it empty if you don't want to make changes.
22 ipp_profile = {'sshserver':'kingman@140.114.94.171',
23  'profile':'ssh@nthu'}
24 
25 def _hist_fill(data, bins, entries = 0, freeze = None):
26  if type(bins)==int:
27  if bins==0:
28  bins = np.arange(data.size + 1, dtype = float)
29  else:
30  bins = np.linspace(data.min(), data.max(), bins)
31  binc = (bins[1:] + bins[:-1]) / 2
32  if freeze:
33  return lambda : data
34  else:
35  rv = rv_discrete(values = (binc, data/data.sum()))
36  return lambda : np.histogram(rv.rvs(size=data.sum() if entries==0 else entries), bins)[0].astype(float)
37 
38 def rvs_pairs(data1, data2, bins = 0, kind = 'binbybin', bin_mean = 1, size = 1, entries = 0, freeze = False):
39  """
40  Compute the histogram pairs from the random variates of the given 2 samples/frequencies for size_times.
41 
42  Parameters
43  ----------
44  data1, data2 : sequence of 1-D ndarrays
45  Input data. Observed samples or frequencies.
46  bins : [scalar|1-D array], optional, default : 0
47  If bins is an int, it defines the number of equal-width bins in the given
48  range. If bins is zero, the given data will be assumed to be frequency data,
49  and it defines the bin index. If bins is an 1-D array, it defines the bin
50  edges, including the rightmost edge, allowing for non-uniform bin widths.
51  kind : [str|(str,str)], optional, default : "binbybin"
52  Bin-by-bin test
53  bin_mean : [scalar|(scalar,scalar)], optional, default : 1
54  The desired mean count in each bin of the output histograms. For example,
55  if a histogram has 10000 entries and 100 bins, the bin_mean is 100. It is
56  ignored when entries is given.
57  size : int, optional, default : 1
58  The number of repetitions for computing the histograms.
59  entries : [scalar|(scalar,scalar)], optional, default : 0
60  The desired entries of the output histograms. If entries is 0, the entries
61  size of the input data will be used instead.
62  freeze : [bool,(bool,bool)], optional. default : False
63  if True, the output histogram will be the frozen distribution of the input
64  data
65 
66  Yields
67  ------
68  hist1, hist2 : (1-D array, 1-D array)
69  Histogram pairs
70  """
71  data1 = data1.astype(float)
72  data2 = data2.astype(float)
73  entries = np.resize(entries, 2)
74  freeze = np.resize(freeze, 2)
75  hist1 = _hist_fill(data1, bins, entries[0], freeze[0])
76  hist2 = _hist_fill(data2, bins, entries[1], freeze[1])
77  for _ in range(size):
78  yield hist1(), hist2()
79 
80 def power_of_test(data1, data2, rvs_func = 'rvs_pairs', tests = ['chi2_2samp'], rvs_key = {}, test_key = {}, parallel=None, sync=True):
81  """Compute the corresponding p-values for each histrogram pairs from the random variates of the given 2 samples/frequencies for size_times.
82 
83  Parameters
84  ----------
85  data1, data2 : sequence of 1-D ndarrays
86  Input data. Observed samples or frequencies.
87  rvs_func : [callable|str], optional, default : "rvs_pairs"
88  The random variates function. The rvs_func can be either a callable or one of the following strings::
89 
90  String Description
91  "rvs_pairs" Compute the histogram pairs from the random
92  variates of the given 2 samples/frequencies
93  for size_times.
94 
95  tests : ([callable|str],...), optional, default : ["chi2_2samp"]
96  A list of *test* statistical functions. The *test* can be either a callable or one of the following strings::
97 
98  String Description
99  "chi2_2samp" Read TS.chi2_2samp for further information.
100  "BDM_2samp" Read TS.BDM_2samp for further information.
101  "likelihoodratio_ksamp" Read TS.likelihoodratio_ksamp for further information.
102  "likelihoodvalue_ksamp" Read TS.likelihoodvalue_ksamp for further information.
103  "ks_2samp" Read TS.ks_2samp for further information.
104  "anderson_ksamp" Read TS.anderson_ksamp for further information.
105  "CVM_2samp" Read TS.CVM_2samp for further information.
106 
107  rvs_key : dict, optional, default : {}
108  Keyword arguments for the rvs function, rvs_func.
109  test_key : dict, optional
110  Keyword arguments for the test statistical function, test.
111  parallel : bool, optional, default : None
112  If True, import IPyParallel package to do the parallel computation. If parallel is None,
113  the global variable PARALLEL will be used instead.
114  sync : bool, optional, default : True
115  When sync is False, an IPyParallel AsyncResult Object will be returned instead. Onyl affect
116  when parallel is True.
117 
118  Returns
119  -------
120  [p1, p2, ...] : 1-D array
121  The corresponding p-values for each histogram pairs.
122  """
123  if parallel is None: parallel = PARALLEL
124  if parallel:
125  try:
126  global client
127  client=Client(**ipp_profile)
128  size = rvs_key['size']
129  N = len(client)
130  jobs = []
131  for i in range(N):
132  rvs_key['size'] = (size//N + 1) if (i < size % N) else size//N
133  jobs.append(client[client.ids[i]].apply_async(power_of_test, data1, data2, rvs_func, tests, rvs_key, test_key, False))
134  ars = client._asyncresult_from_jobs(jobs)
135  if sync:
136  ars.wait_interactive()
137  ret = {}
138  for key, val in ars.get():
139  ret.setdefault(key, []).extend(val)
140  else:
141  return ars
142  finally:
143  client.close()
144  return ret
145  if type(rvs_func) is str:
146  rvs_func = globals()[rvs_func]
147  if type(tests) not in (list, tuple):
148  tests = [tests]
149  tests = [(t, getattr(TS, t)) if type(t)==str else (str(t), t) for t in tests]
150  ret = {}
151  for rvs1, rvs2 in rvs_func(data1,data2,**rvs_key):
152  for tname, test in tests:
153  ret.setdefault(tname, []).append(test(rvs1, rvs2, binned=True, **test_key).pvalue)
154  return ret
python.power_of_test.power_of_test
def power_of_test(data1, data2, rvs_func='rvs_pairs', tests=['chi2_2samp'], rvs_key={}, test_key={}, parallel=None, sync=True)
Definition: power_of_test.py:80
FakeBkgTools::Client
Client
Definition: FakeBkgInternals.h:141
TrigInDetValidation_Base.test
test
Definition: TrigInDetValidation_Base.py:144
dumpHVPathFromNtuple.append
bool append
Definition: dumpHVPathFromNtuple.py:91
python.power_of_test.rvs_pairs
def rvs_pairs(data1, data2, bins=0, kind='binbybin', bin_mean=1, size=1, entries=0, freeze=False)
Definition: power_of_test.py:38
plotBeamSpotVxVal.range
range
Definition: plotBeamSpotVxVal.py:195
CompareRootFiles.hist1
hist1
Definition: CompareRootFiles.py:36
python.CaloScaleNoiseConfig.type
type
Definition: CaloScaleNoiseConfig.py:78
python.power_of_test._hist_fill
def _hist_fill(data, bins, entries=0, freeze=None)
Definition: power_of_test.py:25
str
Definition: BTagTrackIpAccessor.cxx:11
CompareRootFiles.hist2
hist2
Definition: CompareRootFiles.py:37