ATLAS Offline Software
GrubbsOutlierTest.cxx
Go to the documentation of this file.
1 /*
2  Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3 */
4 
16 #include <dqm_core/AlgorithmConfig.h>
17 #include <dqm_core/AlgorithmManager.h>
18 
21 #include <ers/ers.h>
22 
23 // STL
24 #include <sstream>
25 
26 // ROOT
27 #include <TH1.h>
28 #include <TF1.h>
29 #include <TMath.h>
30 #include <TClass.h>
31 #include <cmath>
32 
33 #ifndef GTESTPARS
34 #define GTESTPARS 5
35 #endif
36 
38 
40  : m_name ( name ) {
41  dqm_core::AlgorithmManager::instance().registerAlgorithm("OutlierTest_"+ name, this );
42 }
43 
46 {
47  return new GrubbsOutlierTest( m_name );
48 }
49 
50 
53  const TObject & object,
54  const dqm_core::AlgorithmConfig & config )
55 {
56  const TH1 * histogram;
57 
58  // check if it inherits from 2d histogram
59  if ( object.IsA()->InheritsFrom("TH2") ) {
60  throw dqm_core::BadConfig( ERS_HERE , name , " dimension > 1 ");
61  }
62 
63  // check if its a 1d histogram or not
64  if ( object.IsA()->InheritsFrom("TH1") ) {
65  histogram = (TH1*)&object;
66  } else {
67  throw dqm_core::BadConfig( ERS_HERE ,name , " does not inherit from TH1");
68  }
69 
70  // Configure the DQ algorithm
71  double grThr, reThr; // Green and Red thresholds
72 
73  // param 1: skip bin mod(N)
74  const int skipmod = (int)dqm_algorithms::tools::GetFirstFromMap("SkipBinModulo", config.getParameters(), histogram->GetNbinsX() + 10);
75  if (skipmod < 1) {
76  std::ostringstream msg;
77  msg << " Parameter \"SkipBinModulo\" is out of bounds. Value given = " << skipmod;
78  msg << "\n Range of values acceptable: integer >= 1";
79  msg << "\n Default Value: skip none. Do not set \"SkipBinModulo\" in your config to use default";
80  throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
81  }
82 
83  // param 2: significance level
84  const double signL = dqm_algorithms::tools::GetFirstFromMap("Significance", config.getParameters(), 0.04);
85  if (signL < 0.01 || signL > 0.95) {
86  std::ostringstream msg;
87  msg << " Parameter \"Significance\" is out of bounds. Value given = " << signL;
88  msg << "\n Range of values acceptable: floating point number [0.01, 0.95]";
89  msg << "\n Default Value: 0.04. Do not set \"Significance\" in your config to use default";
90  throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
91  }
92 
93  // param 3: min stat per bin
94  const double minStat = dqm_algorithms::tools::GetFirstFromMap("MinStatPerBin", config.getParameters(), 10);
95  if (minStat < 0) {
96  std::ostringstream msg;
97  msg << " Parameter \"MinStatPerBin\" is out of bounds. Value given = " << signL;
98  msg << "\n Range of values acceptable: integer >= 0";
99  msg << "\n Default Value: 10. Do not set \"MinStatPerBin\" in your config to use default";
100  throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
101  }
102 
103  // param 4,5: min and max bins
104  // the following is a duplicate of dqm_algorithms::tools::GetBinRange, but with different parameter names
105  const double notFound = -99999;
106  const double xmin = dqm_algorithms::tools::GetFirstFromMap("Min", config.getParameters(), notFound);
107  const double xmax = dqm_algorithms::tools::GetFirstFromMap("Max", config.getParameters(), notFound);
108  const int minbin = (xmin == notFound) ? 1 : histogram->GetXaxis()->FindBin(xmin);
109  const int maxbin = (xmax == notFound) ? histogram->GetNbinsX() : histogram->GetXaxis()->FindBin(xmax);
110  ERS_DEBUG(1, "xmin = " << xmin << ", xmax = " << xmax << " , minbin = " << minbin << ", maxbin = " << maxbin);
111 
112  // param 6: include bins with entries < minStat as outliers?
113  const bool useMinStat = static_cast<bool>(dqm_algorithms::tools::GetFirstFromMap("IsMinStatOutlier", config.getParameters(), 1));
114  ERS_DEBUG(1, "useMinStat = " << useMinStat);
115 
116  // Get Green and Red Thresholds
117  try {
118 
119  grThr = dqm_algorithms::tools::GetFromMap("Threshold",config.getGreenThresholds() );
120  reThr = dqm_algorithms::tools::GetFromMap("Threshold",config.getRedThresholds() );
121 
122  } catch ( dqm_core::Exception & ex ) {
123  throw dqm_core::BadConfig(ERS_HERE,name,"Paramter: 'Threshold' is mandatory, cannot continue");
124  }
125 
126  if ( grThr > 1.0 || reThr > 1.0) {
127  throw dqm_core::BadConfig(ERS_HERE,m_name,"Configuration Error: Threshold>100%");
128  }
129 
130  //loop over bins and store values
131  int binc = 0;
132  std::vector<float> binCountVec;
133  std::vector<size_t> binIndices;
134 
135  // find mean, sigma max and min binvals
136  for(int k = minbin; k < maxbin; k++) {
137  // skip every N'th bin
138  if (k % skipmod == 0) {}
139  else {
140  float binval = histogram->GetBinContent(k);
141  binCountVec.push_back(binval);
142  if((binval <= minStat) && useMinStat) {
143  binIndices.push_back(binc);
144  }
145  binc++;
146  } // else
147  } // for k
148 
149  // find number of (and fraction of) outliers
150  int minOutliers = binIndices.size();
151  int maxOutliers, nOutliers = minOutliers;
152  int nbinsDenominator = binCountVec.size();
153 
155 
156  std::ostringstream msg;
157  try {
158  // invoke Grubbs' test
159  maxOutliers = GrubbsTest(binCountVec,binIndices,signL);
160 
161  // debug
162  msg<< "maxOutliers = " << maxOutliers << "\t bincountVecsize = " << nbinsDenominator << "\t";
163  if(binIndices.size() <= binCountVec.size())
164  for(size_t g= 0; g < binIndices.size(); g++)
165  msg << binIndices[g] << ":" << binCountVec[binIndices[g]] <<"\n";
166  ERS_DEBUG(1, msg.str());
167 
168  } catch ( dqm_core::Exception & ex ) {
169  msg << " Exception caught in GrubbsTest \n";
170  throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
171  }
172 
173  // no. of outliers
174  if(maxOutliers >= 0) {
175  nOutliers += (useMinStat ? maxOutliers : 0);
176  }
177  result->tags_.insert(std::make_pair("NumOfOutliers",nOutliers));
178 
179  // fraction of outliers
180  float totalBins = nbinsDenominator > 0 ? nbinsDenominator : histogram->GetNbinsX();
181  float outlierFraction = 0.;
182 #ifndef __APPLE__
183  bool IsInf = isinf(float(nOutliers) / totalBins);
184  bool IsNaN = isnan(float(nOutliers) / totalBins);
185 #else
186  bool IsInf = std::isinf(float(nOutliers) / totalBins);
187  bool IsNaN = std::isnan(float(nOutliers) / totalBins);
188 #endif
189  if(!IsInf || !IsNaN)
190  outlierFraction = float(nOutliers) / totalBins;
191 
192  result->tags_.insert(std::make_pair("OutlierFraction",outlierFraction));
193  msg.str("");
194  msg << "Outlier fraction = "<< nOutliers << "/" << totalBins << " = " << outlierFraction;
195  ERS_DEBUG(1, msg.str());
196 
197 
198  // compare red/yellow/green thresholds with outlierfraction
199  // and return results
200  if (reThr> grThr) {
201  if ( outlierFraction>reThr )
202  {
203  ERS_DEBUG(1,"[RED] Result : "<<outlierFraction);
204  result->status_=dqm_core::Result::Red;
205  return result;
206  }
207  else if ( outlierFraction > grThr )
208  {
209  ERS_DEBUG(1,"[YELLOW] Result : "<<outlierFraction);
210  result->status_=dqm_core::Result::Yellow;
211  return result;
212  }
213  }else {
214  if ( outlierFraction < reThr )
215  {
216  ERS_DEBUG(1,"[RED] Result : "<<outlierFraction);
217  result->status_=dqm_core::Result::Red;
218  return result;
219  }
220  else if ( outlierFraction < grThr )
221  {
222  ERS_DEBUG(1,"[YELLOW] Result : "<<outlierFraction);
223  result->status_=dqm_core::Result::Yellow;
224  return result;
225  }
226  }
227 
228  // return (default) result
229  ERS_DEBUG(1,"[GREEN] Result");
231  return result;
232 
233 }
234 
235 // Algorithm implementation
236 int dqm_algorithms::GrubbsOutlierTest::GrubbsTest(std::vector<float>& vec, std::vector<size_t>&binIndices, float signLevel) {
237  // return codes:
238  // >= 0 : number of outliers found
239  // -1 : input vector empty
240  // -2 : less then two bins have occupancy (need >= 2 bins filled for this test)
241  // -3 : significance level > 100% (nonsensical)
242  // -4 : significance level < 0.1% (is never achievable in most cases)
243 
244  // negative return codes
245  if(vec.empty()) return -1;
246 
247  size_t nItems = vec.size();
248  if(vec.size() < 2) return -2;
249 
250  float epsilon = 1.e-5;
251  if(signLevel > 1.0) return -3;
252 
253  if((signLevel-0.001) < epsilon) return -4;
254 
255  bool notEmpty = !binIndices.empty();
256 
257  // find mean
258  float mean = 0.;
259  for(size_t j = 0; j < nItems; j++) mean += vec[j];
260  mean /= nItems;
261 
262  // find standard deviation (sd)
263  float sigma = 0.;
264  for(size_t j = 0; j < nItems; j++) {
265  float diff = (vec[j]-mean);
266  sigma += diff*diff;
267  }
268  sigma /= (nItems-1);
269  sigma = std::sqrt(sigma);
270 
271  float zval = 0., tval = 0., prob = 0.;
272  int retval = 0;
273  // loop over bins and...
274  for(size_t j = 0; j < nItems; j++) {
275 
276  // define test statistic z = |mean - val|/sigma
277  zval = std::abs(mean - vec[j])/sigma;
278 
279  // find out the critical value (tval) for the test statistic
280  tval = std::sqrt(nItems*(nItems-2)*(zval*zval)/((nItems-1)*(nItems-1)-nItems*(zval*zval)));
281 
282  // determine the probability (two sided student's t pdf with N-2 d.o.f) of tval being
283  // far away (determined by significance level) from others so as to be considered an outlier
284  prob = 2*(1. - TMath::StudentI(tval,nItems-2));
285 
286  // if p-value falls below significance level, this bin is an outlier
287  if(prob < signLevel) {
288  if(notEmpty) {
289  bool storeMe = true;
290  for(size_t k = 0; k < binIndices.size(); k++) {
291  if(binIndices[k] == j) { storeMe = false; break; }
292  } // for
293  if(storeMe) {
294  binIndices.push_back(j);
295  retval++;
296  }
297  } else {
298  binIndices.push_back(j);
299  retval ++;
300  }
301  } // if prob
302  } // for j
303 
304  // return #of outliers found
305  return retval;
306 
307 }
308 
309  void
311  out << "-------------- Grubbs' Test for Outliers ----------------------------------------------------" << std::endl;
312  out << "Identify outlier bins in a 1d histogram. Given N = {n1, n2, n3...} where n1 = frequency in 1st bin etc..\n";
313  out << "Define test statistic: Z_i = |n_i - mean| / sigma, where mean = sum(n_i)/sizeof(N) and sigma = standard deviation of N\n";
314  out << "For each Z_i calculate the critical value (tval) of the test-statistic and find out its corresponding p-value. if p-value\n";
315  out << "p-value is calculated using \"two-sided t-distribution\" pdf with sizeof(N)-2 degrees of freedom.\n";
316  out << "If p-value <= significance, reject null hypothesis (H_0 = no outliers).\n";
317  out << "----------------------------------------------------------------------------------------------" << std::endl;
318  out << "Parameters : IsMinStatOutlier, Min, Max, MinStatPerBin, Significance, SkipBinModulo\n";
319  out << "----------------------------------------------------------------------------------------------" << std::endl;
320  out << "IsMinStatOutlier: If bin contents are less than MinStatPerBin, is it an outlier? [type = bool] [values = 0, 1] [ default: 1]\n";
321  out << "Min, Max : Process bin numbers corresponding to Min, Max (on x-axis). [type = float] [default: all bins]\n";
322  out << "MinStatPerBin : Min number of entries per bin in order to be considered for testing. [type = integer] [values >= 0] [default = 9]\n";
323  out << "Significance : Null hypothesis (no outliers) is rejected if critical value > significance [type = float] [range = [0.04, 0.99], default = 0.04]\n";
324  out << "SkipBinModulo : Skip every N'th bin. [type = integer] [values = [0, #of bins in 1d hist], default: skip none]\n";
325  out << "----------------------------------------------------------------------------------------------" << std::endl;
326 }
pdg_comparison.sigma
sigma
Definition: pdg_comparison.py:324
mean
void mean(std::vector< double > &bins, std::vector< double > &values, const std::vector< std::string > &files, const std::string &histname, const std::string &tplotname, const std::string &label="")
Definition: dependence.cxx:254
get_generator_info.result
result
Definition: get_generator_info.py:21
IsA
#define IsA
Declare the TObject style functions.
Definition: xAODTEventBranch.h:59
CaloCellPos2Ntuple.int
int
Definition: CaloCellPos2Ntuple.py:24
GrubbsOutlierTest.h
outlier-bins in 1d histogram
dqm_algorithms::GrubbsOutlierTest::execute
dqm_core::Result * execute(const std::string &, const TObject &, const dqm_core::AlgorithmConfig &)
Definition: GrubbsOutlierTest.cxx:52
python.AthDsoLogger.out
out
Definition: AthDsoLogger.py:71
mc.diff
diff
Definition: mc.SFGenPy8_MuMu_DD.py:14
vec
std::vector< size_t > vec
Definition: CombinationsGeneratorTest.cxx:12
covarianceTool.prob
prob
Definition: covarianceTool.py:678
dqm_algorithms::GrubbsOutlierTest::printDescription
void printDescription(std::ostream &out)
Definition: GrubbsOutlierTest.cxx:310
GrubbsTest
dqm_algorithms::GrubbsOutlierTest GrubbsTest("Grubbs")
config
Definition: PhysicsAnalysis/AnalysisCommon/AssociationUtils/python/config.py:1
dqm_algorithms::GrubbsOutlierTest::GrubbsTest
int GrubbsTest(std::vector< float > &vec, std::vector< size_t > &binIndices, float signLevel)
Definition: GrubbsOutlierTest.cxx:236
LArCellBinning_test.retval
def retval
Definition: LArCellBinning_test.py:112
instance
std::map< std::string, double > instance
Definition: Run_To_Get_Tags.h:8
xmin
double xmin
Definition: listroot.cxx:60
python.CaloCondTools.g
g
Definition: CaloCondTools.py:15
Result
ICscStripFitter::Result Result
Definition: CalibCscStripFitter.cxx:13
dqm_algorithms::GrubbsOutlierTest::clone
GrubbsOutlierTest * clone()
Definition: GrubbsOutlierTest.cxx:45
python.handimod.Green
int Green
Definition: handimod.py:524
python.handimod.Red
Red
Definition: handimod.py:551
name
std::string name
Definition: Control/AthContainers/Root/debug.cxx:195
TH1
Definition: rootspy.cxx:268
xmax
double xmax
Definition: listroot.cxx:61
AlgorithmHelper.h
dqm_algorithms::tools::GetFromMap
const T & GetFromMap(const std::string &pname, const std::map< std::string, T > &params)
Definition: AlgorithmHelper.h:114
dqm_algorithms::GrubbsOutlierTest
Definition: GrubbsOutlierTest.h:22
dqm_algorithms::GrubbsOutlierTest::GrubbsOutlierTest
GrubbsOutlierTest(const std::string &name)
Definition: GrubbsOutlierTest.cxx:39
dqm_algorithms::tools::GetFirstFromMap
double GetFirstFromMap(const std::string &paramName, const std::map< std::string, double > &params)
Definition: AlgorithmHelper.cxx:339
python.AutoConfigFlags.msg
msg
Definition: AutoConfigFlags.py:7
readCCLHist.float
float
Definition: readCCLHist.py:83
histogram
std::string histogram
Definition: chains.cxx:52
fitman.k
k
Definition: fitman.py:528