ATLAS Offline Software
Loading...
Searching...
No Matches
GrubbsOutlierTest.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3*/
4
15
16#include <dqm_core/AlgorithmConfig.h>
17#include <dqm_core/AlgorithmManager.h>
18
21#include <ers/ers.h>
22
23// STL
24#include <sstream>
25
26// ROOT
27#include <TH1.h>
28#include <TF1.h>
29#include <TMath.h>
30#include <TClass.h>
31#include <cmath>
32
33#ifndef GTESTPARS
34#define GTESTPARS 5
35#endif
36
38
40 : m_name ( name ) {
41 dqm_core::AlgorithmManager::instance().registerAlgorithm("OutlierTest_"+ name, this );
42}
43
49
50
51dqm_core::Result *
53 const TObject & object,
54 const dqm_core::AlgorithmConfig & config )
55{
56 const TH1 * histogram;
57
58 // check if it inherits from 2d histogram
59 if ( object.IsA()->InheritsFrom("TH2") ) {
60 throw dqm_core::BadConfig( ERS_HERE , name , " dimension > 1 ");
61 }
62
63 // check if its a 1d histogram or not
64 if ( object.IsA()->InheritsFrom("TH1") ) {
65 histogram = static_cast<const TH1*>(&object);
66 } else {
67 throw dqm_core::BadConfig( ERS_HERE ,name , " does not inherit from TH1");
68 }
69
70 // Configure the DQ algorithm
71 double grThr, reThr; // Green and Red thresholds
72
73 // param 1: skip bin mod(N)
74 const int skipmod = (int)dqm_algorithms::tools::GetFirstFromMap("SkipBinModulo", config.getParameters(), histogram->GetNbinsX() + 10);
75 if (skipmod < 1) {
76 std::ostringstream msg;
77 msg << " Parameter \"SkipBinModulo\" is out of bounds. Value given = " << skipmod;
78 msg << "\n Range of values acceptable: integer >= 1";
79 msg << "\n Default Value: skip none. Do not set \"SkipBinModulo\" in your config to use default";
80 throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
81 }
82
83 // param 2: significance level
84 const double signL = dqm_algorithms::tools::GetFirstFromMap("Significance", config.getParameters(), 0.04);
85 if (signL < 0.01 || signL > 0.95) {
86 std::ostringstream msg;
87 msg << " Parameter \"Significance\" is out of bounds. Value given = " << signL;
88 msg << "\n Range of values acceptable: floating point number [0.01, 0.95]";
89 msg << "\n Default Value: 0.04. Do not set \"Significance\" in your config to use default";
90 throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
91 }
92
93 // param 3: min stat per bin
94 const double minStat = dqm_algorithms::tools::GetFirstFromMap("MinStatPerBin", config.getParameters(), 10);
95 if (minStat < 0) {
96 std::ostringstream msg;
97 msg << " Parameter \"MinStatPerBin\" is out of bounds. Value given = " << signL;
98 msg << "\n Range of values acceptable: integer >= 0";
99 msg << "\n Default Value: 10. Do not set \"MinStatPerBin\" in your config to use default";
100 throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
101 }
102
103 // param 4,5: min and max bins
104 // the following is a duplicate of dqm_algorithms::tools::GetBinRange, but with different parameter names
105 const double notFound = -99999;
106 const double xmin = dqm_algorithms::tools::GetFirstFromMap("Min", config.getParameters(), notFound);
107 const double xmax = dqm_algorithms::tools::GetFirstFromMap("Max", config.getParameters(), notFound);
108 const int minbin = (xmin == notFound) ? 1 : histogram->GetXaxis()->FindBin(xmin);
109 const int maxbin = (xmax == notFound) ? histogram->GetNbinsX() : histogram->GetXaxis()->FindBin(xmax);
110 ERS_DEBUG(1, "xmin = " << xmin << ", xmax = " << xmax << " , minbin = " << minbin << ", maxbin = " << maxbin);
111
112 // param 6: include bins with entries < minStat as outliers?
113 const bool useMinStat = static_cast<bool>(dqm_algorithms::tools::GetFirstFromMap("IsMinStatOutlier", config.getParameters(), 1));
114 ERS_DEBUG(1, "useMinStat = " << useMinStat);
115
116 // Get Green and Red Thresholds
117 try {
118
119 grThr = dqm_algorithms::tools::GetFromMap("Threshold",config.getGreenThresholds() );
120 reThr = dqm_algorithms::tools::GetFromMap("Threshold",config.getRedThresholds() );
121
122 } catch ( dqm_core::Exception & ex ) {
123 throw dqm_core::BadConfig(ERS_HERE,name,"Paramter: 'Threshold' is mandatory, cannot continue");
124 }
125
126 if ( grThr > 1.0 || reThr > 1.0) {
127 throw dqm_core::BadConfig(ERS_HERE,m_name,"Configuration Error: Threshold>100%");
128 }
129
130 //loop over bins and store values
131 int binc = 0;
132 std::vector<float> binCountVec;
133 std::vector<size_t> binIndices;
134
135 // find mean, sigma max and min binvals
136 for(int k = minbin; k < maxbin; k++) {
137 // skip every N'th bin
138 if (k % skipmod == 0) {}
139 else {
140 float binval = histogram->GetBinContent(k);
141 binCountVec.push_back(binval);
142 if((binval <= minStat) && useMinStat) {
143 binIndices.push_back(binc);
144 }
145 binc++;
146 } // else
147 } // for k
148
149 // find number of (and fraction of) outliers
150 int minOutliers = binIndices.size();
151 int maxOutliers, nOutliers = minOutliers;
152 int nbinsDenominator = binCountVec.size();
153
154 dqm_core::Result * result = new dqm_core::Result;
155
156 std::ostringstream msg;
157 try {
158 // invoke Grubbs' test
159 maxOutliers = GrubbsTest(binCountVec,binIndices,signL);
160
161 // debug
162 msg<< "maxOutliers = " << maxOutliers << "\t bincountVecsize = " << nbinsDenominator << "\t";
163 if(binIndices.size() <= binCountVec.size())
164 for(size_t g= 0; g < binIndices.size(); g++)
165 msg << binIndices[g] << ":" << binCountVec[binIndices[g]] <<"\n";
166 ERS_DEBUG(1, msg.str());
167
168 } catch ( dqm_core::Exception & ex ) {
169 msg << " Exception caught in GrubbsTest \n";
170 throw dqm_core::BadConfig(ERS_HERE,name,msg.str());
171 }
172
173 // no. of outliers
174 if(maxOutliers >= 0) {
175 nOutliers += (useMinStat ? maxOutliers : 0);
176 }
177 result->tags_.insert(std::make_pair("NumOfOutliers",nOutliers));
178
179 // fraction of outliers
180 float totalBins = nbinsDenominator > 0 ? nbinsDenominator : histogram->GetNbinsX();
181 float outlierFraction = 0.;
182#ifndef __APPLE__
183 bool IsInf = isinf(float(nOutliers) / totalBins);
184 bool IsNaN = isnan(float(nOutliers) / totalBins);
185#else
186 bool IsInf = std::isinf(float(nOutliers) / totalBins);
187 bool IsNaN = std::isnan(float(nOutliers) / totalBins);
188#endif
189 if(!IsInf || !IsNaN)
190 outlierFraction = float(nOutliers) / totalBins;
191
192 result->tags_.insert(std::make_pair("OutlierFraction",outlierFraction));
193 msg.str("");
194 msg << "Outlier fraction = "<< nOutliers << "/" << totalBins << " = " << outlierFraction;
195 ERS_DEBUG(1, msg.str());
196
197
198 // compare red/yellow/green thresholds with outlierfraction
199 // and return results
200 if (reThr> grThr) {
201 if ( outlierFraction>reThr )
202 {
203 ERS_DEBUG(1,"[RED] Result : "<<outlierFraction);
204 result->status_=dqm_core::Result::Red;
205 return result;
206 }
207 else if ( outlierFraction > grThr )
208 {
209 ERS_DEBUG(1,"[YELLOW] Result : "<<outlierFraction);
210 result->status_=dqm_core::Result::Yellow;
211 return result;
212 }
213 }else {
214 if ( outlierFraction < reThr )
215 {
216 ERS_DEBUG(1,"[RED] Result : "<<outlierFraction);
217 result->status_=dqm_core::Result::Red;
218 return result;
219 }
220 else if ( outlierFraction < grThr )
221 {
222 ERS_DEBUG(1,"[YELLOW] Result : "<<outlierFraction);
223 result->status_=dqm_core::Result::Yellow;
224 return result;
225 }
226 }
227
228 // return (default) result
229 ERS_DEBUG(1,"[GREEN] Result");
230 result->status_=dqm_core::Result::Green;
231 return result;
232
233}
234
235// Algorithm implementation
236int dqm_algorithms::GrubbsOutlierTest::GrubbsTest(std::vector<float>& vec, std::vector<size_t>&binIndices, float signLevel) {
237 // return codes:
238 // >= 0 : number of outliers found
239 // -1 : input vector empty
240 // -2 : less then two bins have occupancy (need >= 2 bins filled for this test)
241 // -3 : significance level > 100% (nonsensical)
242 // -4 : significance level < 0.1% (is never achievable in most cases)
243
244 // negative return codes
245 if(vec.empty()) return -1;
246
247 size_t nItems = vec.size();
248 if(vec.size() < 2) return -2;
249
250 float epsilon = 1.e-5;
251 if(signLevel > 1.0) return -3;
252
253 if((signLevel-0.001) < epsilon) return -4;
254
255 bool notEmpty = !binIndices.empty();
256
257 // find mean
258 float mean = 0.;
259 for(size_t j = 0; j < nItems; j++) mean += vec[j];
260 mean /= nItems;
261
262 // find standard deviation (sd)
263 float sigma = 0.;
264 for(size_t j = 0; j < nItems; j++) {
265 float diff = (vec[j]-mean);
266 sigma += diff*diff;
267 }
268 sigma /= (nItems-1);
269 sigma = std::sqrt(sigma);
270
271 float zval = 0., tval = 0., prob = 0.;
272 int retval = 0;
273 // loop over bins and...
274 for(size_t j = 0; j < nItems; j++) {
275
276 // define test statistic z = |mean - val|/sigma
277 zval = std::abs(mean - vec[j])/sigma;
278
279 // find out the critical value (tval) for the test statistic
280 tval = std::sqrt(nItems*(nItems-2)*(zval*zval)/((nItems-1)*(nItems-1)-nItems*(zval*zval)));
281
282 // determine the probability (two sided student's t pdf with N-2 d.o.f) of tval being
283 // far away (determined by significance level) from others so as to be considered an outlier
284 prob = 2*(1. - TMath::StudentI(tval,nItems-2));
285
286 // if p-value falls below significance level, this bin is an outlier
287 if(prob < signLevel) {
288 if(notEmpty) {
289 bool storeMe = true;
290 for(size_t k = 0; k < binIndices.size(); k++) {
291 if(binIndices[k] == j) { storeMe = false; break; }
292 } // for
293 if(storeMe) {
294 binIndices.push_back(j);
295 retval++;
296 }
297 } else {
298 binIndices.push_back(j);
299 retval ++;
300 }
301 } // if prob
302 } // for j
303
304 // return #of outliers found
305 return retval;
306
307}
308
309 void
311 out << "-------------- Grubbs' Test for Outliers ----------------------------------------------------" << std::endl;
312 out << "Identify outlier bins in a 1d histogram. Given N = {n1, n2, n3...} where n1 = frequency in 1st bin etc..\n";
313 out << "Define test statistic: Z_i = |n_i - mean| / sigma, where mean = sum(n_i)/sizeof(N) and sigma = standard deviation of N\n";
314 out << "For each Z_i calculate the critical value (tval) of the test-statistic and find out its corresponding p-value. if p-value\n";
315 out << "p-value is calculated using \"two-sided t-distribution\" pdf with sizeof(N)-2 degrees of freedom.\n";
316 out << "If p-value <= significance, reject null hypothesis (H_0 = no outliers).\n";
317 out << "----------------------------------------------------------------------------------------------" << std::endl;
318 out << "Parameters : IsMinStatOutlier, Min, Max, MinStatPerBin, Significance, SkipBinModulo\n";
319 out << "----------------------------------------------------------------------------------------------" << std::endl;
320 out << "IsMinStatOutlier: If bin contents are less than MinStatPerBin, is it an outlier? [type = bool] [values = 0, 1] [ default: 1]\n";
321 out << "Min, Max : Process bin numbers corresponding to Min, Max (on x-axis). [type = float] [default: all bins]\n";
322 out << "MinStatPerBin : Min number of entries per bin in order to be considered for testing. [type = integer] [values >= 0] [default = 9]\n";
323 out << "Significance : Null hypothesis (no outliers) is rejected if critical value > significance [type = float] [range = [0.04, 0.99], default = 0.04]\n";
324 out << "SkipBinModulo : Skip every N'th bin. [type = integer] [values = [0, #of bins in 1d hist], default: skip none]\n";
325 out << "----------------------------------------------------------------------------------------------" << std::endl;
326}
std::vector< size_t > vec
dqm_algorithms::GrubbsOutlierTest GrubbsTest("Grubbs")
outlier-bins in 1d histogram
void diff(const Jet &rJet1, const Jet &rJet2, std::map< std::string, double > varDiff)
Difference between jets - Non-Class function required by trigger.
Definition Jet.cxx:631
std::string histogram
Definition chains.cxx:52
void mean(std::vector< double > &bins, std::vector< double > &values, const std::vector< std::string > &files, const std::string &histname, const std::string &tplotname, const std::string &label="")
double xmax
Definition listroot.cxx:61
double xmin
Definition listroot.cxx:60
double GetFirstFromMap(const std::string &paramName, const std::map< std::string, double > &params)
const T & GetFromMap(const std::string &pname, const std::map< std::string, T > &params)
void printDescription(std::ostream &out)
dqm_core::Result * execute(const std::string &, const TObject &, const dqm_core::AlgorithmConfig &)
GrubbsOutlierTest(const std::string &name)
int GrubbsTest(std::vector< float > &vec, std::vector< size_t > &binIndices, float signLevel)
MsgStream & msg
Definition testRead.cxx:32
#define IsA
Declare the TObject style functions.