ATLAS Offline Software
Loading...
Searching...
No Matches
OutlierAndFlatnessTest.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3*/
4
5// Checks and removes outliers from TH1-Histos and tests the remaining distribution for flatness. Originally by Steffen Schaepe.
6
9
10#include "dqm_core/AlgorithmManager.h"
11#include "dqm_core/exceptions.h"
12#include "dqm_core/AlgorithmConfig.h"
13#include "dqm_core/Result.h"
14
15#include "TH1.h"
16#include "TF1.h"
17#include "TH1C.h"
18#include "TH2C.h"
19#include "TClass.h"
20
21#include <memory>
22#include <cmath>
23
24namespace
25{
26dqm_algorithms::OutlierAndFlatnessTest OutlierAndFlatnessTest("OutlierAndFlatnessTest");
27}
28
30{
31 dqm_core::AlgorithmManager::instance().registerAlgorithm(name, this);
32}
33
38
39dqm_core::Result *dqm_algorithms::OutlierAndFlatnessTest::execute(const std::string &name, const TObject &object, const dqm_core::AlgorithmConfig &config)
40{
41 if (!object.IsA()->InheritsFrom("TH1")) {
42 throw dqm_core::BadConfig(ERS_HERE, name, "does not inherit from TH1");
43 }
44 std::unique_ptr<TH1> histogram(static_cast<TH1 *>(object.Clone())); // we just checked that this is really a TH1, so we can safely type-cast the pointer
45 if (histogram->GetDimension() > 2) {
46 throw dqm_core::BadConfig(ERS_HERE, name, "dimension > 2");
47 }
48 const bool isOneDimensional = (histogram->GetDimension() == 1); // can only be 1 or 2 after the previous check
49
50 // check for minimum statistics
51
52 const double minstat = tools::GetFirstFromMap("MinStat", config.getParameters(), 1);
53 if (histogram->GetEntries() < minstat) {
54 dqm_core::Result *result = new dqm_core::Result(dqm_core::Result::Undefined);
55 result->tags_["InsufficientEntries"] = histogram->GetEntries();
56 return result;
57 }
58
59 // check if histogram conatins only zeros
60
61 const bool checkZeroContent = static_cast<bool>(tools::GetFirstFromMap("CheckZeroContent", config.getParameters(), 1));
62 if (checkZeroContent && (histogram->Integral() == 0) && (histogram->GetEntries() > 0)) {
63 ERS_DEBUG(1, "Histogram " << histogram->GetName() << " is filled with zeros!");
64 dqm_core::Result* result = new dqm_core::Result(dqm_core::Result::Red);
65 result->tags_["Integral"] = histogram->Integral();
66 return result;
67 }
68
69 // get configuration parameters and set defaults
70
71 const bool checkFlatness = static_cast<bool>(tools::GetFirstFromMap("CheckFlatness", config.getParameters(), 1)); // default: true
72 const bool fitCircular = static_cast<bool>(tools::GetFirstFromMap("FitCircular", config.getParameters(), 0)); // default: false
73 const bool ignore0 = static_cast<bool>(tools::GetFirstFromMap("Ignore0", config.getParameters(), 1)); // default: true
74
75 const bool checkSigmaDev = static_cast<bool>(tools::GetFirstFromMap("CheckSigmaDev", config.getParameters(), 1)); // default: true
76 const double sigmaDev = checkSigmaDev ? tools::GetFirstFromMap("SigmaDev", config.getParameters()) : 0; // only applicable if checkSigmaDev, then mandatory
77 const bool dontCountSigmaOutliers = checkSigmaDev ? static_cast<bool>(tools::GetFirstFromMap("DontCountSigmaOutliers", config.getParameters(), 0)) : false; // only applicable if checkSigmaDev, default: false
78
79 const bool checkAbsDev = static_cast<bool>(tools::GetFirstFromMap("CheckAbsDev", config.getParameters(), 0)); // default: false
80 const double absDev = checkAbsDev ? tools::GetFirstFromMap("AbsDev", config.getParameters()) : 0; // only applicable if checkAbsDev, then mandatory
81
82 const bool checkRelDev = static_cast<bool>(tools::GetFirstFromMap("CheckRelDev", config.getParameters(), 0)); // default: false
83 const double relDev = checkRelDev ? tools::GetFirstFromMap("RelDev", config.getParameters()) : 0; // only applicable if checkRelDev, then mandatory
84
85 const bool checkAbsLimit = static_cast<bool>(tools::GetFirstFromMap("CheckAbsLimit", config.getParameters(), 0)); // default: false
86 const double absLimit = checkAbsLimit ? tools::GetFirstFromMap("AbsLimit", config.getParameters()) : 0; // only applicable if checkAbsLimit, then mandatory
87
88 const bool normRef = static_cast<bool>(tools::GetFirstFromMap("DivideByReference", config.getParameters(), 0)); // default: false
89 const bool diffRef = static_cast<bool>(tools::GetFirstFromMap("SubtractReference", config.getParameters(), 0)); // default: false
90
91 const bool subtractBinError = static_cast<bool>(tools::GetFirstFromMap("SubtractBinError", config.getParameters(), 0)); // default: false
92
93 const bool storeOutlierBins = static_cast<bool>(tools::GetFirstFromMap("StoreOutlierBins", config.getParameters(), 0)); // default: false
94 // if the checks are to be done against a reference take care of this now
95
96 TH1 *refhist = 0;
97 if (diffRef || normRef) {
98 if (diffRef && normRef) {
99 throw dqm_core::BadConfig(ERS_HERE, name, "Can either divide or subtract reference, not both");
100 }
101 try {
102 refhist = static_cast<TH1 *>(config.getReference());
103 } catch(dqm_core::Exception &ex) {
104 throw dqm_core::BadRefHist(ERS_HERE, name, "Could not retrieve reference");
105 }
106 if (histogram->GetDimension() != refhist->GetDimension()) {
107 throw dqm_core::BadRefHist(ERS_HERE, name, "Wrong dimension of reference");
108 }
109 if ((histogram->GetNbinsX() != refhist->GetNbinsX()) || (histogram->GetNbinsY() != refhist->GetNbinsY())) {
110 throw dqm_core::BadRefHist(ERS_HERE, name, "Non-matching number of bins of reference");
111 }
112
113 // scale reference to the same integral as the test hist
114 refhist->Sumw2();
115 refhist->Scale(histogram->Integral() / refhist->Integral());
116
117 if (diffRef) histogram->Add(refhist, -1.0);
118 if (normRef) histogram->Divide(refhist);
119 }
120
121 const std::vector<int> range = dqm_algorithms::tools::GetBinRange(histogram.get(), config.getParameters());
122 const int xminBin = range[0];
123 const int xmaxBin = range[1];
124 const int yminBin = range[2];
125 const int ymaxBin = range[3];
126
127 int zeroBins = 0;
128 for (int i = xminBin; i <= xmaxBin; ++i) {
129 for (int j = yminBin; j <= ymaxBin; ++j) {
130 if (histogram->GetBinContent(i, j) == 0) ++zeroBins; // count empty or zero bins
131 }
132 }
133
134 // compute outlier bins by using a simplified Grubb's test without cutting on a t-distribution but by giving an external threshold defining how many standard deviations a value has to differ from the mean to be classified as an outlier or by giving an absolute threshold for classifying values as outliers
135
136 int totalBadBins = 0;
137 int totalUncountedBadBins = 0;
138 double mean = 0;
139 double stddev = 0;
140 int currentIteration = 0;
141 const int maxIterations = 1000;
142
143 const int nBinsX = histogram->GetNbinsX();
144 const int nBinsY = histogram->GetNbinsY();
145 std::unique_ptr<TH1> knownOutliers(isOneDimensional ? // keep a record of known outliers (histogram is only used to set boolean flags)
146 static_cast<TH1 *>(new TH1C("knownOutliers", "knownOutliers", nBinsX, 0, nBinsX)) : // save the space for the additional frame of underflow/overflow bins
147 static_cast<TH1 *>(new TH2C("knownOutliers", "knownOutliers", nBinsX, 0, nBinsX, nBinsY, 0, nBinsY))); // ... but not sure if the savings make up for ugly code
148
149 while (++currentIteration < maxIterations) { // we break earlier if no new outliers are found
150 int newBadBins = 0;
151 int newUncountedBadBins = 0;
152
153 // compute the arithmetic mean and standard deviation
154 int nBins = 0;
155 double sum = 0;
156 double sumsqr = 0;
157 for (int i = xminBin; i <= xmaxBin; ++i) {
158 for (int j = yminBin; j <= ymaxBin; ++j) {
159 const bool isOutlier = knownOutliers->GetBinContent(i, j);
160 if (isOutlier) continue; // skip known outliers
161 const double binContent = histogram->GetBinContent(i, j);
162 if ((binContent == 0) && ignore0) continue; // skip zero bins if requested
163
164 ++nBins;
165 sum += binContent;
166 sumsqr += binContent * binContent;
167 }
168 }
169 mean = nBins ? (sum / nBins) : 0; // avoid division by zero
170 stddev = nBins ? std::sqrt(sumsqr / nBins - mean * mean) : 0; // one-pass: sigma^2 = < X^2 > - < X >^2
171
172 // check how many bins are out of N sigma range (SigmaDev) or out of the absolute dev (AbsDev) or over absolute limit (AbsLimit), count them as (uncounted) outliers and remove them from the histogram
173
174 if (checkRelDev || checkAbsDev || checkAbsLimit || checkSigmaDev) {
175 for (int i = xminBin; i <= xmaxBin; ++i) {
176 for (int j = yminBin; j <= ymaxBin; ++j) {
177 const bool isOutlier = knownOutliers->GetBinContent(i, j);
178 if (isOutlier) continue; // skip known outliers
179 const double binContent = histogram->GetBinContent(i, j);
180 const double binError = subtractBinError ? histogram->GetBinError(i, j) : 0.;
181 if ((binContent == 0) && ignore0) continue; // skip zero bins if requested
182
183 bool foundOutlier = false;
184 if (checkAbsLimit && !foundOutlier) {
185 if (binContent - binError > absLimit) {
186 ++newBadBins;
187 foundOutlier = true;
188 }
189 }
190 if (checkAbsDev && !foundOutlier) {
191 if (std::fabs(binContent - mean) - binError > absDev) {
192 ++newBadBins;
193 foundOutlier = true;
194 }
195 }
196 if (checkRelDev && !foundOutlier) {
197 if (std::fabs(binContent - mean) - binError > (relDev * mean)) {
198 ++newBadBins;
199 foundOutlier = true;
200 }
201 }
202 if (checkSigmaDev && !foundOutlier) {
203 if (std::fabs(binContent - mean) - binError > (sigmaDev * stddev)) {
204 if (dontCountSigmaOutliers) ++newUncountedBadBins;
205 else ++newBadBins;
206 foundOutlier = true;
207 }
208 }
209 if (foundOutlier) {
210 histogram->SetBinContent(i, j, 0); // ignore this bin in the subsequent fit (below)
211 histogram->SetBinError(i, j, 0); // ignore this bin in the subsequent fit (below)
212 knownOutliers->SetBinContent(i, j, true); // ingnore this bin in the next iteration of this loop
213 }
214 } // for (int j)
215 } // for (int i)
216 } // if (check...)
217 if ((newBadBins == 0) && (newUncountedBadBins == 0)) break; // we're done
218 totalUncountedBadBins += newUncountedBadBins;
219 totalBadBins += newBadBins;
220 } // while (protection against endless looping)
221 if (currentIteration == maxIterations) {
222 throw dqm_core::BadConfig(ERS_HERE, name, "maximum number of iterations reached while searching for outliers"); // should never happen
223 }
224 if (totalUncountedBadBins > 0) {
225 ERS_DEBUG(1, "Histogram " << histogram->GetName() << " has " << totalUncountedBadBins << " bins exceeding sigma limit which are NOT counted as outliers, but which are omitted for calculating mean and stddev");
226 }
227
228 // prepare map for results and commit outlier specific parts
229 std::map<std::string, double> results; // you can set flagging thresholds on any of these result tags
230 results["Number_of_outlier_bins"] = totalBadBins;
231 results["Corrected_mean"] = mean;
232 results["Corrected_standard_deviation"] = stddev;
233 results["Number_of_bins_equal_zero"] = zeroBins;
234
235 // store all x values of the outlier bins
236 if (storeOutlierBins) {
237 int outlierIndex = 0;
238 for (int i = xminBin; i <= xmaxBin; ++i) {
239 for (int j = yminBin; j <= ymaxBin; ++j) {
240 if (knownOutliers->GetBinContent(i, j)) {
241 outlierIndex++;
242 results[std::string("Outlier_bin_")+std::to_string(outlierIndex)] = knownOutliers->GetXaxis()->GetBinCenter(i);
243 }
244 }
245 }
246 }
247
248 if (checkFlatness) {
249 if (!isOneDimensional) {
250 throw dqm_core::BadConfig(ERS_HERE, name, "cannot check 2D histograms for flatness, please set CheckFlatness = 0");
251 }
252 std::unique_ptr<TF1> occupancyFit{};
253
254 const double xminAxis = histogram->GetXaxis()->GetBinLowEdge(xminBin);
255 const double xmaxAxis = histogram->GetXaxis()->GetBinUpEdge(xmaxBin);
256 const double width = (xmaxAxis - xminAxis);
257 const double center = (xmaxAxis + xminAxis) / 2;
258
259 if (fitCircular) {
260 occupancyFit = std::make_unique<TF1>("occupancyFit", "[0]+[1]*sin([3]*(x-[4]-[5]))+[2]*cos(2*[3]*(x-[4]-[5]))"); // poor man's Fourier analysis: first-order sin() plus second-order cos()
261 occupancyFit->SetParName(0, "constant");
262 occupancyFit->SetParName(1, "asym");
263 occupancyFit->SetParName(2, "sym");
264 occupancyFit->SetParName(3, "scale");
265 occupancyFit->SetParName(4, "offset");
266 occupancyFit->SetParName(5, "phaseoffset");
267 occupancyFit->SetParLimits(5, -0.5 * width, +0.5 * width);
268 occupancyFit->FixParameter(3, 2 * M_PI / width);
269 occupancyFit->FixParameter(4, center - width / 2);
270 } else {
271 occupancyFit = std::make_unique<TF1>("occupancyFit", "[0]+[1]*(x-[3])+[2]*(x-[3])*(x-[3])"); // parabola: second-order polynomial
272 occupancyFit->SetParName(0, "constant");
273 occupancyFit->SetParName(1, "asym");
274 occupancyFit->SetParName(2, "sym");
275 occupancyFit->SetParName(3, "offset");
276 // shift zero for fit to the center of the distribution
277 occupancyFit->FixParameter(3, center);
278 }
279
280 occupancyFit->SetRange(xminAxis, xmaxAxis);
281 histogram->Fit(occupancyFit.get(), "QNR");
282
283 double maxBin = 0;
284 double phaseOffset = 0;
285 if (fitCircular) {
286 // compute maximum deviation from sine and cosine distribution at the point of maximum deviation
287 phaseOffset = occupancyFit->GetParameter("phaseoffset");
288 maxBin = center + width / 4 + phaseOffset;
289 } else {
290 // compute maximum deviation from quadratic and linear contibutrions in the fit at the edges of the distribution
291 maxBin = center + width / 2;
292 }
293
294 const double constant = occupancyFit->GetParameter("constant");
295
296 double asym = 0;
297 double sym = 0;
298
299 if (diffRef) {
300 asym = occupancyFit->GetParameter("asym");
301 } else {
302 const double parBuff = occupancyFit->GetParameter("sym");
303 occupancyFit->SetParameter("sym", 0);
304 asym = std::fabs(occupancyFit->Eval(maxBin) - constant);
305 occupancyFit->SetParameter("sym", parBuff);
306 asym = constant ? std::fabs(asym / constant) : 0;
307 }
308
309 if (diffRef) {
310 sym = occupancyFit->GetParameter("sym");
311 } else {
312 const double parBuff = occupancyFit->GetParameter("asym");
313 occupancyFit->SetParameter("asym", 0);
314 sym = std::fabs(occupancyFit->Eval(maxBin) - constant);
315 occupancyFit->SetParameter("asym", parBuff);
316 sym = constant ? std::fabs(sym / constant) : 0;
317 if (!fitCircular) {
318 sym /= 2;
319 }
320 }
321
322 const int ndf = occupancyFit->GetNDF();
323 const double chisquare = occupancyFit->GetChisquare();
324 const double chisquareNDF = (ndf > 0) ? (chisquare / ndf) : 0; // prevent fits with no degrees of freedom from returning NaN or INF.
325
326 // commit flatness specific results to result map
327 results["Max_rel_sym_deviation"] = sym;
328 results["Max_rel_asym_deviation"] = asym;
329 results["Chisquare_ndf"] = chisquareNDF;
330 results["Phase_offset"] = phaseOffset;
331
332 } // if (checkFlatness)
333
334 // compare with given thresholds and compute DQ status
335 return tools::MakeComparisons(results, config.getGreenThresholds(), config.getRedThresholds()); // this utility function handles all threshold comparisons
336}
337
339{
340 out << m_name << ": Checks TH1-inherited histograms for bins which lie either Nsigma or AbsDev away from the mean (by default excludes bins with zero entries) or which exceed a given limit and removes them from the distribution.\n"
341 "Remaining (corrected for outliers) distribution is fitted with either a quadratic or a sinusoidal function (option FitCircular) and symmetric and asymmetric deviations from a flat distribution are computed by evaluating the quadratic and linear fit contributions respectively.\n"
342 "Parameters:\n"
343 "\tFitCircular:\tFit sinoidal function instead of quadratic. This is usefull for distributions expected to show circular behaviour (e.g. phi distributions) (default 0).\n"
344 "\tCheckFlatness:\tThis switch can be used to disable the flatness test and just check for outliers (default 1).\n"
345 "\tCheckAbsDev:\tCheck for absolute deviation from mean (default 0).\n"
346 "\tCheckRelDev:\tCheck for relative deviation from mean (default 0).\n"
347 "\tCheckAbsLimit:\tCheck for values exceeding given absolute limit (default 0).\n"
348 "\tCheckSigmaDev:\tCheck for deviation in units of standard deviations from mean (default 1).\n"
349 "\tDontCountSigmaDev:\tBins deviating by a given number of standard deviations are removed from the distribution for mean and flatness computation, but not counted as outliers (default 0).\n"
350 "\tCheckZeroContent:\tIf set to 1 the algorithm will check whether the histogram is filled with only zeros and return red in this case. WARNING this uses the TH1::Integral function which seems to be problematic in Lightweigth Histograms. (default 1).\n"
351 "\tMinStat:\tMinimum Statistics for the histogram to be checked (default 1).\n"
352 "\txmin and/or xmax:\tRestrict all counting and fitting to this x-axis interval (default: full axis range).\n"
353 "\tIgnore0:\tBins with 0 content are ignored for outlier computation (ignored for flatness-test anyway) (default 1).\n"
354 "\tSigmaDev:\tNumber of Standard Deviations a single bin has to differ from the mean of the distribution to be classified as outlier. Has to be given if \"CheckSigmaDev\" is set to 1. (default 5)\n"
355 "\tAbsDev:\tAbsolute value a single bin has to differ from the mean of the distribution to be classified as outlier. Has to be given if \"CheckAbsDev\" is set to 1.\n"
356 "\tRelDev:\tFraction of the mean value a single bin has to differ from the mean of the distribution to be classified as outlier (i.e. a bin is classified as outlier if it deviates more than |AbsDev*mean| from the mean of the distribution). Has to be given if \"CheckRelDev\" is set to 1.\n"
357 "\tAbsLimit:\tAbsolute limit a single bin has to exceed to be classified as outlier. Has to be given if \"CheckAbsLimit\" is set to 1.\n"
358 "\tDivideByReference:\tDivide test histogram by reference histogram and perform checks on the resulting ratio. (default 0)\n"
359 "\tSubtractReference:\tSubtract reference histogram from test histogram and perform checks on the resulting difference. Carefull! This yields pretty unstable results for the flatness tests! (default 0)\n"
360 "\tSubtractBinError:\tSubtract the absolute bin error from the difference between bin content and mean when checking for outliers. (default 0)\n"
361 "\tStoreOutlierBins:\tStore information on outlier bins in the output. (default 0)\n"
362 "Thresholds:\n"
363 "\tNumber_of_outlier_bins:\tNumber of bins classified as outliers using the given thresholds.\n"
364 "\tCorrected_mean:\tMean of distribution ignoring outliers.\n"
365 "\tCorrected_standard_deviation:\tStandard Deviation of distribution ignoring outliers.\n"
366 "\tMax_rel_sym_deviation:\tMaximum relative symmetric deviation from a flat distribution.\n"
367 "\tMax_rel_asym_deviation:\tMaximum relative asymmetric deviation from a flat distribution.\n"
368 "\tNumber_of_bins_equal_zero:\tNumber of Bins with zero content." << std::endl;
369}
#define M_PI
file declares the dqm_algorithms::OutlierAndFlatnessTest class.
const double width
std::string histogram
Definition chains.cxx:52
dqm_core::Result * execute(const std::string &, const TObject &, const dqm_core::AlgorithmConfig &)
void mean(std::vector< double > &bins, std::vector< double > &values, const std::vector< std::string > &files, const std::string &histname, const std::string &tplotname, const std::string &label="")
std::vector< int > GetBinRange(const TH1 *histogram, const std::map< std::string, double > &params)
double GetFirstFromMap(const std::string &paramName, const std::map< std::string, double > &params)
dqm_core::Result * MakeComparisons(const std::map< std::string, double > &algparams, const std::map< std::string, double > &gthreshold, const std::map< std::string, double > &rthreshold)
#define IsA
Declare the TObject style functions.