ATLAS Offline Software
OutlierAndFlatnessTest.cxx
Go to the documentation of this file.
1 /*
2  Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3 */
4 
5 // Checks and removes outliers from TH1-Histos and tests the remaining distribution for flatness. Originally by Steffen Schaepe.
6 
9 
10 #include "dqm_core/AlgorithmManager.h"
11 #include "dqm_core/exceptions.h"
12 #include "dqm_core/AlgorithmConfig.h"
13 #include "dqm_core/Result.h"
14 
15 #include "TH1.h"
16 #include "TF1.h"
17 #include "TH1C.h"
18 #include "TH2C.h"
19 #include "TClass.h"
20 
21 #include <memory>
22 #include <cmath>
23 
24 namespace
25 {
26 dqm_algorithms::OutlierAndFlatnessTest OutlierAndFlatnessTest("OutlierAndFlatnessTest");
27 }
28 
30 {
31  dqm_core::AlgorithmManager::instance().registerAlgorithm(name, this);
32 }
33 
35 {
36  return new OutlierAndFlatnessTest(m_name);
37 }
38 
39 dqm_core::Result *dqm_algorithms::OutlierAndFlatnessTest::execute(const std::string &name, const TObject &object, const dqm_core::AlgorithmConfig &config)
40 {
41  if (!object.IsA()->InheritsFrom("TH1")) {
42  throw dqm_core::BadConfig(ERS_HERE, name, "does not inherit from TH1");
43  }
44  std::unique_ptr<TH1> histogram(static_cast<TH1 *>(object.Clone())); // we just checked that this is really a TH1, so we can safely type-cast the pointer
45  if (histogram->GetDimension() > 2) {
46  throw dqm_core::BadConfig(ERS_HERE, name, "dimension > 2");
47  }
48  const bool isOneDimensional = (histogram->GetDimension() == 1); // can only be 1 or 2 after the previous check
49 
50  // check for minimum statistics
51 
52  const double minstat = tools::GetFirstFromMap("MinStat", config.getParameters(), 1);
53  if (histogram->GetEntries() < minstat) {
55  result->tags_["InsufficientEntries"] = histogram->GetEntries();
56  return result;
57  }
58 
59  // check if histogram conatins only zeros
60 
61  const bool checkZeroContent = static_cast<bool>(tools::GetFirstFromMap("CheckZeroContent", config.getParameters(), 1));
62  if (checkZeroContent && (histogram->Integral() == 0) && (histogram->GetEntries() > 0)) {
63  ERS_DEBUG(1, "Histogram " << histogram->GetName() << " is filled with zeros!");
65  result->tags_["Integral"] = histogram->Integral();
66  return result;
67  }
68 
69  // get configuration parameters and set defaults
70 
71  const bool checkFlatness = static_cast<bool>(tools::GetFirstFromMap("CheckFlatness", config.getParameters(), 1)); // default: true
72  const bool fitCircular = static_cast<bool>(tools::GetFirstFromMap("FitCircular", config.getParameters(), 0)); // default: false
73  const bool ignore0 = static_cast<bool>(tools::GetFirstFromMap("Ignore0", config.getParameters(), 1)); // default: true
74 
75  const bool checkSigmaDev = static_cast<bool>(tools::GetFirstFromMap("CheckSigmaDev", config.getParameters(), 1)); // default: true
76  const double sigmaDev = checkSigmaDev ? tools::GetFirstFromMap("SigmaDev", config.getParameters()) : 0; // only applicable if checkSigmaDev, then mandatory
77  const bool dontCountSigmaOutliers = checkSigmaDev ? static_cast<bool>(tools::GetFirstFromMap("DontCountSigmaOutliers", config.getParameters(), 0)) : false; // only applicable if checkSigmaDev, default: false
78 
79  const bool checkAbsDev = static_cast<bool>(tools::GetFirstFromMap("CheckAbsDev", config.getParameters(), 0)); // default: false
80  const double absDev = checkAbsDev ? tools::GetFirstFromMap("AbsDev", config.getParameters()) : 0; // only applicable if checkAbsDev, then mandatory
81 
82  const bool checkRelDev = static_cast<bool>(tools::GetFirstFromMap("CheckRelDev", config.getParameters(), 0)); // default: false
83  const double relDev = checkRelDev ? tools::GetFirstFromMap("RelDev", config.getParameters()) : 0; // only applicable if checkRelDev, then mandatory
84 
85  const bool checkAbsLimit = static_cast<bool>(tools::GetFirstFromMap("CheckAbsLimit", config.getParameters(), 0)); // default: false
86  const double absLimit = checkAbsLimit ? tools::GetFirstFromMap("AbsLimit", config.getParameters()) : 0; // only applicable if checkAbsLimit, then mandatory
87 
88  const bool normRef = static_cast<bool>(tools::GetFirstFromMap("DivideByReference", config.getParameters(), 0)); // default: false
89  const bool diffRef = static_cast<bool>(tools::GetFirstFromMap("SubtractReference", config.getParameters(), 0)); // default: false
90 
91  const bool subtractBinError = static_cast<bool>(tools::GetFirstFromMap("SubtractBinError", config.getParameters(), 0)); // default: false
92 
93  const bool storeOutlierBins = static_cast<bool>(tools::GetFirstFromMap("StoreOutlierBins", config.getParameters(), 0)); // default: false
94  // if the checks are to be done against a reference take care of this now
95 
96  TH1 *refhist = 0;
97  if (diffRef || normRef) {
98  if (diffRef && normRef) {
99  throw dqm_core::BadConfig(ERS_HERE, name, "Can either divide or subtract reference, not both");
100  }
101  try {
102  refhist = static_cast<TH1 *>(config.getReference());
103  } catch(dqm_core::Exception &ex) {
104  throw dqm_core::BadRefHist(ERS_HERE, name, "Could not retrieve reference");
105  }
106  if (histogram->GetDimension() != refhist->GetDimension()) {
107  throw dqm_core::BadRefHist(ERS_HERE, name, "Wrong dimension of reference");
108  }
109  if ((histogram->GetNbinsX() != refhist->GetNbinsX()) || (histogram->GetNbinsY() != refhist->GetNbinsY())) {
110  throw dqm_core::BadRefHist(ERS_HERE, name, "Non-matching number of bins of reference");
111  }
112 
113  // scale reference to the same integral as the test hist
114  refhist->Sumw2();
115  refhist->Scale(histogram->Integral() / refhist->Integral());
116 
117  if (diffRef) histogram->Add(refhist, -1.0);
118  if (normRef) histogram->Divide(refhist);
119  }
120 
121  const std::vector<int> range = dqm_algorithms::tools::GetBinRange(histogram.get(), config.getParameters());
122  const int xminBin = range[0];
123  const int xmaxBin = range[1];
124  const int yminBin = range[2];
125  const int ymaxBin = range[3];
126 
127  int zeroBins = 0;
128  for (int i = xminBin; i <= xmaxBin; ++i) {
129  for (int j = yminBin; j <= ymaxBin; ++j) {
130  if (histogram->GetBinContent(i, j) == 0) ++zeroBins; // count empty or zero bins
131  }
132  }
133 
134  // compute outlier bins by using a simplified Grubb's test without cutting on a t-distribution but by giving an external threshold defining how many standard deviations a value has to differ from the mean to be classified as an outlier or by giving an absolute threshold for classifying values as outliers
135 
136  int totalBadBins = 0;
137  int totalUncountedBadBins = 0;
138  double mean = 0;
139  double stddev = 0;
140  int currentIteration = 0;
141  const int maxIterations = 1000;
142 
143  const int nBinsX = histogram->GetNbinsX();
144  const int nBinsY = histogram->GetNbinsY();
145  std::unique_ptr<TH1> knownOutliers(isOneDimensional ? // keep a record of known outliers (histogram is only used to set boolean flags)
146  static_cast<TH1 *>(new TH1C("knownOutliers", "knownOutliers", nBinsX, 0, nBinsX)) : // save the space for the additional frame of underflow/overflow bins
147  static_cast<TH1 *>(new TH2C("knownOutliers", "knownOutliers", nBinsX, 0, nBinsX, nBinsY, 0, nBinsY))); // ... but not sure if the savings make up for ugly code
148 
149  while (++currentIteration < maxIterations) { // we break earlier if no new outliers are found
150  int newBadBins = 0;
151  int newUncountedBadBins = 0;
152 
153  // compute the arithmetic mean and standard deviation
154  int nBins = 0;
155  double sum = 0;
156  double sumsqr = 0;
157  for (int i = xminBin; i <= xmaxBin; ++i) {
158  for (int j = yminBin; j <= ymaxBin; ++j) {
159  const bool isOutlier = knownOutliers->GetBinContent(i, j);
160  if (isOutlier) continue; // skip known outliers
161  const double binContent = histogram->GetBinContent(i, j);
162  if ((binContent == 0) && ignore0) continue; // skip zero bins if requested
163 
164  ++nBins;
165  sum += binContent;
166  sumsqr += binContent * binContent;
167  }
168  }
169  mean = nBins ? (sum / nBins) : 0; // avoid division by zero
170  stddev = nBins ? std::sqrt(sumsqr / nBins - mean * mean) : 0; // one-pass: sigma^2 = < X^2 > - < X >^2
171 
172  // check how many bins are out of N sigma range (SigmaDev) or out of the absolute dev (AbsDev) or over absolute limit (AbsLimit), count them as (uncounted) outliers and remove them from the histogram
173 
174  if (checkRelDev || checkAbsDev || checkAbsLimit || checkSigmaDev) {
175  for (int i = xminBin; i <= xmaxBin; ++i) {
176  for (int j = yminBin; j <= ymaxBin; ++j) {
177  const bool isOutlier = knownOutliers->GetBinContent(i, j);
178  if (isOutlier) continue; // skip known outliers
179  const double binContent = histogram->GetBinContent(i, j);
180  const double binError = subtractBinError ? histogram->GetBinError(i, j) : 0.;
181  if ((binContent == 0) && ignore0) continue; // skip zero bins if requested
182 
183  bool foundOutlier = false;
184  if (checkAbsLimit && !foundOutlier) {
185  if (binContent - binError > absLimit) {
186  ++newBadBins;
187  foundOutlier = true;
188  }
189  }
190  if (checkAbsDev && !foundOutlier) {
191  if (std::fabs(binContent - mean) - binError > absDev) {
192  ++newBadBins;
193  foundOutlier = true;
194  }
195  }
196  if (checkRelDev && !foundOutlier) {
197  if (std::fabs(binContent - mean) - binError > (relDev * mean)) {
198  ++newBadBins;
199  foundOutlier = true;
200  }
201  }
202  if (checkSigmaDev && !foundOutlier) {
203  if (std::fabs(binContent - mean) - binError > (sigmaDev * stddev)) {
204  if (dontCountSigmaOutliers) ++newUncountedBadBins;
205  else ++newBadBins;
206  foundOutlier = true;
207  }
208  }
209  if (foundOutlier) {
210  histogram->SetBinContent(i, j, 0); // ignore this bin in the subsequent fit (below)
211  histogram->SetBinError(i, j, 0); // ignore this bin in the subsequent fit (below)
212  knownOutliers->SetBinContent(i, j, true); // ingnore this bin in the next iteration of this loop
213  }
214  } // for (int j)
215  } // for (int i)
216  } // if (check...)
217  if ((newBadBins == 0) && (newUncountedBadBins == 0)) break; // we're done
218  totalUncountedBadBins += newUncountedBadBins;
219  totalBadBins += newBadBins;
220  } // while (protection against endless looping)
221  if (currentIteration == maxIterations) {
222  throw dqm_core::BadConfig(ERS_HERE, name, "maximum number of iterations reached while searching for outliers"); // should never happen
223  }
224  if (totalUncountedBadBins > 0) {
225  ERS_DEBUG(1, "Histogram " << histogram->GetName() << " has " << totalUncountedBadBins << " bins exceeding sigma limit which are NOT counted as outliers, but which are omitted for calculating mean and stddev");
226  }
227 
228  // prepare map for results and commit outlier specific parts
229  std::map<std::string, double> results; // you can set flagging thresholds on any of these result tags
230  results["Number_of_outlier_bins"] = totalBadBins;
231  results["Corrected_mean"] = mean;
232  results["Corrected_standard_deviation"] = stddev;
233  results["Number_of_bins_equal_zero"] = zeroBins;
234 
235  // store all x values of the outlier bins
236  if (storeOutlierBins) {
237  int outlierIndex = 0;
238  for (int i = xminBin; i <= xmaxBin; ++i) {
239  for (int j = yminBin; j <= ymaxBin; ++j) {
240  if (knownOutliers->GetBinContent(i, j)) {
241  outlierIndex++;
242  results[std::string("Outlier_bin_")+std::to_string(outlierIndex)] = knownOutliers->GetXaxis()->GetBinCenter(i);
243  }
244  }
245  }
246  }
247 
248  if (checkFlatness) {
249  if (!isOneDimensional) {
250  throw dqm_core::BadConfig(ERS_HERE, name, "cannot check 2D histograms for flatness, please set CheckFlatness = 0");
251  }
252  std::unique_ptr<TF1> occupancyFit{};
253 
254  const double xminAxis = histogram->GetXaxis()->GetBinLowEdge(xminBin);
255  const double xmaxAxis = histogram->GetXaxis()->GetBinUpEdge(xmaxBin);
256  const double width = (xmaxAxis - xminAxis);
257  const double center = (xmaxAxis + xminAxis) / 2;
258 
259  if (fitCircular) {
260  occupancyFit = std::make_unique<TF1>("occupancyFit", "[0]+[1]*sin([3]*(x-[4]-[5]))+[2]*cos(2*[3]*(x-[4]-[5]))"); // poor man's Fourier analysis: first-order sin() plus second-order cos()
261  occupancyFit->SetParName(0, "constant");
262  occupancyFit->SetParName(1, "asym");
263  occupancyFit->SetParName(2, "sym");
264  occupancyFit->SetParName(3, "scale");
265  occupancyFit->SetParName(4, "offset");
266  occupancyFit->SetParName(5, "phaseoffset");
267  occupancyFit->SetParLimits(5, -0.5 * width, +0.5 * width);
268  occupancyFit->FixParameter(3, 2 * M_PI / width);
269  occupancyFit->FixParameter(4, center - width / 2);
270  } else {
271  occupancyFit = std::make_unique<TF1>("occupancyFit", "[0]+[1]*(x-[3])+[2]*(x-[3])*(x-[3])"); // parabola: second-order polynomial
272  occupancyFit->SetParName(0, "constant");
273  occupancyFit->SetParName(1, "asym");
274  occupancyFit->SetParName(2, "sym");
275  occupancyFit->SetParName(3, "offset");
276  // shift zero for fit to the center of the distribution
277  occupancyFit->FixParameter(3, center);
278  }
279 
280  occupancyFit->SetRange(xminAxis, xmaxAxis);
281  histogram->Fit(occupancyFit.get(), "QNR");
282 
283  double maxBin = 0;
284  double phaseOffset = 0;
285  if (fitCircular) {
286  // compute maximum deviation from sine and cosine distribution at the point of maximum deviation
287  phaseOffset = occupancyFit->GetParameter("phaseoffset");
288  maxBin = center + width / 4 + phaseOffset;
289  } else {
290  // compute maximum deviation from quadratic and linear contibutrions in the fit at the edges of the distribution
291  maxBin = center + width / 2;
292  }
293 
294  const double constant = occupancyFit->GetParameter("constant");
295 
296  double asym = 0;
297  double sym = 0;
298 
299  if (diffRef) {
300  asym = occupancyFit->GetParameter("asym");
301  } else {
302  const double parBuff = occupancyFit->GetParameter("sym");
303  occupancyFit->SetParameter("sym", 0);
304  asym = std::fabs(occupancyFit->Eval(maxBin) - constant);
305  occupancyFit->SetParameter("sym", parBuff);
306  asym = constant ? std::fabs(asym / constant) : 0;
307  }
308 
309  if (diffRef) {
310  sym = occupancyFit->GetParameter("sym");
311  } else {
312  const double parBuff = occupancyFit->GetParameter("asym");
313  occupancyFit->SetParameter("asym", 0);
314  sym = std::fabs(occupancyFit->Eval(maxBin) - constant);
315  occupancyFit->SetParameter("asym", parBuff);
316  sym = constant ? std::fabs(sym / constant) : 0;
317  if (!fitCircular) {
318  sym /= 2;
319  }
320  }
321 
322  const int ndf = occupancyFit->GetNDF();
323  const double chisquare = occupancyFit->GetChisquare();
324  const double chisquareNDF = (ndf > 0) ? (chisquare / ndf) : 0; // prevent fits with no degrees of freedom from returning NaN or INF.
325 
326  // commit flatness specific results to result map
327  results["Max_rel_sym_deviation"] = sym;
328  results["Max_rel_asym_deviation"] = asym;
329  results["Chisquare_ndf"] = chisquareNDF;
330  results["Phase_offset"] = phaseOffset;
331 
332  } // if (checkFlatness)
333 
334  // compare with given thresholds and compute DQ status
335  return tools::MakeComparisons(results, config.getGreenThresholds(), config.getRedThresholds()); // this utility function handles all threshold comparisons
336 }
337 
339 {
340  out << m_name << ": Checks TH1-inherited histograms for bins which lie either Nsigma or AbsDev away from the mean (by default excludes bins with zero entries) or which exceed a given limit and removes them from the distribution.\n"
341  "Remaining (corrected for outliers) distribution is fitted with either a quadratic or a sinusoidal function (option FitCircular) and symmetric and asymmetric deviations from a flat distribution are computed by evaluating the quadratic and linear fit contributions respectively.\n"
342  "Parameters:\n"
343  "\tFitCircular:\tFit sinoidal function instead of quadratic. This is usefull for distributions expected to show circular behaviour (e.g. phi distributions) (default 0).\n"
344  "\tCheckFlatness:\tThis switch can be used to disable the flatness test and just check for outliers (default 1).\n"
345  "\tCheckAbsDev:\tCheck for absolute deviation from mean (default 0).\n"
346  "\tCheckRelDev:\tCheck for relative deviation from mean (default 0).\n"
347  "\tCheckAbsLimit:\tCheck for values exceeding given absolute limit (default 0).\n"
348  "\tCheckSigmaDev:\tCheck for deviation in units of standard deviations from mean (default 1).\n"
349  "\tDontCountSigmaDev:\tBins deviating by a given number of standard deviations are removed from the distribution for mean and flatness computation, but not counted as outliers (default 0).\n"
350  "\tCheckZeroContent:\tIf set to 1 the algorithm will check whether the histogram is filled with only zeros and return red in this case. WARNING this uses the TH1::Integral function which seems to be problematic in Lightweigth Histograms. (default 1).\n"
351  "\tMinStat:\tMinimum Statistics for the histogram to be checked (default 1).\n"
352  "\txmin and/or xmax:\tRestrict all counting and fitting to this x-axis interval (default: full axis range).\n"
353  "\tIgnore0:\tBins with 0 content are ignored for outlier computation (ignored for flatness-test anyway) (default 1).\n"
354  "\tSigmaDev:\tNumber of Standard Deviations a single bin has to differ from the mean of the distribution to be classified as outlier. Has to be given if \"CheckSigmaDev\" is set to 1. (default 5)\n"
355  "\tAbsDev:\tAbsolute value a single bin has to differ from the mean of the distribution to be classified as outlier. Has to be given if \"CheckAbsDev\" is set to 1.\n"
356  "\tRelDev:\tFraction of the mean value a single bin has to differ from the mean of the distribution to be classified as outlier (i.e. a bin is classified as outlier if it deviates more than |AbsDev*mean| from the mean of the distribution). Has to be given if \"CheckRelDev\" is set to 1.\n"
357  "\tAbsLimit:\tAbsolute limit a single bin has to exceed to be classified as outlier. Has to be given if \"CheckAbsLimit\" is set to 1.\n"
358  "\tDivideByReference:\tDivide test histogram by reference histogram and perform checks on the resulting ratio. (default 0)\n"
359  "\tSubtractReference:\tSubtract reference histogram from test histogram and perform checks on the resulting difference. Carefull! This yields pretty unstable results for the flatness tests! (default 0)\n"
360  "\tSubtractBinError:\tSubtract the absolute bin error from the difference between bin content and mean when checking for outliers. (default 0)\n"
361  "\tStoreOutlierBins:\tStore information on outlier bins in the output. (default 0)\n"
362  "Thresholds:\n"
363  "\tNumber_of_outlier_bins:\tNumber of bins classified as outliers using the given thresholds.\n"
364  "\tCorrected_mean:\tMean of distribution ignoring outliers.\n"
365  "\tCorrected_standard_deviation:\tStandard Deviation of distribution ignoring outliers.\n"
366  "\tMax_rel_sym_deviation:\tMaximum relative symmetric deviation from a flat distribution.\n"
367  "\tMax_rel_asym_deviation:\tMaximum relative asymmetric deviation from a flat distribution.\n"
368  "\tNumber_of_bins_equal_zero:\tNumber of Bins with zero content." << std::endl;
369 }
covarianceTool.ndf
ndf
Definition: covarianceTool.py:678
dqm_algorithms::tools::GetBinRange
std::vector< int > GetBinRange(const TH1 *histogram, const std::map< std::string, double > &params)
Definition: AlgorithmHelper.cxx:380
Undefined
@ Undefined
Definition: MaterialTypes.h:8
mean
void mean(std::vector< double > &bins, std::vector< double > &values, const std::vector< std::string > &files, const std::string &histname, const std::string &tplotname, const std::string &label="")
Definition: dependence.cxx:254
get_generator_info.result
result
Definition: get_generator_info.py:21
IsA
#define IsA
Declare the TObject style functions.
Definition: xAODTEventBranch.h:59
python.AthDsoLogger.out
out
Definition: AthDsoLogger.py:71
M_PI
#define M_PI
Definition: ActiveFraction.h:11
dqm_algorithms::OutlierAndFlatnessTest
Definition: OutlierAndFlatnessTest.h:18
dqm_algorithms::OutlierAndFlatnessTest::clone
OutlierAndFlatnessTest * clone()
Definition: OutlierAndFlatnessTest.cxx:34
config
Definition: PhysicsAnalysis/AnalysisCommon/AssociationUtils/python/config.py:1
instance
std::map< std::string, double > instance
Definition: Run_To_Get_Tags.h:8
TH1C
Definition: rootspy.cxx:352
convertTimingResiduals.sum
sum
Definition: convertTimingResiduals.py:55
dqm_algorithms::OutlierAndFlatnessTest::printDescription
void printDescription(std::ostream &out)
Definition: OutlierAndFlatnessTest.cxx:338
TH1::SetBinContent
void SetBinContent(int, double)
Definition: rootspy.cxx:301
lumiFormat.i
int i
Definition: lumiFormat.py:92
Result
ICscStripFitter::Result Result
Definition: CalibCscStripFitter.cxx:13
dqm_algorithms::tools::MakeComparisons
dqm_core::Result * MakeComparisons(const std::map< std::string, double > &algparams, const std::map< std::string, double > &gthreshold, const std::map< std::string, double > &rthreshold)
Definition: AlgorithmHelper.cxx:60
plotBeamSpotVxVal.range
range
Definition: plotBeamSpotVxVal.py:195
OutlierAndFlatnessTest.h
dumpNswErrorDb.constant
def constant
Definition: dumpNswErrorDb.py:22
python.handimod.Red
Red
Definition: handimod.py:551
dumpTgcDigiJitter.nBins
list nBins
Definition: dumpTgcDigiJitter.py:29
name
std::string name
Definition: Control/AthContainers/Root/debug.cxx:195
ActsTrk::to_string
std::string to_string(const DetectorType &type)
Definition: GeometryDefs.h:34
python.ami.results
def results
Definition: ami.py:386
TH1::GetBinContent
double GetBinContent(int) const
Definition: rootspy.cxx:298
TH1::Sumw2
void Sumw2()
Definition: rootspy.cxx:284
Base_Fragment.width
width
Definition: Sherpa_i/share/common/Base_Fragment.py:59
TH2C
Definition: rootspy.cxx:390
TH1
Definition: rootspy.cxx:268
AlgorithmHelper.h
dqm_algorithms::OutlierAndFlatnessTest::execute
dqm_core::Result * execute(const std::string &, const TObject &, const dqm_core::AlgorithmConfig &)
Definition: OutlierAndFlatnessTest.cxx:39
dqm_algorithms::tools::GetFirstFromMap
double GetFirstFromMap(const std::string &paramName, const std::map< std::string, double > &params)
Definition: AlgorithmHelper.cxx:339
histogram
std::string histogram
Definition: chains.cxx:52
dqm_algorithms::OutlierAndFlatnessTest::OutlierAndFlatnessTest
OutlierAndFlatnessTest(const std::string &name)
Definition: OutlierAndFlatnessTest.cxx:29