ATLAS Offline Software
Tools/PyUtils/bin/checkxAOD.py
#!/usr/bin/env python

# Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
#
#
# This is a modified version of PyUtils/bin/checkFile.py. It has been taught
# how to sum up the sizes of all the branches belonging to a single xAOD
# object/container.
#

__author__ = "Sebastien Binet <binet@cern.ch>, " \
             "Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>, " \
             "RD Schaffer <R.D.Schaffer@cern.ch>"

import sys
import os
import re
import operator

from optparse import OptionParser

if __name__ == "__main__":

    parser = OptionParser( usage = "usage: %prog [OPTION]... my.xAOD.file.pool.root" )
    p = parser.add_option
    p( "--si",
       action="store_true", dest = "siUnits",
       help = "print sizes in kB, i.e., in units of 1000 bytes"
              " (default: print sizes in KiB, i.e., in units of 1024 bytes)" )
    p( "-f",
       "--file",
       dest = "fileName",
       help = "The path to the POOL file to analyze" )
    p( "-c",
       "--csv",
       dest = "csvFileName",
       help = "Output CSV file name, to use with spreadsheets" )
    ( options, args ) = parser.parse_args()
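
    # Example invocation (the file names used here are only placeholders):
    #
    #   checkxAOD.py -c sizes.csv my.xAOD.file.pool.root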

    # Ideally the pattern lists ought to be defined such that the categories do NOT overlap.
    # Where they do overlap, the matching category defined last (i.e. lower in this dictionary) wins.
    # The following categories currently overlap:
    #   "PFO" and "Jet", "Muon" and "LRT",
    #   "Trig" and "LRT", "Trig" and "PFO",
    #   "Trig" and "caloringer", "InDet" and "LRT"
    # Set up the categorization matching strings:
    categoryStrings = {
        "MetaData" : ["^DataHeader", "(.*)_mems$", "(.*)_timings$", "^Token$", "^RawInfoSummaryForTag$", "^index_ref$"],
        "Trig" : ["^HLT", "^LVL1", "^L1", "^xTrig", "^Trig", "^CTP_Decision", "^TrigInDetTrackTruthMap", "^TrigNavigation", ".*TriggerTowers", "TileTTL1MBTS", "^TileL2Cnt", "RoIBResult", "^_TRIGGER", "^L1TopoRawData", "BunchConfKey"],
        "MET" : ["^MET", "^METMAP", "JEMEtSums"],
        "EvtId" : ["^ByteStreamEventInfo", "^EventInfo", "^McEventInfo", "^LumiBlockN", "^EventWeight", "^RunNumber", "^ConditionsRun", "^EventTime", "^BunchId", "^EventNumber", "^IsTestBeam", "^IsSimulation", "^IsCalibration", "^AvgIntPerXing", "^ActualIntPerXing", "^RandomNumber", "^McChannel"],
        "tau" : ["^Tau", "^DiTauJets"],
        "PFO" : ["(.*)EventShape$", "^AntiKt4EMPFlowJets", "^JetETMissChargedParticleFlowObjects", "^JetETMissNeutralParticleFlowObjects", "^CHS(.*)ChargedParticleFlowObjects", "^CHSNeutralParticleFlowObjects", "^JetETMissLCNeutralParticleFlowObjects", "^Global(.*)ParticleFlowObjects"],
        "egamma" : ["^GSF", "^ForwardElectron", "^egamma", "^Electron(?!.*Ring)", "^Photon(?!.*Ring)"],
        "Muon" : ["^Muon", "^TileMuObj", "^MS", "^SlowMuons", ".*Stau", "(.*)MuonTrackParticles$", "MUCTPI_RDO", "^RPC", "^TGC", "^MDT", "^CSC", "^sTGC", "^MM", ".*MuonMeasurements$", "^ExtrapolatedMuonTracks", "^CombinedMuonTracks", "^NCB_MuonSegments", "^UnAssocMuonSegments", "^EMEO_Muons", "^EMEO_MuonSpectrometerTrackParticles", "^xAODNSWSegments"],
        "BTag" : ["^BTag"],
        "HGTD" : ["^HGTD"],
        "InDet" : ["^InDet", "^PrimaryVertices", "^ComTime_TRT", "^Pixel", "^TRT", "^SCT", "^BCM", "^CTP", "^Tracks", "^ResolvedForwardTracks", "^SplitClusterAmbiguityMap", "^SoftBVrt", "^BLMHits", "^FourLeptonVertices"],
        "ITk" : ["^ITk"],
        "ACTS" : [".*Acts.*"],
        "Jet" : ["^CamKt", "^AntiKt", "^Jet(?!.*ParticleFlowObjects$)", "^LCOriginTopoClusters", "^EMOriginTopoClusters"],
        "CaloTopo" : ["CaloCalTopoCluster", "CaloCalFwdTopoTowers"],
        "Calo" : ["^LAr", "^AllCalo", "^AODCellContainer", "^MBTSContainer", "^CaloCompactCellContainer", "^CaloEntryLayer", "^E4prContainer", "^TileHitVec", "^TileCellVec", "^TileDigits", "^MBTSHits"],
        "Truth" : ["^Truth", "Truth$", "TruthMap$", "TruthCollection$", "^PRD_MultiTruth", "TracksTruth$", ".*TrackTruth$", "TrackTruthCollection", "^HardScatter", "BornLeptons", ".*ExitLayer$", ".*EntryLayer$"],
        "AFP" : ["^AFP"],
        "LRT" : ["^LRT", "(.*)LRT$", "(.*)LRTTrackParticles$", "(.*)LargeD0TrackParticles$"],
        "caloringer" : ["(.*)Ring"],
        "AnalysisElectrons" : ["^AnalysisElectrons"],
        "AnalysisTauJets" : ["^AnalysisTauJets"],
        "AnalysisPhotons" : ["^AnalysisPhotons"],
        "AnalysisMuons" : ["^AnalysisMuons"],
        "AnalysisJets" : ["^AnalysisJets"],
        "AnalysisHLT" : ["^AnalysisHLT"],
        "AnalysisTrigMatch" : ["^AnalysisTrigMatch"],
        "AnalysisLargeRJets" : ["^AnalysisLargeRJets"],
        "AnalysisSiHitElectrons" : ["^AnalysisSiHitElectrons"],
    }
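
    # Note for the category lookup further below: the keys above are iterated in
    # reverse insertion order and each pattern is applied with re.match(), i.e.
    # anchored at the start of the (Bkg_-stripped) container name, so for
    # overlapping categories the one defined later in this dictionary wins.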

    fileNames = []

    if len( args ) > 0:
        fileNames = [ arg for arg in args if arg[ 0 ] != "-" ]
        pass

    if options.fileName is None and len( fileNames ) == 0:
        str( parser.print_help() or "" )
        sys.exit( 1 )

    if options.fileName is not None:
        fileName = os.path.expandvars( os.path.expanduser( options.fileName ) )
        fileNames.append( fileName )
        pass

    fileNames = set( fileNames )

    # Check the consistency with the CSV output:
    if len( fileNames ) > 1 and options.csvFileName:
        print( "WARNING CSV output is only available when processing a single "
               "input file" )
        pass

    import PyUtils.PoolFile as PF
    if options.siUnits:
        PF.Units.kb = 1000.
        sizeUnits = "kB"
    else:
        sizeUnits = "KiB"

    # Pattern for static/dynamic auxiliary variable identification
    auxvarptn = re.compile( r"Aux(?:Dyn)?(?:\.|:)" )
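    # For example (illustrative branch names only), "ElectronsAuxDyn.pt" and
    # "ElectronsAux." would both be summed into the entry of the "Electrons"
    # interface container by the loop below.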
    # Loop over the specified file(s):
    for fileName in fileNames:

        # Open the file:
        poolFile = PF.PoolFile( fileName )

        # Loop over all the branches of the file, and sum up the information
        # about them in a smart way...
        summedData = {}
        categData = {}
        for d in poolFile.data:
            # Skip metadata/TAG/etc. branches:
            # if d.dirType != "B": continue
            # The name of this branch:
            brName = d.name
            # Check if this is a static/dynamic auxiliary variable:
            m = auxvarptn.search( d.name )
            if m:
                # Yes, it is. And the name of the main object/container is:
                brName = d.name[:m.start()]
            # Check if we already know this container:
            if brName in summedData.keys():
                summedData[ brName ].memSize  += d.memSize
                summedData[ brName ].diskSize += d.diskSize
            else:
                summedData[ brName ] = \
                    PF.PoolRecord( brName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
            # Set the C++ type name of the main object/container
            if brName == d.name:
                if summedData[ brName ].typeName and \
                   summedData[ brName ].typeName != d.typeName:
                    print(f"WARNING: Reset typeName {summedData[ brName ].typeName!r}"
                          f" -> {d.typeName!r} for {brName}", file=sys.stderr)
                summedData[ brName ].typeName = d.typeName
            pass
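
        # At this point summedData holds one PoolRecord per object/container,
        # with the sizes of all of its Aux/AuxDyn branches summed in.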

        # Order the records by size:
        orderedData = [rec for rec in summedData.values()]
        sorter = PF.PoolRecord.Sorter.DiskSize
        orderedData.sort( key = operator.attrgetter( sorter ) )
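        # The sort is ascending in disk size, so the largest containers appear
        # at the bottom of the printout, just above the totals.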

        # Print a header:
        print( "" )
        print( "=" * 106 )
        print( " Event data" )
        print( "=" * 106 )
        print(f'{"Mem Size":^16} {"Disk Size":^16} {"Size/Evt":^16} {"Compression":>12}'
              f' {"Items":>8} Container Name (Type) [Category]')
        print( "-" * 106 )

        # Now, let's print the event-wise info that we gathered:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            # Keep branches that either have the same number of entries as the number
            # of events, or are special tlp branches with extra event information.
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries != poolFile.dataHeader.nEntries and not mtlp: continue
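            # A branch counts as event data when its entry count matches that of
            # the DataHeader; _tlp and DataHeader branches are kept regardless.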

            colTypeName = d.typeName
            if colTypeName:
                for ptn in ("(?:_[pv]._|_tlp._|_v.>_)(.*)", "^[a-zA-Z]+_(.*_[lL]inks?)"):
                    m = re.search(ptn, d.name)
                    if m:
                        d_name = m.group(1)
                        break
                else:
                    m = re.search("_tlp.$", d.name)
                    if m:
                        d_name = d.name[:m.start()].replace("_",":")
                    else:
                        d_name = d.name
                nameType = "%s (%s)" % (d_name, colTypeName)
            else:
                m = re.search( "_v._", d.name )
                if m:
                    d_name = d.name[m.end():]
                    nameType = "%s (%s)" % ( d_name, (d.name[:m.end()-1]) )
                else:
                    m = re.search("_tlp.$", d.name)
                    if m:
                        d_name = d.name[:m.start()].replace("_",":")
                        nameType = "%s (%s)" % (d_name, d_name + m.group())
                    else:
                        d_name = d.name
                        nameType = "%s (%s)" % ( d.name, "()" )

            # Find category:
            for categ in reversed(categoryStrings.keys()):
                for pattern in categoryStrings[ categ ]:
                    if re.match(pattern, d_name.replace("Bkg_","")):
                        catName = categ
                        # Stop searching since category found
                        break
                else:
                    continue
                # Stop searching since category found
                break
            else:
                catName = '*Unknown*'
            # Add on category to name/type
            nameType += ' [' + catName + ']'

            # Now sum up the sizes according to the category.
            # Check if we already know this category:
            if catName in categData.keys():
                categData[ catName ].memSize  += d.memSize
                categData[ catName ].diskSize += d.diskSize
            else:
                categData[ catName ] = \
                    PF.PoolRecord( catName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
                pass
            pass

            print(f"{d.memSize:12.3f} {sizeUnits:3} {d.diskSize:12.3f} {sizeUnits:3}"
                  f" {d.diskSize / poolFile.dataHeader.nEntries:12.3f} {sizeUnits:3}"
                  f" {d.memSize / d.diskSize:12.3f} {d.nEntries:8d} {nameType:s}")
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 106 )
        print(f"{memSize:12.3f} {sizeUnits:3} {diskSize:12.3f} {sizeUnits:3}"
              f" {diskSize / poolFile.dataHeader.nEntries:12.3f} {sizeUnits:3}"
              f" {memSize / diskSize:12.3f} {poolFile.dataHeader.nEntries:8d} Total")
        print( "" )

        # Now print out the categorized information.
        # Order the records by size:
        categorizedData = list(categData.values())
        sorter = PF.PoolRecord.Sorter.DiskSize
        categorizedData.sort( key = operator.attrgetter( sorter ) )

        print( "=" * 80 )
        print( " Categorized data" )
        print( "=" * 80 )
        print(f'{"Disk Size/Evt":^16} {"Fraction":8} Category Name')
        print( "-" * 80 )
        totDiskSize = 0.0
        frac = 0.0
        ds = []
        dsFrac = []
        dsName = []
        for d in categorizedData:
            dsPerEvt = d.diskSize / poolFile.dataHeader.nEntries
            dsPerCatFrac = d.diskSize / diskSize
            totDiskSize += dsPerEvt
            frac += dsPerCatFrac
            ds += [dsPerEvt]
            dsFrac += [dsPerCatFrac]
            dsName += [d.name]
            print(f"{dsPerEvt:12.3f} {sizeUnits:3} {dsPerCatFrac:8.3f} {d.name:s}")
            pass
        print(f"{totDiskSize:12.3f} {sizeUnits:3} {frac:8.3f} Total")
        ds += [totDiskSize]
        dsFrac += [frac]
        dsName += ["Total"]

        print( "" )
        print( "=" * 80 )
        print( "CSV for categories disk size/evt and fraction:" )
        # Print out a comma-separated list in descending order:
        print( ",".join(reversed(dsName)) )
        b = ['{:.3f}'.format(i) for i in reversed(ds)]
        print( ",".join(b) )
        b = ['{:.3f}'.format(i) for i in reversed(dsFrac)]
        print( ",".join(b) )
        print( "=" * 80 )
        print( "" )


        print( "=" * 80 )
        print( " Meta data" )
        print( "=" * 80 )
        print(f'{"Mem Size":^16} {"Disk Size":^16} Container Name')
        print( "-" * 80 )

        # Now print the info about the metadata:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries == poolFile.dataHeader.nEntries or mtlp: continue
            print(f"{d.memSize:12.3f} {sizeUnits:3} {d.diskSize:12.3f} {sizeUnits:3} {d.name:s}")
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 80 )
        print(f"{memSize:12.3f} {sizeUnits:3} {diskSize:12.3f} {sizeUnits:3} Total")
        print( "=" * 80 )

        # Write out a CSV file if one was requested:
        if options.csvFileName and ( len( fileNames ) == 1 ):
            # Open the output file:
            import csv
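            # newline='' is used when opening the output file, as recommended
            # for the csv module, to avoid writing spurious blank lines.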
            args = {'newline' : ''}
            with open( options.csvFileName, "w", **args ) as f:
                writer = csv.writer( f )
                # Set up the formatting of the file:
                writer.writerow( [ "Name (Type)", "Size/Evt" ] )
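                # The Size/Evt column is the average per-event disk size, in the
                # same units (kB or KiB) as the tables printed above.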
                # Write all entries to it:
                for d in orderedData:
                    # Skip metadata items:
                    if d.nEntries != poolFile.dataHeader.nEntries: continue
                    # Construct the name of the entry:
                    colTypeName = d.typeName
                    if not colTypeName: continue
                    nameType = "%s (%s)" % \
                        ( d.name, colTypeName )
                    # Write the entry:
                    writer.writerow( [ nameType, d.diskSize / d.nEntries ] )
                    pass
                pass
            pass

        if len(fileNames) > 1:
            print( "" )
        pass # loop over fileNames

    print( "## Bye." )
    sys.exit( 0 )