ATLAS Offline Software
Tools/PyUtils/bin/checkxAOD.py
#!/usr/bin/env python

# Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
#
#
# This is a modified version of PyUtils/bin/checkFile.py. It has been taught
# how to sum up the sizes of all the branches belonging to a single xAOD
# object/container.
#

__author__ = "Sebastien Binet <binet@cern.ch>, " \
    "Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>, " \
    "RD Schaffer <R.D.Schaffer@cern.ch>"
import sys
import os
import re
import operator

from optparse import OptionParser
if __name__ == "__main__":

    parser = OptionParser( usage = "usage: %prog [-f] my.xAOD.file.pool.root" )
    p = parser.add_option
    p( "-f",
       "--file",
       dest = "fileName",
       help = "The path to the POOL file to analyze" )
    p( "-c",
       "--csv",
       dest = "csvFileName",
       help = "Output CSV file name, to use with spreadsheets" )
    ( options, args ) = parser.parse_args()
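
    # Typical invocations (an illustrative sketch, not from the original
    # script):
    #   checkxAOD.py my.xAOD.file.pool.root
    #   checkxAOD.py -f my.xAOD.file.pool.root -c sizes.csv
    # Positional arguments and -f/--file may be combined; duplicate names
    # are folded away below when the list is turned into a set.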

    # Ideally, the pattern lists ought to be defined such that the categories
    # do NOT overlap. If they do overlap, the matching category that comes
    # last (i.e. lower in this list) wins. The following categories currently
    # overlap:
    #   "PFO" and "Jet", "Muon" and "LRT",
    #   "Trig" and "LRT", "Trig" and "PFO",
    #   "Trig" and "caloringer", "InDet" and "LRT"
    # Set up the categorization matching strings:
    categoryStrings = {
        "MetaData" : ["^DataHeader", "(.*)_mems$", "(.*)_timings$", "^Token$", "^RawInfoSummaryForTag$", "^index_ref$"],
        "Trig" : ["^HLT", "^LVL1", "^L1", "^xTrig", "^Trig", "^CTP_Decision", "^TrigInDetTrackTruthMap", "^TrigNavigation", ".*TriggerTowers", "TileTTL1MBTS", "^TileL2Cnt", "RoIBResult", "^_TRIGGER", "^L1TopoRawData", "BunchConfKey"],
        "MET" : ["^MET", "^METMAP", "JEMEtSums"],
        "EvtId" : ["^ByteStreamEventInfo", "^EventInfo", "^McEventInfo", "^LumiBlockN", "^EventWeight", "^RunNumber", "^ConditionsRun", "^EventTime", "^BunchId", "^EventNumber", "^IsTestBeam", "^IsSimulation", "^IsCalibration", "^AvgIntPerXing", "^ActualIntPerXing", "^RandomNumber", "^McChannel"],
        "tau" : ["^Tau", "^DiTauJets"],
        "PFO" : ["(.*)EventShape$", "^AntiKt4EMPFlowJets", "^JetETMissChargedParticleFlowObjects", "^JetETMissNeutralParticleFlowObjects", "^CHS(.*)ChargedParticleFlowObjects", "^CHSNeutralParticleFlowObjects", "^JetETMissLCNeutralParticleFlowObjects", "^Global(.*)ParticleFlowObjects"],
        "egamma" : ["^GSF", "^ForwardElectron", "^egamma", "^Electron(?!.*Ring)", "^Photon(?!.*Ring)"],
        "Muon" : ["^Muon", "^TileMuObj", "^MS", "^SlowMuons", ".*Stau", "(.*)MuonTrackParticles$", "MUCTPI_RDO", "^RPC", "^TGC", "^MDT", "^CSC", "^sTGC", "^MM", ".*MuonMeasurements$", "^ExtrapolatedMuonTracks", "^CombinedMuonTracks", "^NCB_MuonSegments", "^UnAssocMuonSegments", "^EMEO_Muons", "^EMEO_MuonSpectrometerTrackParticles", "^xAODNSWSegments"],
        "BTag" : ["^BTag"],
        "HGTD" : ["^HGTD"],
        "InDet" : ["^InDet", "^PrimaryVertices", "^ComTime_TRT", "^Pixel", "^TRT", "^SCT", "^BCM", "^CTP", "^Tracks", "^ResolvedForwardTracks", "^SplitClusterAmbiguityMap", "^SoftBVrt", "^BLMHits", "^FourLeptonVertices"],
        "ITk" : ["^ITk"],
        "Jet" : ["^CamKt", "^AntiKt", "^Jet(?!.*ParticleFlowObjects$)", "^LCOriginTopoClusters", "^EMOriginTopoClusters"],
        "CaloTopo" : ["CaloCalTopoCluster", "CaloCalFwdTopoTowers"],
        "Calo" : ["^LAr", "^AllCalo", "^AODCellContainer", "^MBTSContainer", "^CaloCompactCellContainer", "^CaloEntryLayer", "^E4prContainer", "^TileHitVec", "^TileCellVec", "^TileDigits", "^MBTSHits"],
        "Truth" : ["^Truth", "Truth$", "TruthMap$", "TruthCollection$", "^PRD_MultiTruth", "TracksTruth$", ".*TrackTruth$", "TrackTruthCollection", "^HardScatter", "BornLeptons", ".*ExitLayer$", ".*EntryLayer$"],
        "AFP" : ["^AFP"],
        "LRT" : ["^LRT", "(.*)LRT$", "(.*)LRTTrackParticles$", "(.*)LargeD0TrackParticles$"],
        "caloringer" : ["(.*)Ring"],
        "AnalysisElectrons" : ["^AnalysisElectrons"],
        "AnalysisTauJets" : ["^AnalysisTauJets"],
        "AnalysisPhotons" : ["^AnalysisPhotons"],
        "AnalysisMuons" : ["^AnalysisMuons"],
        "AnalysisJets" : ["^AnalysisJets"],
        "AnalysisHLT" : ["^AnalysisHLT"],
        "AnalysisTrigMatch" : ["^AnalysisTrigMatch"],
        "AnalysisLargeRJets" : ["^AnalysisLargeRJets"],
        "AnalysisSiHitElectrons" : ["^AnalysisSiHitElectrons"],
    }
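
    # Illustrative note (not in the original script): the patterns above are
    # applied further below with re.match() to the decoded container name,
    # after stripping any "Bkg_" prefix, and the categories are scanned in
    # reverse dict order, so on overlap the lower entry wins. For example,
    # "JetETMissChargedParticleFlowObjects" lands in "PFO" (the "Jet" pattern
    # deliberately excludes names ending in ParticleFlowObjects), whereas
    # "AntiKt4EMPFlowJets" is claimed by "Jet" via "^AntiKt".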

    fileNames = []

    if len( args ) > 0:
        fileNames = [ arg for arg in args if arg[ 0 ] != "-" ]
        pass

    if options.fileName is None and len( fileNames ) == 0:
        parser.print_help()
        sys.exit( 1 )

    if options.fileName is not None:
        fileName = os.path.expandvars( os.path.expanduser( options.fileName ) )
        fileNames.append( fileName )
        pass

    fileNames = set( fileNames )

    # Check the consistency with the CSV output:
    if len( fileNames ) > 1 and options.csvFileName:
        print( "WARNING: CSV output is only available when processing a single "
               "input file" )
        pass

    # Pattern for a static/dynamic auxiliary variable identification:
    auxvarptn = re.compile( r"Aux(?:Dyn)?(?:\.|:)" )
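    # For instance (illustrative, not part of the original script), this
    # pattern splits branch names at their Aux marker:
    #   "ElectronsAuxDyn.pt" -> container "Electrons"
    #   "EventInfoAux."      -> container "EventInfo"
    # Branches without an Aux/AuxDyn marker are booked under their full name.
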
    # Loop over the specified file(s):
    for fileName in fileNames:

        # Open the file:
        import PyUtils.PoolFile as PF
        poolFile = PF.PoolFile( fileName )

        # Loop over all the branches of the file, and sum up the information
        # about them in a smart way...
        summedData = {}
        categData = {}
        for d in poolFile.data:
            # Skip metadata/TAG/etc. branches:
            # if d.dirType != "B": continue
            # The name of this branch:
            brName = d.name
            # Check if this is a static/dynamic auxiliary variable:
            m = auxvarptn.search( d.name )
            if m:
                # Yes, it is. And the name of the main object/container is:
                brName = d.name[ : m.start() ]
            # Check if we already know this container:
            if brName in summedData:
                summedData[ brName ].memSize  += d.memSize
                summedData[ brName ].diskSize += d.diskSize
            else:
                summedData[ brName ] = \
                    PF.PoolRecord( brName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
            # Set the C++ type name of the main object/container:
            if brName == d.name:
                if summedData[ brName ].typeName and \
                   summedData[ brName ].typeName != d.typeName:
                    print( f"WARNING: Reset typeName {summedData[ brName ].typeName!r}"
                           f" -> {d.typeName!r} for {brName}", file = sys.stderr )
                summedData[ brName ].typeName = d.typeName
            pass

        # Order the records by size:
        orderedData = list( summedData.values() )
        sorter = PF.PoolRecord.Sorter.DiskSize
        orderedData.sort( key = operator.attrgetter( sorter ) )
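
        # Sorter.DiskSize names the record attribute handed to
        # operator.attrgetter(), so the sort is ascending: the largest
        # containers end up at the bottom of the table, just above the totals.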

        # Print a header:
        print( "" )
        print( "=" * 80 )
        print( "             Event data" )
        print( "=" * 80 )
        print( PF.PoolOpts.HDR_FORMAT %
               ( "Mem Size", "Disk Size", "Size/Evt", "Compression",
                 "Items", "Container Name (Type)" ) )
        print( "-" * 80 )

        # Now, let's print the event-wise info that we gathered:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            # Keep branches that either have the same number of entries as the
            # number of events, or are special tlp branches with extra event
            # information:
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries != poolFile.dataHeader.nEntries and not mtlp: continue

            colTypeName = d.typeName
            if colTypeName:
                for ptn in ("(?:_[pv]._|_tlp._|_v.>_)(.*)", "^[a-zA-Z]+_(.*_[lL]inks?)"):
                    m = re.search(ptn, d.name)
                    if m:
                        d_name = m.group(1)
                        break
                else:
                    m = re.search("_tlp.$", d.name)
                    if m:
                        d_name = d.name[:m.start()].replace("_",":")
                    else:
                        d_name = d.name
                nameType = "%s (%s)" % (d_name, colTypeName)
            else:
                m = re.search( "_v._", d.name )
                if m:
                    d_name = d.name[m.end():]
                    nameType = "%s (%s)" % ( d_name, (d.name[:m.end()-1]) )
                else:
                    m = re.search("_tlp.$", d.name)
                    if m:
                        d_name = d.name[:m.start()].replace("_",":")
                        nameType = "%s (%s)" % (d_name, d_name + m.group())
                    else:
                        d_name = d.name
                        nameType = "%s (%s)" % ( d.name, "()" )
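
            # Illustrative example (not from the original source): a persistent
            # branch name like "McEventCollection_p5_TruthEvent" matches the
            # first pattern above, so d_name becomes "TruthEvent" and the row
            # is labelled "TruthEvent (<C++ type>)".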

            # Find category:
            for categ in reversed(categoryStrings.keys()):
                for pattern in categoryStrings[ categ ]:
                    if re.match(pattern, d_name.replace("Bkg_","")):
                        catName = categ
                        # Stop searching since category found
                        break
                else:
                    continue
                # Stop searching since category found
                break
            else:
                catName = '*Unknown*'
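
            # A note on the for/else constructs above: the "else" of a for
            # loop runs only if the loop ended without "break". The inner else
            # therefore advances to the next category when none of its
            # patterns matched, while the outer else marks the record as
            # uncategorized. Stripping the "Bkg_" prefix classifies background
            # copies of a container together with the originals.
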
            # Add the category to the name/type:
            nameType += ' [' + catName + ']'

            # Now sum up the sizes according to the category.
            # Check if we already know this category:
            if catName in categData:
                categData[ catName ].memSize  += d.memSize
                categData[ catName ].diskSize += d.diskSize
            else:
                categData[ catName ] = \
                    PF.PoolRecord( catName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
                pass
            pass

            print( PF.PoolOpts.ROW_FORMAT %
                   ( d.memSize,
                     d.diskSize,
                     ( d.diskSize / poolFile.dataHeader.nEntries ),
                     ( d.memSize / d.diskSize ),
                     d.nEntries,
                     nameType ) )
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 80 )
        print( PF.PoolOpts.ROW_FORMAT %
               ( memSize,
                 diskSize,
                 ( diskSize / poolFile.dataHeader.nEntries ),
                 0.0,
                 poolFile.dataHeader.nEntries,
                 "Total" ) )
        print( "" )

        # Now print out the categorized information.
        # Order the records by size:
        categorizedData = list( categData.values() )
        sorter = PF.PoolRecord.Sorter.DiskSize
        categorizedData.sort( key = operator.attrgetter( sorter ) )

        print( "=" * 80 )
        print( "             Categorized data" )
        print( "=" * 80 )
        print( "     Disk Size       Fraction    Category Name" )
        print( "-" * 80 )
        totDiskSize = 0.0
        frac = 0.0
        ds = []
        dsFrac = []
        dsName = []
        for d in categorizedData:
            dsPerEvt = d.diskSize / poolFile.dataHeader.nEntries
            dsPerEvtFrac = d.diskSize / diskSize
            totDiskSize += dsPerEvt
            frac += dsPerEvtFrac
            ds += [ dsPerEvt ]
            dsFrac += [ dsPerEvtFrac ]
            dsName += [ d.name ]
            print( "%12.3f kb %12.3f       %s" % ( dsPerEvt, dsPerEvtFrac, d.name ) )
            pass
        print( "%12.3f kb %12.3f       %s" % ( totDiskSize, frac, "Total" ) )
        ds += [ totDiskSize ]
        dsFrac += [ frac ]
        dsName += [ "Total" ]
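
        # As the format strings above indicate, the sizes are quoted in
        # kilobytes: dsPerEvt is kb/event for the category, and dsPerEvtFrac
        # is its share of the total per-event event-data size computed above.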

        print( "" )
        print( "=" * 80 )
        print( "CSV for categories disk size/evt and fraction:" )
        # Print out the comma-separated lists in descending order:
        print( ",".join( reversed( dsName ) ) )
        b = [ '{:.3f}'.format( i ) for i in reversed( ds ) ]
        print( ",".join( b ) )
        b = [ '{:.3f}'.format( i ) for i in reversed( dsFrac ) ]
        print( ",".join( b ) )
        print( "=" * 80 )
        print( "" )


        print( "=" * 80 )
        print( "             Meta data" )
        print( "=" * 80 )
        print( "       Mem Size         Disk Size  Container Name" )
        print( "-" * 80 )

        # Now print the info about the metadata:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries == poolFile.dataHeader.nEntries or mtlp: continue
            print( "%12.3f kb %12.3f kb %s" %
                   ( d.memSize, d.diskSize, d.name ) )
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 80 )
        print( "%12.3f kb %12.3f kb %s" %
               ( memSize, diskSize, "Total" ) )
        print( "=" * 80 )

        # Write out a CSV file if one was requested:
        if options.csvFileName and ( len( fileNames ) == 1 ):
            # Open the output file:
            import csv
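            # The csv module documentation recommends opening output files
            # with newline='' so that the writer controls line endings itself: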
            args = { 'newline' : '' }
            with open( options.csvFileName, "w", **args ) as f:
                writer = csv.writer( f )
                # Set up the formatting of the file:
                writer.writerow( [ "Name (Type)", "Size/Evt" ] )
                # Write all entries to it:
                for d in orderedData:
                    # Skip metadata items:
                    if d.nEntries != poolFile.dataHeader.nEntries: continue
                    # Construct the name of the entry:
                    colTypeName = d.typeName
                    if not colTypeName: continue
                    nameType = "%s (%s)" % \
                        ( d.name, colTypeName )
                    # Write the entry:
                    writer.writerow( [ nameType, d.diskSize / d.nEntries ] )
                    pass
                pass
            pass

        if len( fileNames ) > 1:
            print( "" )
        pass # loop over fileNames

    print( "## Bye." )
    sys.exit( 0 )