ATLAS Offline Software
Tools/PyUtils/bin/checkxAOD.py
#!/usr/bin/env python

# Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
#
#
# This is a modified version of PyUtils/bin/checkFile.py. It has been taught
# how to sum up the sizes of all the branches belonging to a single xAOD
# object/container.
#

__author__ = "Sebastien Binet <binet@cern.ch>, " \
             "Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>, " \
             "RD Schaffer <R.D.Schaffer@cern.ch>"

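# Typical invocations, following the usage string below (the input file name
# is just an illustrative example):
#
#   checkxAOD.py my.xAOD.file.pool.root
#   checkxAOD.py -c sizes.csv my.xAOD.file.pool.root
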
import sys
import os
import re
import operator

from optparse import OptionParser

if __name__ == "__main__":

    parser = OptionParser( usage = "usage: %prog [-f] my.xAOD.file.pool.root" )
    p = parser.add_option
    p( "-f",
       "--file",
       dest = "fileName",
       help = "The path to the POOL file to analyze" )
    p( "-c",
       "--csv",
       dest = "csvFileName",
       help = "Output CSV file name, to use with spreadsheets" )
    ( options, args ) = parser.parse_args()

    # Ideally, the pattern lists ought to be defined such that the categories
    # do NOT overlap. Where they do overlap, the last matching category in
    # this list (i.e. the one further down) wins. The following categories
    # currently overlap:
    #   "PFO" and "Jet", "Muon" and "LRT",
    #   "Trig" and "LRT", "Trig" and "PFO",
    #   "Trig" and "caloringer", "InDet" and "LRT"
    # Set up the categorization matching strings:
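    # For example, a hypothetical container named "MuonSegmentsLRT" would
    # match both "^Muon" (Muon) and "(.*)LRT$" (LRT); since "LRT" comes later
    # in this dictionary, that container would be counted under "LRT".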
    categoryStrings = {
        "MetaData" : ["^DataHeader", "(.*)_mems$", "(.*)_timings$", "^Token$", "^RawInfoSummaryForTag$", "^index_ref$"],
        "Trig" : ["^HLT", "^LVL1", "^L1", "^xTrig", "^Trig", "^CTP_Decision", "^TrigInDetTrackTruthMap", "^TrigNavigation", ".*TriggerTowers", "TileTTL1MBTS", "^TileL2Cnt", "RoIBResult", "^_TRIGGER", "^L1TopoRawData", "BunchConfKey"],
        "MET" : ["^MET", "^METMAP", "JEMEtSums"],
        "EvtId" : ["^ByteStreamEventInfo", "^EventInfo", "^McEventInfo", "^LumiBlockN", "^EventWeight", "^RunNumber", "^ConditionsRun", "^EventTime", "^BunchId", "^EventNumber", "^IsTestBeam", "^IsSimulation", "^IsCalibration", "^AvgIntPerXing", "^ActualIntPerXing", "^RandomNumber", "^McChannel"],
        "tau" : ["^Tau", "^DiTauJets"],
        "PFO" : ["(.*)EventShape$", "^AntiKt4EMPFlowJets", "^JetETMissChargedParticleFlowObjects", "^JetETMissNeutralParticleFlowObjects", "^CHS(.*)ChargedParticleFlowObjects", "^CHSNeutralParticleFlowObjects", "^JetETMissLCNeutralParticleFlowObjects", "^Global(.*)ParticleFlowObjects"],
        "egamma" : ["^GSF", "^ForwardElectron", "^egamma", "^Electron(?!.*Ring)", "^Photon(?!.*Ring)"],
        "Muon" : ["^Muon", "^TileMuObj", "^MS", "^SlowMuons", ".*Stau", "(.*)MuonTrackParticles$", "MUCTPI_RDO", "^RPC", "^TGC", "^MDT", "^CSC", "^sTGC", "^MM", ".*MuonMeasurements$", "^ExtrapolatedMuonTracks", "^CombinedMuonTracks", "^NCB_MuonSegments", "^UnAssocMuonSegments", "^EMEO_Muons", "^EMEO_MuonSpectrometerTrackParticles", "^xAODNSWSegments"],
        "BTag" : ["^BTag"],
        "HGTD" : ["^HGTD"],
        "InDet" : ["^InDet", "^PrimaryVertices", "^ComTime_TRT", "^Pixel", "^TRT", "^SCT", "^BCM", "^CTP", "^Tracks", "^ResolvedForwardTracks", "^SplitClusterAmbiguityMap", "^SoftBVrt", "^BLMHits"],
        "ITk" : ["^ITk"],
        "Jet" : ["^CamKt", "^AntiKt", "^Jet(?!.*ParticleFlowObjects$)", "^LCOriginTopoClusters", "^EMOriginTopoClusters"],
        "CaloTopo" : ["CaloCalTopoCluster", "CaloCalFwdTopoTowers"],
        "Calo" : ["^LAr", "^AllCalo", "^AODCellContainer", "^MBTSContainer", "^CaloCompactCellContainer", "^CaloEntryLayer", "^E4prContainer", "^TileHitVec", "^TileCellVec", "^TileDigits", "^MBTSHits"],
        "Truth" : ["^Truth", "Truth$", "TruthMap$", "TruthCollection$", "^PRD_MultiTruth", "TracksTruth$", ".*TrackTruth$", "TrackTruthCollection", "^HardScatter", "BornLeptons", ".*ExitLayer$", ".*EntryLayer$"],
        "AFP" : ["^AFP"],
        "LRT" : ["^LRT", "(.*)LRT$", "(.*)LRTTrackParticles$", "(.*)LargeD0TrackParticles$"],
        "caloringer" : ["(.*)Ring"],
        "AnalysisElectrons" : ["^AnalysisElectrons"],
        "AnalysisTauJets" : ["^AnalysisTauJets"],
        "AnalysisPhotons" : ["^AnalysisPhotons"],
        "AnalysisMuons" : ["^AnalysisMuons"],
        "AnalysisJets" : ["^AnalysisJets"],
        "AnalysisHLT" : ["^AnalysisHLT"],
        "AnalysisTrigMatch" : ["^AnalysisTrigMatch"],
        "AnalysisLargeRJets" : ["^AnalysisLargeRJets"],
    }

    fileNames = []

    if len( args ) > 0:
        fileNames = [ arg for arg in args if arg[ 0 ] != "-" ]
        pass

    if options.fileName is None and len( fileNames ) == 0:
        parser.print_help()
        sys.exit( 1 )

    if options.fileName is not None:
        fileName = os.path.expandvars( os.path.expanduser( options.fileName ) )
        fileNames.append( fileName )
        pass

    fileNames = set( fileNames )

    # Check the consistency with the CSV output:
    if len( fileNames ) > 1 and options.csvFileName:
        print( "WARNING CSV output is only available when processing a single "
               "input file" )
        pass

    # Pattern for identifying a static/dynamic auxiliary variable:
    auxvarptn = re.compile( r"Aux(?:Dyn)?(?:\.|:)" )
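    # E.g. both "ElectronsAux." (static store) and "ElectronsAuxDyn.pt"
    # (dynamic variable) match this pattern; everything before the match
    # ("Electrons") is taken as the name of the owning container. (The branch
    # names here are only illustrative.)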
    # Loop over the specified file(s):
    for fileName in fileNames:

        # Open the file:
        import PyUtils.PoolFile as PF
        poolFile = PF.PoolFile( fileName )

        # Loop over all the branches of the file, and sum up the information
        # about them in a smart way...
        summedData = {}
        categData = {}
        for d in poolFile.data:
            # Skip metadata/TAG/etc. branches:
            # if d.dirType != "B": continue
            # The name of this branch:
            brName = d.name
            # Check if this is a static/dynamic auxiliary variable:
            m = auxvarptn.search( d.name )
            if m:
                # Yes, it is. And the name of the main object/container is:
                brName = d.name[:m.start()]
            # Check if we already know this container:
            if brName in summedData.keys():
                summedData[ brName ].memSize += d.memSize
                summedData[ brName ].diskSize += d.diskSize
            else:
                summedData[ brName ] = \
                    PF.PoolRecord( brName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
            # Set the C++ type name of the main object/container:
            if brName == d.name:
                if summedData[ brName ].typeName and \
                   summedData[ brName ].typeName != d.typeName:
                    print( f"WARNING: Reset typeName {summedData[ brName ].typeName!r}"
                           f" -> {d.typeName!r} for {brName}", file = sys.stderr )
                summedData[ brName ].typeName = d.typeName
            pass

        # Order the records by size:
        orderedData = [ rec for rec in summedData.values() ]
        sorter = PF.PoolRecord.Sorter.DiskSize
        orderedData.sort( key = operator.attrgetter( sorter ) )
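        # (PF.PoolRecord.Sorter.DiskSize presumably names the record attribute
        # to order by; operator.attrgetter turns that name into a key
        # function for the sort.)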

        # Print a header:
        print( "" )
        print( "=" * 80 )
        print( " Event data" )
        print( "=" * 80 )
        print( PF.PoolOpts.HDR_FORMAT %
               ( "Mem Size", "Disk Size", "Size/Evt", "Compression",
                 "Items", "Container Name (Type)" ) )
        print( "-" * 80 )

        # Now, let's print the event-wise info that we gathered:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            # Keep only branches that either have the same number of entries
            # as the number of events, or are the special "tlp" branches that
            # carry extra event information:
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries != poolFile.dataHeader.nEntries and not mtlp: continue

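            # Strip the persistency decoration off the branch name to get a
            # readable container name. For instance (hypothetical branch
            # names), "DataVector<xAOD::Electron_v1>_Electrons" would yield
            # "Electrons" via the first pattern, while the "_tlp.$" fallback
            # would turn "Trk_Tracks_tlp2" into "Trk:Tracks".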
            colTypeName = d.typeName
            if colTypeName:
                for ptn in ( "(?:_[pv]._|_tlp._|_v.>_)(.*)",
                             "^[a-zA-Z]+_(.*_[lL]inks?)" ):
                    m = re.search( ptn, d.name )
                    if m:
                        d_name = m.group( 1 )
                        break
                else:
                    m = re.search( "_tlp.$", d.name )
                    if m:
                        d_name = d.name[:m.start()].replace( "_", ":" )
                    else:
                        d_name = d.name
                nameType = "%s (%s)" % ( d_name, colTypeName )
            else:
                m = re.search( "_v._", d.name )
                if m:
                    d_name = d.name[m.end():]
                    nameType = "%s (%s)" % ( d_name, d.name[:m.end()-1] )
                else:
                    m = re.search( "_tlp.$", d.name )
                    if m:
                        d_name = d.name[:m.start()].replace( "_", ":" )
                        nameType = "%s (%s)" % ( d_name, d_name + m.group() )
                    else:
                        d_name = d.name
                        nameType = "%s (%s)" % ( d.name, "()" )

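            # (Reminder: a for-loop's "else" branch runs only if the loop was
            # not left via "break". The nested for/else below uses this to
            # break out of both loops as soon as a pattern matches.)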
            # Find the category:
            for categ in reversed( categoryStrings.keys() ):
                for pattern in categoryStrings[ categ ]:
                    if re.match( pattern, d_name.replace( "Bkg_", "" ) ):
                        catName = categ
                        # Stop searching, since the category was found:
                        break
                else:
                    continue
                # Stop searching, since the category was found:
                break
            else:
                catName = '*Unknown*'
            # Append the category to the name/type:
            nameType += ' [' + catName + ']'

            # Now sum up the sizes according to the category.
            # Check if we already know this category:
            if catName in categData.keys():
                categData[ catName ].memSize += d.memSize
                categData[ catName ].diskSize += d.diskSize
            else:
                categData[ catName ] = \
                    PF.PoolRecord( catName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
                pass
            pass

            print( PF.PoolOpts.ROW_FORMAT %
                   ( d.memSize,
                     d.diskSize,
                     ( d.diskSize / poolFile.dataHeader.nEntries ),
                     ( d.memSize / d.diskSize ),
                     d.nEntries,
                     nameType ) )
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 80 )
        print( PF.PoolOpts.ROW_FORMAT %
               ( memSize,
                 diskSize,
                 ( diskSize / poolFile.dataHeader.nEntries ),
                 0.0,
                 poolFile.dataHeader.nEntries,
                 "Total" ) )
        print( "" )

        # Now print out the categorized information.
        # Order the records by size:
        categorizedData = list( categData.values() )
        sorter = PF.PoolRecord.Sorter.DiskSize
        categorizedData.sort( key = operator.attrgetter( sorter ) )

        print( "=" * 80 )
        print( " Categorized data" )
        print( "=" * 80 )
        print( " Disk Size Fraction Category Name" )
        print( "-" * 80 )
        totDiskSize = 0.0
        frac = 0.0
        ds = []
        dsFrac = []
        dsName = []
        for d in categorizedData:
            dsPerEvt = d.diskSize / poolFile.dataHeader.nEntries
            dsPerEvtFrac = d.diskSize / diskSize
            totDiskSize += dsPerEvt
            frac += dsPerEvtFrac
            ds += [ dsPerEvt ]
            dsFrac += [ dsPerEvtFrac ]
            dsName += [ d.name ]
            print( "%12.3f kb %12.3f %s" % ( dsPerEvt, dsPerEvtFrac, d.name ) )
            pass
        print( "%12.3f kb %12.3f %s" % ( totDiskSize, frac, "Total" ) )
        ds += [ totDiskSize ]
        dsFrac += [ frac ]
        dsName += [ "Total" ]

        print( "" )
        print( "=" * 80 )
        print( "CSV for categories disk size/evt and fraction:" )
        # Print out comma-separated lists, in descending order of size:
        print( ",".join( reversed( dsName ) ) )
        b = [ '{:.3f}'.format( i ) for i in reversed( ds ) ]
        print( ",".join( b ) )
        b = [ '{:.3f}'.format( i ) for i in reversed( dsFrac ) ]
        print( ",".join( b ) )
        print( "=" * 80 )
        print( "" )

        print( "=" * 80 )
        print( " Meta data" )
        print( "=" * 80 )
        print( " Mem Size Disk Size Container Name" )
        print( "-" * 80 )

        # Now print the info about the metadata:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries == poolFile.dataHeader.nEntries or mtlp: continue
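            # (This selection is the exact complement of the one used for the
            # event data table above.)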
            print( "%12.3f kb %12.3f kb %s" %
                   ( d.memSize, d.diskSize, d.name ) )
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 80 )
        print( "%12.3f kb %12.3f kb %s" %
               ( memSize, diskSize, "Total" ) )
        print( "=" * 80 )

        # Write out a CSV file if one was requested:
        if options.csvFileName and ( len( fileNames ) == 1 ):
            # Open the output file:
            import csv
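            # (The csv module documentation recommends opening files handed to
            # csv.writer with newline='', so the writer can apply its own
            # line-ending handling.)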
            args = { 'newline' : '' }
            with open( options.csvFileName, "w", **args ) as f:
                writer = csv.writer( f )
                # Set up the formatting of the file:
                writer.writerow( [ "Name (Type)", "Size/Evt" ] )
                # Write all entries to it:
                for d in orderedData:
                    # Skip metadata items:
                    if d.nEntries != poolFile.dataHeader.nEntries: continue
                    # Construct the name of the entry:
                    colTypeName = d.typeName
                    if not colTypeName: continue
                    nameType = "%s (%s)" % ( d.name, colTypeName )
                    # Write the entry:
                    writer.writerow( [ nameType, d.diskSize / d.nEntries ] )
                    pass
                pass
            pass

        if len( fileNames ) > 1:
            print( "" )
        pass # loop over fileNames

    print( "## Bye." )
    sys.exit( 0 )