ATLAS Offline Software
Tools/PyUtils/bin/checkxAOD.py
#!/usr/bin/env python

# Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
#
#
# This is a modified version of PyUtils/bin/checkFile.py. It has been taught
# how to sum up the sizes of all the branches belonging to a single xAOD
# object/container.
#

__author__ = "Sebastien Binet <binet@cern.ch>, " \
    "Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>, " \
    "RD Schaffer <R.D.Schaffer@cern.ch>"

import sys
import os
import re
import operator

from optparse import OptionParser

if __name__ == "__main__":

    parser = OptionParser( usage = "usage: %prog [OPTION]... my.xAOD.file.pool.root" )
    p = parser.add_option
    p( "--si",
       action="store_true", dest = "siUnits",
       help = "print sizes in kB, i.e., in units of 1000 bytes"
              " (default: print sizes in KiB, i.e., in units of 1024 bytes)" )
    p( "-f",
       "--file",
       dest = "fileName",
       help = "The path to the POOL file to analyze" )
    p( "-c",
       "--csv",
       dest = "csvFileName",
       help = "Output CSV file name, to use with spreadsheets" )
    ( options, args ) = parser.parse_args()
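
    # Example invocations (illustrative, using the options defined above):
    #
    #   checkxAOD.py my.xAOD.file.pool.root
    #   checkxAOD.py --si -f my.xAOD.file.pool.root
    #   checkxAOD.py -c sizes.csv my.xAOD.file.pool.root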

    # Ideally, the pattern lists below would be defined so that the categories
    # do NOT overlap. Where they do overlap, the matched category that appears
    # lower in this list wins. The following categories currently overlap:
    #   "PFO" and "Jet", "Muon" and "LRT",
    #   "Trig" and "LRT", "Trig" and "PFO",
    #   "Trig" and "caloringer", "InDet" and "LRT"
    # Set up categorization matching strings:
    categoryStrings = {
        "MetaData" : ["^DataHeader", "(.*)_mems$", "(.*)_timings$", "^Token$", "^RawInfoSummaryForTag$", "^index_ref$"],
        "Trig" : ["^HLT", "^LVL1", "^L1", "^xTrig", "^Trig", "^CTP_Decision", "^TrigInDetTrackTruthMap", "^TrigNavigation", ".*TriggerTowers", "TileTTL1MBTS", "^TileL2Cnt", "RoIBResult","^_TRIGGER","^L1TopoRawData", "BunchConfKey"],
        "MET" : ["^MET", "^METMAP", "JEMEtSums"],
        "EvtId" : ["^ByteStreamEventInfo", "^EventInfo", "^McEventInfo", "^LumiBlockN", "^EventWeight", "^RunNumber", "^ConditionsRun", "^EventTime", "^BunchId", "^EventNumber","^IsTestBeam", "^IsSimulation", "^IsCalibration", "^AvgIntPerXing", "^ActualIntPerXing", "^RandomNumber", "^McChannel"],
        "tau" : ["^Tau", "^DiTauJets"],
        "PFO" : ["(.*)EventShape$", "^AntiKt4EMPFlowJets", "^JetETMissChargedParticleFlowObjects", "^JetETMissNeutralParticleFlowObjects", "^CHS(.*)ChargedParticleFlowObjects", "^CHSNeutralParticleFlowObjects", "^JetETMissLCNeutralParticleFlowObjects", "^Global(.*)ParticleFlowObjects"],
        "egamma" : ["^GSF", "^ForwardElectron", "^egamma", "^Electron(?!.*Ring)", "^Photon(?!.*Ring)"],
        "Muon" : ["^Muon", "^TileMuObj", "^MS", "^SlowMuons", ".*Stau", "(.*)MuonTrackParticles$", "MUCTPI_RDO", "^RPC", "^TGC", "^MDT", "^CSC", "^sTGC", "^MM", ".*MuonMeasurements$", "^ExtrapolatedMuonTracks", "^CombinedMuonTracks", "^NCB_MuonSegments", "^UnAssocMuonSegments", "^EMEO_Muons", "^EMEO_MuonSpectrometerTrackParticles", "^xAODNSWSegments"],
        "BTag" : ["^BTag"],
        "HGTD" : ["^HGTD"],
        "InDet" : ["^InDet", "^PrimaryVertices", "^ComTime_TRT", "^Pixel", "^TRT", "^SCT", "^BCM", "^CTP", "^Tracks", "^ResolvedForwardTracks", "^SplitClusterAmbiguityMap", "^SoftBVrt","^BLMHits", "^FourLeptonVertices"],
        "ITk" : ["^ITk"],
        "ACTS" : [".*Acts.*"],
        "Jet" : ["^CamKt", "^AntiKt", "^Jet(?!.*ParticleFlowObjects$)","^LCOriginTopoClusters","^EMOriginTopoClusters"],
        "CaloTopo" : ["CaloCalTopoCluster", "CaloCalFwdTopoTowers"],
        "Calo" : ["^LAr", "^AllCalo", "^AODCellContainer", "^MBTSContainer", "^CaloCompactCellContainer", "^CaloEntryLayer", "^E4prContainer", "^TileHitVec", "^TileCellVec", "^TileDigits", "^MBTSHits"],
        "Truth" : ["^Truth", "Truth$", "TruthMap$", "TruthCollection$", "^PRD_MultiTruth", "TracksTruth$", ".*TrackTruth$", "TrackTruthCollection", "^HardScatter", "BornLeptons",".*ExitLayer$",".*EntryLayer$"],
        "AFP" : ["^AFP"],
        "LRT" : ["^LRT", "(.*)LRT$", "(.*)LRTTrackParticles$", "(.*)LargeD0TrackParticles$"],
        "caloringer" : ["(.*)Ring"],
        "AnalysisElectrons" : ["^AnalysisElectrons" ],
        "AnalysisTauJets" : ["^AnalysisTauJets" ],
        "AnalysisPhotons" : ["^AnalysisPhotons" ],
        "AnalysisMuons" : ["^AnalysisMuons" ],
        "AnalysisJets" : ["^AnalysisJets" ],
        "AnalysisHLT" : ["^AnalysisHLT" ],
        "AnalysisTrigMatch" : ["^AnalysisTrigMatch" ],
        "AnalysisLargeRJets" : ["^AnalysisLargeRJets" ],
        "AnalysisSiHitElectrons" : ["^AnalysisSiHitElectrons" ],
    }
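
    # For example, "AntiKt4EMPFlowJets" matches both the "PFO" pattern
    # "^AntiKt4EMPFlowJets" and the "Jet" pattern "^AntiKt"; since "Jet" sits
    # lower in the list, and the category search below walks the list in
    # reverse, such a container is counted under "Jet".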

    fileNames = []

    if len( args ) > 0:
        fileNames = [ arg for arg in args if arg[ 0 ] != "-" ]
        pass

    if options.fileName is None and len( fileNames ) == 0:
        parser.print_help()
        sys.exit( 1 )

    if options.fileName is not None:
        fileName = os.path.expandvars( os.path.expanduser( options.fileName ) )
        fileNames.append( fileName )
        pass

    fileNames = set( fileNames )

    # Check the consistency with the CSV output:
    if len( fileNames ) > 1 and options.csvFileName:
        print( "WARNING CSV output is only available when processing a single "
               "input file" )
        pass

    import PyUtils.PoolFile as PF
    if options.siUnits:
        PF.Units.kb = 1000.
        sizeUnits = "kB"
    else:
        sizeUnits = "KiB"

    # Pattern used to identify a static/dynamic auxiliary variable:
    auxvarptn = re.compile( r"Aux(?:Dyn)?(?:\.|:)" )
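    # E.g. branches named "ElectronsAux." or "ElectronsAuxDyn.pt" (names here
    # purely illustrative) both fold into the base container name "Electrons".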
    # Loop over the specified file(s):
    for fileName in fileNames:

        # Open the file:
        poolFile = PF.PoolFile( fileName )

        # Loop over all the branches of the file, and sum up the information
        # about them in a smart way...
        summedData = {}
        categData = {}
        for d in poolFile.data:
            # Skip metadata/TAG/etc. branches:
            # if d.dirType != "B": continue
            # The name of this branch:
            brName = d.name
            # Check if this is a static/dynamic auxiliary variable:
            m = auxvarptn.search( d.name )
            if m:
                # Yes, it is. And the name of the main object/container is:
                brName = d.name[:m.start()]
            # Check if we already know this container:
            if brName in summedData.keys():
                summedData[ brName ].memSize += d.memSize
                summedData[ brName ].diskSize += d.diskSize
            else:
                summedData[ brName ] = \
                    PF.PoolRecord( brName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
            # Set the C++ type name of the main object/container
            if brName == d.name:
                if summedData[ brName ].typeName and \
                   summedData[ brName ].typeName != d.typeName:
                    print(f"WARNING: Reset typeName {summedData[ brName ].typeName!r}"
                          f" -> {d.typeName!r} for {brName}", file=sys.stderr)
                summedData[ brName ].typeName = d.typeName
            pass

        # Order the records by size:
        orderedData = [rec for rec in summedData.values()]
        sorter = PF.PoolRecord.Sorter.DiskSize
        orderedData.sort( key = operator.attrgetter( sorter ) )

        # Print a header:
        print( "" )
        print( "=" * 106 )
        print( " Event data" )
        print( "=" * 106 )
        print(f'{"Mem Size":^16} {"Disk Size":^16} {"Size/Evt":^16} {"Compression":>12}'
              f' {"Items":>8} Container Name (Type) [Category]')
        print( "-" * 106 )

        # Now, let's print the event-wise info that we gathered:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
            # keep branches with either the same number of entries as the number
            # of events, or the special tlp branches with extra event information
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries != poolFile.dataHeader.nEntries and not mtlp: continue

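            # Derive a human-readable name by stripping persistent-version
            # prefixes such as "_p5_" or "_tlp1_". E.g. a branch named
            # "McEventCollection_p5_TruthEvent" (illustrative) is shown as
            # "TruthEvent".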
            colTypeName = d.typeName
            if colTypeName:
                for ptn in ("(?:_[pv]._|_tlp._|_v.>_)(.*)", "^[a-zA-Z]+_(.*_[lL]inks?)"):
                    m = re.search(ptn, d.name)
                    if m:
                        d_name = m.group(1)
                        break
                else:
                    m = re.search("_tlp.$", d.name)
                    if m:
                        d_name = d.name[:m.start()].replace("_",":")
                    else:
                        d_name = d.name
                nameType = "%s (%s)" % (d_name, colTypeName)
            else:
                m = re.search( "_v._", d.name )
                if m:
                    d_name = d.name[m.end():]
                    nameType = "%s (%s)" % ( d_name, (d.name[:m.end()-1]) )
                else:
                    m = re.search("_tlp.$", d.name)
                    if m:
                        d_name = d.name[:m.start()].replace("_",":")
                        nameType = "%s (%s)" % (d_name, d_name + m.group())
                    else:
                        d_name = d.name
                        nameType = "%s (%s)" % ( d.name, "()" )

            # Find category:
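            # Walk the category list from the bottom so that, where patterns
            # overlap, the category lower in the list wins; the for/else
            # constructs below emulate a labeled break out of both loops.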
            for categ in reversed(categoryStrings.keys()):
                for pattern in categoryStrings[ categ ]:
                    if re.match(pattern, d_name.replace("Bkg_","")):
                        catName = categ
                        # Stop searching since category found
                        break
                else:
                    continue
                # Stop searching since category found
                break
            else:
                catName = '*Unknown*'
            # Add on category to name/type
            nameType += ' [' + catName + ']'

            # Now sum up the sizes according to the category
            # Check if we already know this category:
            if catName in categData.keys():
                categData[ catName ].memSize += d.memSize
                categData[ catName ].diskSize += d.diskSize
            else:
                categData[ catName ] = \
                    PF.PoolRecord( catName,
                                   d.memSize,
                                   d.diskSize,
                                   d.memSizeNoZip,
                                   d.nEntries,
                                   d.dirType )
                pass
            pass

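            # Columns: memory size, disk size, disk size per event,
            # compression factor (memory/disk), entry count, and name.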
            print(f"{d.memSize:12.3f} {sizeUnits:3} {d.diskSize:12.3f} {sizeUnits:3}"
                  f" {d.diskSize / max(poolFile.dataHeader.nEntries,1):12.3f} {sizeUnits:3}"
                  f" {d.memSize / d.diskSize:12.3f} {d.nEntries:8d} {nameType:s}")
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 106 )
        print(f"{memSize:12.3f} {sizeUnits:3} {diskSize:12.3f} {sizeUnits:3}"
              f" {diskSize / max(poolFile.dataHeader.nEntries,1):12.3f} {sizeUnits:3}"
              f" {memSize / diskSize:12.3f} {poolFile.dataHeader.nEntries:8d} Total")
        print( "" )

        # Now print out the categorized information
        # Order the records by size:
        categorizedData = list(categData.values())
        sorter = PF.PoolRecord.Sorter.DiskSize
        categorizedData.sort( key = operator.attrgetter( sorter ) )

        print( "=" * 80 )
        print( " Categorized data" )
        print( "=" * 80 )
        print(f'{"Disk Size/Evt":^16} {"Fraction":8} Category Name')
        print( "-" * 80 )
        totDiskSize = 0.0
        frac = 0.0
        ds = []
        dsFrac = []
        dsName = []
        for d in categorizedData:
            dsPerEvt = d.diskSize / max(poolFile.dataHeader.nEntries,1)
            dsPerCatFrac = d.diskSize / diskSize
            totDiskSize += dsPerEvt
            frac += dsPerCatFrac
            ds += [dsPerEvt]
            dsFrac += [dsPerCatFrac]
            dsName += [d.name]
            print(f"{dsPerEvt:12.3f} {sizeUnits:3} {dsPerCatFrac:8.3f} {d.name:s}")
            pass
        print(f"{totDiskSize:12.3f} {sizeUnits:3} {frac:8.3f} Total")
        ds += [totDiskSize]
        dsFrac += [frac]
        dsName += ["Total"]

        print( "" )
        print( "=" * 80 )
        print( "CSV for categories disk size/evt and fraction:" )
        # print out comma-separated lists in descending order
        print (",".join(reversed(dsName)))
        b = ['{:.3f}'.format(i) for i in reversed(ds)]
        print (",".join(b))
        b = ['{:.3f}'.format(i) for i in reversed(dsFrac)]
        print (",".join(b))
        print( "=" * 80 )
        print( "" )


        print( "=" * 80 )
        print( " Meta data" )
        print( "=" * 80 )
        print(f'{"Mem Size":^16} {"Disk Size":^16} Container Name')
        print( "-" * 80 )

        # Now print the info about the metadata:
        memSize = 0.0
        diskSize = 0.0
        for d in orderedData:
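            # Metadata branches are those whose entry count differs from the
            # event count: the complement of the event-data selection above.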
            mtlp = re.search( "_tlp.$", d.name ) or "DataHeader" in d.name
            if d.nEntries == poolFile.dataHeader.nEntries or mtlp: continue
            print(f"{d.memSize:12.3f} {sizeUnits:3} {d.diskSize:12.3f} {sizeUnits:3} {d.name:s}")
            memSize = memSize + d.memSize
            diskSize = diskSize + d.diskSize
            pass
        print( "-" * 80 )
        print(f"{memSize:12.3f} {sizeUnits:3} {diskSize:12.3f} {sizeUnits:3} Total")
        print( "=" * 80 )

        # Write out a CSV file if one was requested:
        if options.csvFileName and ( len( fileNames ) == 1 ):
            # Open the output file:
            import csv
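            # The csv module expects files opened with newline='' so that it
            # can handle line endings itself.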
            args = {'newline' : ''}
            with open( options.csvFileName, "w", **args ) as f:
                writer = csv.writer( f )
                # Set up the formatting of the file:
                writer.writerow( [ "Name (Type)", "Size/Evt" ] )
                # Write all entries to it:
                for d in orderedData:
                    # Skip metadata items:
                    if d.nEntries != poolFile.dataHeader.nEntries: continue
                    # Construct the name of the entry:
                    colTypeName = d.typeName
                    if not colTypeName: continue
                    nameType = "%s (%s)" % \
                               ( d.name, colTypeName )
                    # Write the entry:
                    writer.writerow( [ nameType, d.diskSize / d.nEntries ] )
                    pass
                pass
            pass

        if len(fileNames) > 1:
            print ("")
        pass  # loop over fileNames

    print ("## Bye.")
    sys.exit( 0 )