ATLAS Offline Software
Loading...
Searching...
No Matches
PoolFile.py
Go to the documentation of this file.
1# Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
2
3# @author: Sebastien Binet <binet@cern.ch>
4# @date: March 2007
5#
6#
7
8__author__ = "Sebastien Binet <binet@cern.ch>"
9
10
11__all__ = [
12 'PoolFileCatalog',
13 'PoolOpts',
14 'isRNTuple',
15 'PoolRecord',
16 'PoolFile',
17 'DiffFiles',
18 ]
19
20
21import sys
22import os
23import shelve
24
25from dbm import whichdb
26
27from .Helpers import ShutUp
28
29
class Units(object):
    """Divisors used in this module to turn byte counts into kb / Mb."""
    kb = 1024.0
    Mb = kb * kb
33
34
def isRNTuple(obj):
    """Tell whether `obj` is an instance of ROOT's RNTuple class.

    The class is imported on every call, first as `ROOT.RNTuple` and, when
    that import fails, as `ROOT.Experimental.RNTuple`.
    """
    # MN: remove the "try" after migration to ROOT 6.34
    try:
        from ROOT import RNTuple as rntuple_class
    except ImportError:
        from ROOT.Experimental import RNTuple as rntuple_class
    return isinstance(obj, rntuple_class)
40
41
42class PoolFileCatalog(object):
43 """ reverse-engineering of the POOL FileCatalog.
44 allows to retrieve the physical filename from a logical one, provided
45 that the file-id is known to the (real) PoolFileCatalog
46 """
47 DefaultCatalog = "xmlcatalog_file:PoolFileCatalog.xml"
48 AllowedProtocols = (
49 # see: PoolSvc::createCatalog
50 # http://alxr.usatlas.bnl.gov/lxr/source/atlas/Database/AthenaPOOL/PoolSvc/src/PoolSvc.cxx?v=head#736
51 "xmlcatalog_file:", # POOL default
52 "apcfile:", # ATLAS_POOLCOND_PATH
53 "prfile:", # file via PathResolver
54 "file:", # simple file on local FS
55 )
56
57 def __init__ (self, catalog=None):
58 super (PoolFileCatalog, self).__init__()
59 self.catalog = None
60
61 if catalog is None:
62 # chase poolfilecatalog location
63 catalog = os.environ.get("POOL_CATALOG", self.DefaultCatalog)
64
65 if isinstance(catalog, str):
66 catalog = [catalog]
67
68 if not isinstance (catalog, (str, list)):
69 raise TypeError(
70 "catalog contact string should be a string or a list thereof! (got %r)"%
71 type(catalog))
72
73 osp = os.path
74 def osp_exp(x):
75 return osp.expanduser(osp.expandvars(x))
76
77 def _handle_apcfile_old(x):
78 """ return $ATLAS_POOLCOND_PATH/poolcond/x
79 """
80 if 'ATLAS_POOLCOND_PATH' not in os.environ:
81 return osp_exp(x)
82 pcp = os.environ["ATLAS_POOLCOND_PATH"]
83 if x.startswith("apcfile:"):
84 x = x[len("apcfile:"):]
85 return osp_exp(osp.join(pcp, 'poolcond', x))
86
87 def _handle_apcfile(x):
88 """ return $ATLAS_POOLCOND_PATH/x
89 """
90 if 'ATLAS_POOLCOND_PATH' not in os.environ:
91 return osp_exp(x)
92 pcp = os.environ["ATLAS_POOLCOND_PATH"]
93 if x.startswith("apcfile:"):
94 x = x[len("apcfile:"):]
95 return osp_exp(osp.join(pcp, x))
96
97 def _handle_xmlcatalog_file(x):
98 return osp_exp(x[len("xmlcatalog_file:"):])
99
100 def _handle_prfile(x):
101 x = x[len("prfile:"):]
102 x = osp_exp(x)
103 try:
104 import AthenaCommon.Utils.unixtools as u
105 return u.FindFile(x,
106 os.environ['DATAPATH'].split(os.pathsep),
107 os.R_OK)
108 except ImportError:
109 return x
110
111 def _handle_file(x):
112 x = x[len("file:"):]
113 x = osp_exp(x)
114 return x
115
116 cat_dispatch = {
117 "xmlcatalog_file:": _handle_xmlcatalog_file,
118 "apcfile:": _handle_apcfile,
119 "prfile:": _handle_prfile,
120 "file:": _handle_file,
121 }
122 assert sorted(cat_dispatch.keys()) == sorted(self.AllowedProtocols), \
123 "catalog dispatch keys does not match AllowedProtocols:" \
124 "\n%s\n%s" % (sorted(cat_dispatch.keys()),
125 sorted(self.AllowedProtocols))
126
127 from . import xmldict
128 def _build_catalog(catalog):
129 if not catalog.startswith(self.AllowedProtocols):
130 raise ValueError(
131 "sorry PoolFile:PoolFileCatalog only supports %s"
132 " as a protocol for the POOL file catalog (got: '%s')"
133 % (self.AllowedProtocols, catalog)
134 )
135 for protocol, handler in cat_dispatch.iteritems():
136 if catalog.startswith(protocol):
137 catalog = handler(catalog)
138 break
139 # make sure the catalog exists...
140 import os
141
142 if not os.path.exists (catalog):
143 return {}
144 # raise RuntimeError(
145 # 'could not find any PoolFileCatalog in [%s]' % catalog
146 # )
147
148
149 root = xmldict.ElementTree.parse (catalog).getroot()
150 return dict(xmldict.xml2dict(root))
151
152 errors = []
153 cat = {'POOLFILECATALOG':{'File':[]}}
154 for c in catalog:
155 try:
156 bc = _build_catalog(c)
157 pc = bc.get('POOLFILECATALOG',{})
158 files = []
159 if pc:
160 files = pc.get('File',[])
161 if isinstance(files, dict):
162 files = [files]
163 cat['POOLFILECATALOG']['File'].extend(files)
164 except Exception as err:
165 errors.append(err)
166
167 if errors:
168 raise errors[0] # FIXME : should we customize this a bit ?
169
170 self.catalog = cat
171 pass
172
173 def pfn (self, url_or_fid):
174 """find the physical file name given a url or a file-id"""
175 import os.path as osp
176 url_or_fid = osp.expanduser(osp.expandvars(url_or_fid))
177 import types
178 if isinstance (url_or_fid, types.ListType):
179 return [self._pfn(f) for f in url_or_fid]
180 else:
181 return self._pfn(url_or_fid)
182
183 def _pfn (self, url_or_fid):
184 """find the physical file name given a url or a file-id"""
185 if not ('POOLFILECATALOG' in self.catalog):
186 return None
187 if not ('File' in self.catalog['POOLFILECATALOG']):
188 return None
189
190 PFN_IDX = 0 # take this pfn when alternates exist
191
192 files = self.catalog['POOLFILECATALOG']['File']
193 if isinstance(files, dict):
194 # in case there where only one entry in the catalog
195 files = [files]
196 import re
197 if url_or_fid.lower().startswith('fid:'):
198 url_or_fid = url_or_fid[len('fid:'):]
199 if re.compile (r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$').match (url_or_fid):
200 fid = url_or_fid.lower()
201 # better to check consistency of catalog over all entries
202 # than declare success on first match...
203 match = {}
204 for f in files:
205 if f.ID.lower() == fid:
206 match[fid] = []
207 pfn = f.physical.pfn
208 if isinstance(pfn, (list,tuple)):
209 match[fid].append([i.name for i in pfn])
210 else:
211 match[fid].append([pfn.name])
212 if len(match[fid])==1:
213 return match[fid][0][PFN_IDX]
214 if len(match[fid])>1:
215 raise LookupError (
216 "more than one match for FID='%s'!\n%r"%(fid,match)
217 )
218 raise KeyError ("no entry with FID='%s' in catalog" % fid)
219 else:
220 url = url_or_fid
221 if url.lower().startswith("lfn:"):
222 url = url[len("lfn:"):]
223 # better to check consistency of catalog over all entries
224 # than declare success on first match...
225 match = {}
226 for f in files:
227 if (f.logical != '' # no LFN for this entry
228 and f.logical.lfn.name == url):
229 match[url] = []
230 pfn = f.physical.pfn
231 if isinstance(pfn, (list,tuple)):
232 match[url].append([i.name for i in pfn])
233 else:
234 match[url].append([pfn.name])
235 if len(match[url])==1:
236 return match[url][0][PFN_IDX]
237 if len(match[url])>1:
238 raise LookupError (
239 "more than one match for LFN='%s'!\n%r"%(url,match)
240 )
241 raise KeyError ("no entry with LFN='%s' in catalog" % url)
242 # assume that if not LFN: then PFN:, no matter what...
243 if url.lower().startswith("pfn:"):
244 url = url[len("pfn:"):]
245 return url
246
247 def __call__ (self, url_or_fid):
248 return self.pfn (url_or_fid)
249
250 pass
251
252class PoolOpts(object):
# Holds the storage-element names, collection type codes, mode flags and
# report format strings that the rest of this module reads.
253 # default names of APR file storage elements
254 # copied here from RootUtils/APRDefaults.h for performance (as the first dictionary access takes 7 sec)
255 # see ATEAM-973 for a more detailed discussion
256 # the definitions here should be kept in sync with those!
# NOTE(review): listing lines 257, 262 and 267 are absent from this dump.
# The code below refers to `TTreeNames.*` and `cls.RNTupleNames.*`, so the
# next two groups are presumably the bodies of nested classes with those
# names; what introduces the third group (the collection codes) cannot be
# told from here -- confirm against the real file.
258 EventData = "CollectionTree"
259 EventTag = "POOLCollectionTree"
260 DataHeader = "POOLContainer"
261 MetaData = "MetaData"
263 EventData = "EventData"
264 EventTag = "EventTag"
265 DataHeader = "DataHeader"
266 MetaData = "MetaData"
268 PoolCollection = 256 # Also known as "ImplicitCollection"
269 RootCollection = 512
270 RootTTreeCollection = 514
271 RootRNTupleCollection = 516
272
# FAST_MODE makes _get_total_size() return -1. instead of querying the branch;
# SUPER_DETAILED_BRANCH_SZ makes it sum the object lengths of the baskets.
273 FAST_MODE = False
274 SUPER_DETAILED_BRANCH_SZ = False
# mode string handed to ROOT.TFile.Open
275 READ_MODE = "READ"
276 POOL_HEADER = TTreeNames.DataHeader
277 EVENT_DATA = TTreeNames.EventData
278 META_DATA = TTreeNames.MetaData
# header and row templates of the table printed by PoolFile.checkFile
279 HDR_FORMAT = " %11s %11s %11s %11s %5s %s"
280 ROW_FORMAT = "%12.3f kb %12.3f kb %12.3f kb %12.3f %8i %s"
281
282 @classmethod
# True for a name which neither starts with "##" nor is a data header name.
283 def isData(cls, name):
284 return not name.startswith("##") and not cls.isDataHeader(name)
285
286 @classmethod
# True when name is the TTree data header name, that name followed by
# "_DataHeader", or the RNTuple data header name.
287 def isDataHeader(cls, name):
288 return name in {cls.TTreeNames.DataHeader
289 , cls.TTreeNames.DataHeader+"_DataHeader"
290 , cls.RNTupleNames.DataHeader}
291
292 @classmethod
# True when name starts with EVENT_DATA.
293 def isEventData(cls, name):
294 return name.startswith(PoolOpts.EVENT_DATA)
295
296 @classmethod
# True when name contains "_DAOD_".
297 def isAugmentation(cls, name):
298 return "_DAOD_" in name
299
300 @classmethod
# Returns the third "_"-separated token of name, without a trailing "Form".
# The "__" appended before splitting guarantees the token exists; it is the
# empty string for a name with fewer than two underscores.
301 def augmentationName(cls, name):
302 s = (name+"__").split('_')[2]
303 if s.endswith("Form"):
304 s = s[:-4]
305 return s
306
307 @classmethod
# True when name starts with POOL_HEADER and also contains "_DAOD_".
308 def isAugmentedHeader(cls, name):
309 return name.startswith(PoolOpts.POOL_HEADER) and cls.isAugmentation(name)
310
311 pass # class PoolOpts
312
def _get_total_size(branch):
    """Return the size reported for `branch`, as selected by PoolOpts.

    * PoolOpts.FAST_MODE: nothing is queried and -1. is returned.
    * PoolOpts.SUPER_DETAILED_BRANCH_SZ: the baskets are loaded and the
      object length of each basket below the write basket, minus 8, is
      summed up.
    * otherwise: whatever `branch.GetTotalSize()` returns.
    """
    if PoolOpts.FAST_MODE:
        return -1.
    if PoolOpts.SUPER_DETAILED_BRANCH_SZ:
        branch.LoadBaskets()
        return sum(branch.GetBasket(idx).GetObjlen() - 8
                   for idx in range(branch.GetWriteBasket()))
    return branch.GetTotalSize()
324
def file_name(fname):
    """Split a file designation into the pair (protocol, 'real' file name).

    Environment variables and '~' are expanded first. Local files (no
    scheme, 'file:' or 'pfn:') come back with an empty protocol and a
    normalised path, unless the path lives under /castor/, in which case it
    is reported as 'rfio'. 'lfn:' and 'fid:' names are resolved through a
    PoolFileCatalog. For an unknown scheme a warning is printed and the
    input is handed back.
    """
    from posixpath import normpath
    from urllib.parse import urlsplit

    def _clean(path):
        tidy = normpath(path)
        # normpath leaves a leading '//' alone: reduce it to a single '/'
        return tidy[1:] if tidy.startswith('//') else tidy

    fname = os.path.expanduser(os.path.expandvars(fname))
    # an absolute path is parsed as a 'file:' uri
    uri = 'file:' + fname if fname.startswith('/') else fname
    url = urlsplit(uri)
    protocol = url.scheme

    if protocol in ('', 'file', 'pfn'):
        local = _clean(url.path)
        if local.startswith('/castor/'):
            return ('rfio', 'rfio:' + local)
        return ('', local)

    if protocol in ('rfio', 'castor'):
        return ('rfio', 'rfio:' + _clean(url.path))

    if protocol in ('root', 'dcap', 'dcache', 'http', 'https', 'dav', 'davs'):
        return (protocol, fname)

    if protocol in ('gsidcap',):
        return ('gfal:gsidcap', fname)

    if protocol in ('lfn', 'fid',):
        # percolate through the PoolFileCatalog
        from PyUtils.PoolFile import PoolFileCatalog as pfc
        return (protocol, pfc().pfn(protocol + ':' + url.path))

    if protocol in ('ami',):
        # !! keep order of tokens !
        for prefix in ('ami:', '//', '/'):
            if fname.startswith(prefix):
                fname = fname[len(prefix):]
        return (protocol, 'ami://' + fname)

    print(f'## warning: unknown protocol [{protocol}]. we will just return our input')
    return (protocol, fname)
384
def retrieveBranchInfos( branch, poolRecord, ident = "" ):
    """Add the sizes of all branches found below `branch` to `poolRecord`.

    The sub-branches are walked recursively; for each of them the memory
    size, the disk (zipped) size and, when its zipped size is below 0.001,
    the non-zipped memory size are accumulated, all divided by Units.kb.
    `ident` only grows with the recursion depth. The updated record is
    returned.
    """
    for sub in branch.GetListOfBranches():
        poolRecord.memSize += _get_total_size(sub) / Units.kb
        if sub.GetZipBytes() < 0.001:
            poolRecord.memSizeNoZip += _get_total_size(sub) / Units.kb
        poolRecord.diskSize += sub.GetZipBytes() / Units.kb
        poolRecord = retrieveBranchInfos(sub, poolRecord, ident + " ")

    return poolRecord
405
def make_pool_record (branch, dirType):
    """Build the PoolRecord describing `branch`.

    Sizes are divided by Units.kb. The non-zipped memory size equals the
    memory size when the branch reports less than 0.001 zipped bytes and is
    0. otherwise. The type name is the branch class name or, when that is
    empty, the type name of the first leaf (if there is one).
    """
    mem_kb = _get_total_size(branch) / Units.kb
    if branch.GetZipBytes() < 0.001:
        mem_nozip_kb = mem_kb
    else:
        mem_nozip_kb = 0.
    disk_kb = branch.GetZipBytes() / Units.kb

    type_name = branch.GetClassName()
    if not type_name:
        first_leaf = branch.GetListOfLeaves().At(0)
        if first_leaf:
            type_name = first_leaf.GetTypeName()

    return PoolRecord(branch.GetName(), mem_kb, disk_kb, mem_nozip_kb,
                      branch.GetEntries(),
                      dirType=dirType,
                      typeName=type_name)
418
class PoolRecord(object):
    """Sizes and bookkeeping data gathered for one container of a POOL file.
    """
    class Sorter:
        """Names of the PoolRecord attributes a report can be sorted by."""
        DiskSize      = "diskSize"
        MemSize       = "memSize"
        ContainerName = "name"

        @staticmethod
        def allowedValues():
            """Return the list of valid sorting keys."""
            return [ PoolRecord.Sorter.DiskSize,
                     PoolRecord.Sorter.MemSize,
                     PoolRecord.Sorter.ContainerName ]

    def __init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType,
                 detailedInfos = "", typeName = None):
        """Initialize PoolRecord instance.

        dirType    first letter of object type name that may distinguish the types:
                   "T" for TTree, "B" for TBranch,
                   "N" for RNTuple, "F" for RField

        `detailedInfos` is stored as the `details` attribute; `augName`
        always starts out as the empty string.
        """
        object.__init__(self)
        self.name          = name
        self.memSize       = memSize
        self.diskSize      = diskSize
        self.memSizeNoZip  = memSizeNoZip
        self.nEntries      = nEntries
        self.dirType       = dirType
        self.details       = detailedInfos
        self.augName       = ''
        self.typeName      = typeName
        return
452
class PoolFile(object):
    """
    A simple class to retrieve informations about the content of a POOL file.
    It should be abstracted from the underlying technology used to create this
    POOL file (Db, ROOT,...).
    Right now, we are using the easy and loosy solution: going straight to the
    ROOT 'API'.
    """

    def __init__(self, fileName, verbose=True):
        """Open `fileName` and gather the size of its containers.

        `fileName` is first passed through file_name(). If the result is
        recognised by dbm.whichdb it is read back as a report written by
        saveReport(); otherwise it is opened with ROOT and scanned.
        Progress messages are printed only when `verbose` is True.
        """
        object.__init__(self)

        self._fileInfos = None
        self.keys       = None
        self.dataHeader = PoolRecord("DataHeader", 0, 0, 0,
                                     nEntries = 0,
                                     dirType = "T")
        self.augNames   = set()
        self.dataHeaderA = {}
        self.data       = []
        self.verbose = verbose
        self.poolFile = None

        # get the "final" file name (handles all kind of protocols)
        try:
            _, fileName = file_name(fileName)
        except Exception as err:
            print("## warning: problem opening PoolFileCatalog:\n%s"%err)
            import traceback
            # print_exc() takes a 'limit' as first argument, not the exception
            traceback.print_exc()

        dbFileName = whichdb( fileName )
        if dbFileName not in ( None, '' ):
            if self.verbose is True:
                print("## opening file [%s]..." % str(fileName))
            db = shelve.open( fileName, 'r' )
            try:
                if self.verbose is True:
                    print("## opening file [OK]")
                report = db['report']
            finally:
                db.close()
            self._fileInfos = report['fileInfos']
            self.dataHeader = report['dataHeader']
            self.data       = report['data']
        else:
            if self.verbose is True:
                print("## opening file [%s]..." % str(fileName))
            self.__openPoolFile( fileName )
            if self.verbose is True:
                print("## opening file [OK]")
            self.__processFile()

        return

    def __openPoolFile(self, fileName):
        """Open `fileName` with ROOT and record its name and size.

        Raises IOError when the file cannot be opened.
        """
        # hack to prevent ROOT from loading graphic libraries and hence bother
        # our fellow Mac users
        if self.verbose is True:
            print("## importing ROOT...")
        import PyUtils.RootUtils as ru
        ROOT = ru.import_root()
        self.ROOT = ROOT
        if self.verbose is True:
            print("## importing ROOT... [DONE]")
        # prevent ROOT from being too verbose
        rootMsg = ShutUp()
        rootMsg.mute()
        ROOT.gErrorIgnoreLevel = ROOT.kFatal

        poolFile = None
        try:
            poolFile = ROOT.TFile.Open( fileName, PoolOpts.READ_MODE )
        except Exception as e:
            rootMsg.unMute()
            print("## Failed to open file [%s] !!" % fileName)
            print("## Reason:")
            print(e)
            print("## Bailing out...")
            raise IOError("Could not open file [%s]" % fileName) from e

        rootMsg.unMute()

        # a failed TFile.Open hands back a null object rather than None
        if not poolFile:
            print("## Failed to open file [%s] !!" % fileName)
            msg = "Could not open file [%s]" % fileName
            raise IOError(msg)

        self.poolFile = poolFile
        assert self.poolFile.IsOpen() and not self.poolFile.IsZombie(), \
            "Invalid POOL file or a Zombie one"
        self._fileInfos = {
            'name' : self.poolFile.GetName(),
            'size' : self.poolFile.GetSize(),
            }
        return

    def __processFile(self):
        """Scan the keys of the opened file and fill the PoolRecords.

        Fills `keys`, `dataHeader`, `dataHeaderA`, `augNames` and `data`.
        Raises NotImplementedError for a key which is neither a TTree nor an
        RNTuple.
        """
        for name in {PoolOpts.TTreeNames.DataHeader, PoolOpts.RNTupleNames.DataHeader}:
            dhKey = self.poolFile.FindKey( name )
            if dhKey:
                obj = self.poolFile.Get( name )
                if isinstance(obj, self.ROOT.TTree):
                    nEntries = obj.GetEntries()
                elif isRNTuple(obj):
                    try:
                        nEntries = self.ROOT.Experimental.RNTupleReader.Open(obj).GetNEntries()
                    except AttributeError:
                        # ROOT 6.36 and later
                        nEntries = self.ROOT.RNTupleReader.Open(obj).GetNEntries()
                else:
                    raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
                break
        else:
            nEntries = 0

        # keep one key per container name
        keys = []
        containers = []
        for k in self.poolFile.GetListOfKeys():
            keyname = k.GetName()
            obj = self.poolFile.Get( keyname )
            if isinstance(obj, self.ROOT.TTree):
                containerName = obj.GetName()
                nEntries = obj.GetEntries()
                dirType = "T"
            elif isRNTuple(obj):
                try:
                    reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
                except AttributeError:
                    # ROOT 6.36 and later
                    reader = self.ROOT.RNTupleReader.Open(obj)
                containerName = reader.GetDescriptor().GetName()
                nEntries = reader.GetNEntries()
                dirType = "N"
            else:
                raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
            if containerName not in containers:
                keys.append(k)
                containers.append(containerName)
            if keyname.startswith(PoolOpts.POOL_HEADER) and not keyname.endswith('Form'):
                self.dataHeaderA[PoolOpts.augmentationName(keyname)] = \
                    PoolRecord("DataHeader", 0, 0, 0,
                               nEntries = nEntries,
                               dirType = dirType)

        keys.sort (key = lambda x: x.GetName())
        self.keys = keys
        del containers

        for k in keys:
            obj = self.poolFile.Get( k.GetName() )
            if isinstance(obj, self.ROOT.TTree):
                name = obj.GetName()
            elif isRNTuple(obj):
                try:
                    inspector = self.ROOT.Experimental.RNTupleInspector.Create(obj)
                except AttributeError:
                    inspector = self.ROOT.RNTupleInspector.Create(obj)
                name = inspector.GetDescriptor().GetName()

            if PoolOpts.isDataHeader(name):
                contName     = "DataHeader"
                if isinstance(obj, self.ROOT.TTree):
                    memSize      = obj.GetTotBytes() / Units.kb
                    diskSize     = obj.GetZipBytes() / Units.kb
                    memSizeNoZip = 0.0
                    if diskSize < 0.001:
                        memSizeNoZip = memSize
                    nEntries     = obj.GetEntries()

                    dhBranchNames = [
                        br.GetName() for br in obj.GetListOfBranches()
                        if br.GetName().count("DataHeader_p") > 0
                    ]
                    if len(dhBranchNames) == 1:
                        dhBranch = obj.GetBranch(dhBranchNames[0])
                        typeName = dhBranch.GetClassName()
                        if not typeName and (leaf := dhBranch.GetListOfLeaves().At(0)):
                            typeName = leaf.GetTypeName()
                        poolRecord = retrieveBranchInfos(
                            dhBranch,
                            PoolRecord( contName, 0., 0., 0.,
                                        nEntries,
                                        dirType = "T",
                                        typeName = typeName ),
                            ident = " "
                            )
                    else:
                        poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
                                                nEntries,
                                                dirType = "T")

                    self.dataHeader = poolRecord
                elif isRNTuple(obj):
                    diskSize = inspector.GetCompressedSize() / Units.kb
                    memSize = inspector.GetUncompressedSize() / Units.kb

                    memSizeNoZip = 0.0
                    if diskSize < 0.001:
                        memSizeNoZip = memSize
                    nEntries = inspector.GetDescriptor().GetNEntries()
                    poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
                                            nEntries,
                                            dirType = "N")
                    self.dataHeader = poolRecord
            elif PoolOpts.isData(name):
                if isinstance(obj, self.ROOT.TTree):
                    if not hasattr(obj, 'GetListOfBranches'):
                        continue
                    branches = obj.GetListOfBranches()
                    dirType = "T"
                    if name in (PoolOpts.EVENT_DATA, PoolOpts.META_DATA):
                        dirType = "B"
                    for branch in branches:
                        poolRecord = retrieveBranchInfos(
                            branch,
                            make_pool_record(branch, dirType),
                            ident = " "
                            )
                        poolRecord.augName = PoolOpts.augmentationName(name)
                        self.augNames.add(poolRecord.augName)
                        self.data += [ poolRecord ]
                elif isRNTuple(obj):
                    descriptor = inspector.GetDescriptor()
                    dirType = "N"
                    if name in {PoolOpts.RNTupleNames.EventData, PoolOpts.RNTupleNames.MetaData}:
                        dirType = "F"
                    fieldZeroId = descriptor.GetFieldZeroId()
                    for fieldDescriptor in descriptor.GetFieldIterable(fieldZeroId):
                        fieldId = fieldDescriptor.GetId()
                        fieldTreeInspector = inspector.GetFieldTreeInspector(fieldId)
                        diskSize = fieldTreeInspector.GetCompressedSize() / Units.kb
                        memSize = fieldTreeInspector.GetUncompressedSize() / Units.kb
                        typeName = fieldDescriptor.GetTypeName()
                        fieldName = fieldDescriptor.GetFieldName()
                        poolRecord = PoolRecord(fieldName, memSize, diskSize, memSize,
                                                descriptor.GetNEntries(),
                                                dirType=dirType,
                                                typeName=typeName)
                        poolRecord.augName = PoolOpts.augmentationName(name)
                        self.augNames.add(poolRecord.augName)
                        self.data += [ poolRecord ]
            # loop over keys

        return

    def fileInfos(self):
        """Return a three-line text: file name, file size and number of events."""
        return os.linesep.join( [
            "File:" + self._fileInfos['name'],
            "Size: %12.3f kb" % (self._fileInfos['size'] / Units.kb),
            "Nbr Events: %i" % self.dataHeader.nEntries
            ] )


    def checkFile(self, sorting = PoolRecord.Sorter.DiskSize):
        """Print the size table of the file (only when `verbose` is True).

        `sorting` is one of PoolRecord.Sorter.allowedValues(); when it is,
        `self.data` is sorted in place by that attribute. Any other value
        leaves the order untouched.
        """
        if self.verbose is True:
            print(self.fileInfos())
            if len(self.augNames) > 1:
                for aug in self.augNames:
                    if len(aug) > 0:
                        print( "Nbr %s Events: %i" % (aug, self.dataHeaderA[aug].nEntries) )

        data = self.data
        if sorting in PoolRecord.Sorter.allowedValues():
            import operator
            data.sort(key = operator.attrgetter(sorting) )

        def _get_val(x, dflt=-999.):
            # memory sizes are not measured in FAST_MODE
            if PoolOpts.FAST_MODE:
                return dflt
            return x

        totMemSize  = _get_val(self.dataHeader.memSize, dflt=0.)
        totDiskSize = self.dataHeader.diskSize

        def _safe_div(num,den):
            if float(den) == 0.:
                return 0.
            return num/den

        if self.verbose is True:
            print("")
            print("="*80)
            print(PoolOpts.HDR_FORMAT % ( "Mem Size", "Disk Size","Size/Evt",
                                          "MissZip/Mem","items",
                                          "(X) Container Name (X=Tree|Branch)" ))
            print("="*80)

            print(PoolOpts.ROW_FORMAT % (
                _get_val (self.dataHeader.memSize),
                self.dataHeader.diskSize,
                _safe_div(self.dataHeader.diskSize,float(self.dataHeader.nEntries)),
                _get_val (_safe_div(self.dataHeader.memSizeNoZip,
                                    self.dataHeader.memSize)),
                self.dataHeader.nEntries,
                "("+self.dataHeader.dirType+") "+self.dataHeader.name
                ))
            print("-"*80)

        totMemSizeA = {}
        totDiskSizeA = {}
        for d in data:
            totMemSize  += 0. if PoolOpts.FAST_MODE else d.memSize
            totDiskSize += d.diskSize
            memSizeNoZip = d.memSizeNoZip/d.memSize if d.memSize != 0. else 0.
            aug = d.augName
            totMemSizeA[aug] = totMemSizeA.get(aug,0.) + d.memSize
            totDiskSizeA[aug] = totDiskSizeA.get(aug,0.) + d.diskSize
            if self.verbose is True:
                print(PoolOpts.ROW_FORMAT % (
                    _get_val (d.memSize),
                    d.diskSize,
                    _safe_div(d.diskSize, float(self.dataHeader.nEntries)),
                    _get_val (memSizeNoZip),
                    d.nEntries,
                    "("+d.dirType+") "+d.name
                    ))

        if self.verbose is True:
            print("="*80)
            if len(self.augNames) > 1:
                augs = sorted(self.augNames)
                for a in augs:
                    print(PoolOpts.ROW_FORMAT % (
                        totMemSizeA[a], totDiskSizeA[a],
                        _safe_div(totDiskSizeA[a], float(self.dataHeaderA[a].nEntries)),
                        0.0,
                        self.dataHeaderA[a].nEntries,
                        "Aug Stream: " + ('MAIN' if a=='' else a)
                        ))
                print("-"*80)
            print(PoolOpts.ROW_FORMAT % (
                totMemSize, totDiskSize,
                _safe_div(totDiskSize, float(self.dataHeader.nEntries)),
                0.0, self.dataHeader.nEntries,
                "TOTAL (POOL containers)"
                ))
            print("="*80)
            if PoolOpts.FAST_MODE:
                # both halves belong on one line (the trailing comma of the
                # Python 2 print statement no longer suppresses the newline)
                print("::: warning: FAST_MODE was enabled: some columns' content ", end="")
                print("is meaningless...")
        return

    def detailedDump(self, bufferName = None ):
        """Have every data(-header) object of the file Print() itself.

        The output goes to the file `bufferName`, or to /dev/stdout when it
        is None. Not available when the PoolFile was read from a report.
        """
        if self.poolFile is None or \
           self.keys     is None:
            print("Can't perform a detailedDump with a shelve file as input !")
            return

        if bufferName is None:
            bufferName = "/dev/stdout"
        out = open( bufferName, "w" )
        sys.stdout.flush()
        # route the stdout file descriptor to 'out' so that what ROOT prints
        # lands in the same file
        save_stdout_fileno = os.dup (sys.stdout.fileno())
        os.dup2( out.fileno(),    sys.stdout.fileno() )

        out.write( "#" * 80 + os.linesep )
        out.write( "## detailed dump" + os.linesep )
        out.flush()

        for key in self.keys:
            tree = key.ReadObj()
            name = tree.GetName()

            if PoolOpts.isDataHeader(name) or \
               PoolOpts.isData(name):
                try:
                    print ("=== [%s] ===" % name, file=sys.stderr)
                    tree.Print()
                except Exception as err:
                    print ("Caught:",err, file=sys.stderr)
                    print (sys.exc_info()[0], file=sys.stderr)
                    print (sys.exc_info()[1], file=sys.stderr)
        out.write( "#" * 80 + os.linesep )
        out.flush()
        out.write( "#" * 80 + os.linesep )

        out.flush()
        if bufferName != "<stdout>":
            out.close()
            sys.stdout.close()
            sys.stdout = open (save_stdout_fileno, 'a')
        return

    def poolRecord(self, name):
        """
        Return a PoolRecord according to its (branch) name
        Raise KeyError if no match is found
        """
        for data in self.data:
            if data.name == name:
                return data
        raise KeyError("No PoolRecord with name [%s]" % name)

    def saveReport (self, fileName):
        """
        Save all the gathered informations into a python shelve or a CSV file
        (depending on the @param `fileName` extension)
        """
        if os.path.splitext(fileName)[-1] == '.csv':
            return self._save_csv_report (fileName)
        return self._save_shelve_report (fileName)

    def _save_shelve_report(self, fileName):
        """
        Save all the gathered informations into a python shelve
        Data can then be read like so:
         >>> import shelve
         >>> db = shelve.open( 'myfile.dat', 'r' )
         >>> report = db['report']
         >>> print ('fileSize:',report['fileInfos']['size'])
         >>> print ('dataHeader/memSize:',report['dataHeader'].memSize)
         >>> for d in report['data']:
         ...   print ('data:',d.name,d.nEntries,d.memSize)
        """
        if os.path.exists (fileName):
            os.unlink (fileName)
        # the context manager closes the shelve even if the write fails
        with shelve.open (fileName) as db:
            db['report'] = {
                'fileInfos'     : self._fileInfos,
                'nbrEvts'       : self.dataHeader.nEntries,
                'dataHeader'    : self.dataHeader,
                'data'          : self.data
                }
        return

    def _save_csv_report(self, fileName):
        """
        Save all the gathered informations into a CSV file
        """
        import csv
        if os.path.exists (fileName):
            os.unlink (fileName)
        with open (fileName, 'w', newline='') as f:
            o = csv.writer (f)
            o.writerow (['file name', self._fileInfos['name']])
            o.writerow (['file size', self._fileInfos['size']])
            o.writerow (['nbr evts',  self.dataHeader.nEntries])
            o.writerow (['mem size', 'disk size', 'mem size nozip', 'items',
                         'container name', 'branch type'])

            for d in self.data:
                o.writerow ([d.memSize, d.diskSize, d.memSizeNoZip,
                             d.nEntries, d.name, d.dirType])
        return

    def __del__(self):
        # poolFile may be missing if __init__ did not get far enough
        poolFile = getattr(self, 'poolFile', None)
        if poolFile and hasattr(poolFile, 'Close'):
            try:
                poolFile.Close()
                self.poolFile = None
            except Exception as err:
                print("WARNING:",err)
922
class DiffFiles(object):
    """
    A helper class to compare 2 POOL files and check that they match, both in
    terms of containers' content and containers' sizes
    """

    def __init__(self, refFileName, chkFileName, verbose = False, ignoreList = None, strict = False):
        """Open both files and compare them right away.

        refFileName -- the reference file
        chkFileName -- the file to be checked against the reference
        verbose     -- also list the containers whose sizes agree
        ignoreList  -- container names left out of the size comparison
        strict      -- also require identical disk sizes (by default only
                       the memory sizes have to agree)

        Raises Exception when either file cannot be opened.
        """
        object.__init__(self)

        self.verbose = verbose
        self.strict = strict
        refFileName = os.path.expandvars( os.path.expanduser( refFileName ) )
        chkFileName = os.path.expandvars( os.path.expanduser( chkFileName ) )

        if ignoreList is None:
            ignoreList = []

        try:
            self.refFile = PoolFile( refFileName )
            self.chkFile = PoolFile( chkFileName )
            self.ignList = sorted( ignoreList )
        except Exception as err:
            print("## Caught exception [%s] !!" % str(err.__class__))
            print("## What:",err)
            print(sys.exc_info()[0])
            print(sys.exc_info()[1])
            msg = "Error while opening POOL files !"
            msg += " chk : %s%s" % ( chkFileName, os.linesep )
            msg += " ref : %s%s" % ( refFileName, os.linesep )
            # keep the original failure attached as the cause
            raise Exception(msg) from err

        self.allGood = True
        self.summary = []

        self.__checkDiff()
        return

    def __checkDiff(self):
        """Compare the two files, fill `summary` and return `allGood`."""
        self.summary += [
            "=" * 80,
            "::: Comparing POOL files...",
            " ref : %s" % self.refFile._fileInfos['name'],
            " chk : %s" % self.chkFile._fileInfos['name'],
            "-" * 80,
            ]

        if self.chkFile.dataHeader.nEntries != \
           self.refFile.dataHeader.nEntries :
            self.summary += [
                "## WARNING: files don't have the same number of entries !!",
                " ref : %r" % self.refFile.dataHeader.nEntries,
                " chk : %r" % self.chkFile.dataHeader.nEntries,
                ]

        refNames = sorted( [d.name for d in self.refFile.data] )
        chkNames = sorted( [d.name for d in self.chkFile.data] )

        if chkNames != refNames:
            self.summary += [
                "## ERROR: files don't have the same content !!",
                ]
            addNames = [ n for n in chkNames if n not in refNames ]
            if len( addNames ) > 0:
                self.summary += [ "## collections in 'chk' and not in 'ref'" ]
                for n in addNames:
                    self.summary += [ " + %s" % n ]
            subNames = [ n for n in refNames if n not in chkNames ]
            if len( subNames ) > 0:
                self.summary += [ "## collections in 'ref' and not in 'chk'" ]
                for n in subNames:
                    self.summary += [ " - %s" % n ]
            self.allGood = False

        if len(self.ignList) > 0:
            self.summary += [ "## Ignoring the following:" ]
            for n in self.ignList:
                self.summary += [ " %s" % n ]

        commonContent = [ d for d in chkNames if (d in refNames and d not in self.ignList)]

        if not self.allGood:
            self.summary += [ "=" * 80 ]
        self.summary += [ "::: comparing common content (mem-size / disk-size)..." ]

        for name in commonContent:
            # one lookup per file and name
            chkRecord = self.chkFile.poolRecord(name)
            refRecord = self.refFile.poolRecord(name)
            chkMemSize = chkRecord.memSize
            refMemSize = refRecord.memSize
            chkDiskSize = chkRecord.diskSize
            refDiskSize = refRecord.diskSize

            if chkMemSize != refMemSize or (self.strict and chkDiskSize != refDiskSize):
                self.summary += [
                    "[ERR] %12.3f / %12.3f kb (ref) ==> %12.3f / %12.3f kb (chk) | %s" % \
                    ( refMemSize,refDiskSize,chkMemSize,chkDiskSize, name )
                    ]
                self.allGood = False
            elif self.verbose:
                self.summary += [
                    " [OK] %12.3f/%12.3f kb | %s" % \
                    ( chkMemSize, chkDiskSize, name )
                    ]

        self.summary += [ "=" * 80 ]

        if self.allGood: self.summary += [ "## Comparison : [OK]"  ]
        else:            self.summary += [ "## Comparison : [ERR]" ]

        return self.allGood

    def status(self):
        """Return 0 when the files match and 1 otherwise."""
        if self.allGood: return 0
        else:            return 1

    def printSummary(self, out = None):
        """Write the summary lines to `out` (the current sys.stdout when None)."""
        if out is None:
            # resolved at call time so that a redirected stdout is honoured
            out = sys.stdout
        for i in self.summary:
            out.write( i + os.linesep )
        return
1044
class Counter(object):
    """
    Holds a name together with an item list (pairs class-name/sg-key); the
    size is a class-level attribute which starts at 0.
    """
    size = 0

    def __init__(self, name, itemList):
        super().__init__()
        self.itemList = itemList
        self.name = name
1055
1056
void print(char *figname, TCanvas *c1)
__init__(self, name, itemList)
Definition PoolFile.py:1050
printSummary(self, out=sys.stdout)
Definition PoolFile.py:1039
bool allGood
final decision
Definition PoolFile.py:954
__init__(self, refFileName, chkFileName, verbose=False, ignoreList=None, strict=False)
Definition PoolFile.py:929
__call__(self, url_or_fid)
Definition PoolFile.py:247
__init__(self, catalog=None)
Definition PoolFile.py:57
__openPoolFile(self, fileName)
Definition PoolFile.py:506
detailedDump(self, bufferName=None)
Definition PoolFile.py:798
__init__(self, fileName, verbose=True)
Definition PoolFile.py:462
isAugmentation(cls, name)
Definition PoolFile.py:297
augmentationName(cls, name)
Definition PoolFile.py:301
isAugmentedHeader(cls, name)
Definition PoolFile.py:308
__init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType, detailedInfos="", typeName=None)
Definition PoolFile.py:434
— data ---------------------------------------------------------------—
Definition PoolFile.py:30
STL class.
T * Get(TFile &f, const std::string &n, const std::string &dir="", const chainmap_t *chainmap=0, std::vector< std::string > *saved=0)
get a histogram given a path, and an optional initial directory if histogram is not found,...
bool add(const std::string &hname, TKey *tobj)
Definition fastadd.cxx:55
int count(std::string s, const std::string &regx)
count how many occurances of a regx are in a string
Definition hcg.cxx:148
std::vector< std::string > split(const std::string &s, const std::string &t=":")
Definition hcg.cxx:179
make_pool_record(branch, dirType)
Definition PoolFile.py:406
file_name(fname)
Definition PoolFile.py:325
retrieveBranchInfos(branch, poolRecord, ident="")
Definition PoolFile.py:385
_save_shelve_report(self, fileName)
Definition PoolFile.py:865
_get_total_size(branch)
Definition PoolFile.py:313
poolRecord(self, name)
Definition PoolFile.py:845
_save_csv_report(self, fileName)
Definition PoolFile.py:890
saveReport(self, fileName)
Definition PoolFile.py:855
void handler(int sig)
signal handler
Definition rmain.cxx:99