ATLAS Offline Software
Loading...
Searching...
No Matches
PoolFile.py
Go to the documentation of this file.
1# Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
2
3# @author: Sebastien Binet <binet@cern.ch>
4# @date: March 2007
5#
6#
7
8__author__ = "Sebastien Binet <binet@cern.ch>"
9
10
11__all__ = [
12 'PoolFileCatalog',
13 'PoolOpts',
14 'isRNTuple',
15 'PoolRecord',
16 'PoolFile',
17 'DiffFiles',
18 ]
19
20
21import sys
22import os
23import shelve
24
25from dbm import whichdb
26
27from .Helpers import ShutUp
28
29
class Units(object):
    """Conversion factors (in bytes) used to express sizes in kb or Mb."""

    kb = 1024.0
    Mb = kb * kb
33
34
def isRNTuple(obj):
    """Tell whether ``obj`` is an instance of ROOT's ``RNTuple`` class.

    ``RNTuple`` is looked up in ``ROOT`` first and, when that import fails,
    in ``ROOT.Experimental``.
    """
    # MN: remove the "try" after migration to ROOT 6.34
    try:
        from ROOT import RNTuple as rntuple_cls
    except ImportError:
        from ROOT.Experimental import RNTuple as rntuple_cls
    return isinstance(obj, rntuple_cls)
40
41
"""Reverse-engineered reader for the POOL FileCatalog.

Retrieves the physical file name that corresponds to a logical one,
provided that the file-id is known to the (real) PoolFileCatalog.
"""
# contact string used when no catalog is given and $POOL_CATALOG is not set
DefaultCatalog = "xmlcatalog_file:PoolFileCatalog.xml"
# contact-string prefixes a catalog is allowed to start with
AllowedProtocols = (
    # see: PoolSvc::createCatalog
    # http://alxr.usatlas.bnl.gov/lxr/source/atlas/Database/AthenaPOOL/PoolSvc/src/PoolSvc.cxx?v=head#736
    "xmlcatalog_file:", # POOL default
    "apcfile:", # ATLAS_POOLCOND_PATH
    "prfile:", # file via PathResolver
    "file:", # simple file on local FS
    )
56
def __init__ (self, catalog=None):
    """Load and merge the file entries of one or more POOL file catalogs.

    ``catalog`` is a contact string, or a list of contact strings, each
    starting with one of ``AllowedProtocols``.  When it is ``None`` the
    value of ``$POOL_CATALOG`` is used, falling back to ``DefaultCatalog``.
    A catalog whose file does not exist contributes no entries.

    The merged result is stored in ``self.catalog`` as
    ``{'POOLFILECATALOG': {'File': [...]}}``.

    Raises ``TypeError`` when ``catalog`` is neither a string nor a list.
    Every catalog is tried; if any of them failed, the first error met is
    re-raised afterwards.
    """
    super (PoolFileCatalog, self).__init__()
    self.catalog = None

    if catalog is None:
        # chase poolfilecatalog location
        catalog = os.environ.get("POOL_CATALOG", self.DefaultCatalog)

    if isinstance(catalog, str):
        catalog = [catalog]

    if not isinstance (catalog, list):
        raise TypeError(
            "catalog contact string should be a string or a list thereof! (got %r)"%
            type(catalog))

    osp = os.path
    def osp_exp(x):
        return osp.expanduser(osp.expandvars(x))

    def _handle_apcfile(x):
        """ return $ATLAS_POOLCOND_PATH/x
        """
        if 'ATLAS_POOLCOND_PATH' not in os.environ:
            return osp_exp(x)
        pcp = os.environ["ATLAS_POOLCOND_PATH"]
        if x.startswith("apcfile:"):
            x = x[len("apcfile:"):]
        return osp_exp(osp.join(pcp, x))

    def _handle_xmlcatalog_file(x):
        return osp_exp(x[len("xmlcatalog_file:"):])

    def _handle_prfile(x):
        x = x[len("prfile:"):]
        x = osp_exp(x)
        try:
            import AthenaCommon.Utils.unixtools as u
            return u.FindFile(x,
                              os.environ['DATAPATH'].split(os.pathsep),
                              os.R_OK)
        except ImportError:
            # outside of an Athena environment: use the path as given
            return x

    def _handle_file(x):
        x = x[len("file:"):]
        x = osp_exp(x)
        return x

    # one resolver per allowed protocol: contact string -> path on disk
    cat_dispatch = {
        "xmlcatalog_file:": _handle_xmlcatalog_file,
        "apcfile:": _handle_apcfile,
        "prfile:": _handle_prfile,
        "file:": _handle_file,
        }
    assert sorted(cat_dispatch.keys()) == sorted(self.AllowedProtocols), \
        "catalog dispatch keys does not match AllowedProtocols:" \
        "\n%s\n%s" % (sorted(cat_dispatch.keys()),
                      sorted(self.AllowedProtocols))

    from . import xmldict
    def _build_catalog(contact):
        """Parse one catalog; return {} when its file does not exist."""
        if not contact.startswith(self.AllowedProtocols):
            raise ValueError(
                "sorry PoolFile:PoolFileCatalog only supports %s"
                " as a protocol for the POOL file catalog (got: '%s')"
                % (self.AllowedProtocols, contact)
                )
        path = contact
        # dict.iteritems() is Python 2 only; items() is the Python 3 spelling
        for protocol, handler in cat_dispatch.items():
            if contact.startswith(protocol):
                path = handler(contact)
                break
        # make sure the catalog exists...
        if not os.path.exists (path):
            return {}

        root = xmldict.ElementTree.parse (path).getroot()
        return dict(xmldict.xml2dict(root))

    errors = []
    cat = {'POOLFILECATALOG':{'File':[]}}
    for c in catalog:
        try:
            bc = _build_catalog(c)
            pc = bc.get('POOLFILECATALOG',{})
            files = []
            if pc:
                files = pc.get('File',[])
            if isinstance(files, dict):
                # a catalog with a single entry yields a dict, not a list
                files = [files]
            cat['POOLFILECATALOG']['File'].extend(files)
        except Exception as err:
            # keep going: the remaining catalogs are still tried
            errors.append(err)

    if errors:
        raise errors[0] # FIXME : should we customize this a bit ?

    self.catalog = cat
172
def pfn (self, url_or_fid):
    """find the physical file name given a url or a file-id

    A list (or tuple) of urls/file-ids is also accepted, in which case a
    list of physical file names is returned, one per input.  Environment
    variables and ``~`` are expanded in each item before the lookup.
    """
    import os.path as osp

    def _expand(item):
        return osp.expanduser(osp.expandvars(item))

    # check for a sequence *before* expanding: expandvars() only takes strings
    # (types.ListType, used previously, no longer exists in Python 3)
    if isinstance (url_or_fid, (list, tuple)):
        return [self._pfn(_expand(f)) for f in url_or_fid]
    return self._pfn(_expand(url_or_fid))
182
183 def _pfn (self, url_or_fid):
184 """find the physical file name given a url or a file-id"""
185 if not ('POOLFILECATALOG' in self.catalog):
186 return None
187 if not ('File' in self.catalog['POOLFILECATALOG']):
188 return None
189
190 PFN_IDX = 0 # take this pfn when alternates exist
191
192 files = self.catalog['POOLFILECATALOG']['File']
193 if isinstance(files, dict):
194 # in case there where only one entry in the catalog
195 files = [files]
196 import re
197 if url_or_fid.lower().startswith('fid:'):
198 url_or_fid = url_or_fid[len('fid:'):]
199 if re.compile (r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$').match (url_or_fid):
200 fid = url_or_fid.lower()
201 # better to check consistency of catalog over all entries
202 # than declare success on first match...
203 match = {}
204 for f in files:
205 if f.ID.lower() == fid:
206 match[fid] = []
207 pfn = f.physical.pfn
208 if isinstance(pfn, (list,tuple)):
209 match[fid].append([i.name for i in pfn])
210 else:
211 match[fid].append([pfn.name])
212 if len(match[fid])==1:
213 return match[fid][0][PFN_IDX]
214 if len(match[fid])>1:
215 raise LookupError (
216 "more than one match for FID='%s'!\n%r"%(fid,match)
217 )
218 raise KeyError ("no entry with FID='%s' in catalog" % fid)
219 else:
220 url = url_or_fid
221 if url.lower().startswith("lfn:"):
222 url = url[len("lfn:"):]
223 # better to check consistency of catalog over all entries
224 # than declare success on first match...
225 match = {}
226 for f in files:
227 if (f.logical != '' # no LFN for this entry
228 and f.logical.lfn.name == url):
229 match[url] = []
230 pfn = f.physical.pfn
231 if isinstance(pfn, (list,tuple)):
232 match[url].append([i.name for i in pfn])
233 else:
234 match[url].append([pfn.name])
235 if len(match[url])==1:
236 return match[url][0][PFN_IDX]
237 if len(match[url])>1:
238 raise LookupError (
239 "more than one match for LFN='%s'!\n%r"%(url,match)
240 )
241 raise KeyError ("no entry with LFN='%s' in catalog" % url)
242 # assume that if not LFN: then PFN:, no matter what...
243 if url.lower().startswith("pfn:"):
244 url = url[len("pfn:"):]
245 return url
246
def __call__ (self, url_or_fid):
    """Shorthand for ``self.pfn(url_or_fid)``."""
    resolved = self.pfn (url_or_fid)
    return resolved
249
250 pass
251
253 # default names of APR file storage elements
254 # copied here from RootUtils/APRDefaults.h for performance (as the first dictionary access takes 7 sec)
255 # see ATEAM-973 for a more detailed discussion
256 # the definitions here should be kept in sync with those!
258 EventData = "CollectionTree"
259 EventTag = "POOLCollectionTree"
260 DataHeader = "POOLContainer"
261 MetaData = "MetaData"
263 EventData = "EventData"
264 EventTag = "EventTag"
265 DataHeader = "DataHeader"
266 MetaData = "MetaData"
268 PoolCollection = 256 # Also known as "ImplicitCollection"
269 RootCollection = 512
270 RootTTreeCollection = 514
271 RootRNTupleCollection = 516
272
# when True, in-memory sizes are not computed: _get_total_size() returns -1
# and checkFile() prints placeholder values for the affected columns
FAST_MODE = False
# when True, _get_total_size() sums the object length of every basket
# instead of asking the branch for its total size
SUPER_DETAILED_BRANCH_SZ = False
# mode string handed to ROOT when opening a file
READ_MODE = "READ"
# container names used for lookups; these are the TTree-based names
POOL_HEADER = TTreeNames.DataHeader
EVENT_DATA = TTreeNames.EventData
META_DATA = TTreeNames.MetaData
# column layouts of the table printed by PoolFile.checkFile()
HDR_FORMAT = " %11s %11s %11s %11s %5s %s"
ROW_FORMAT = "%12.3f kb %12.3f kb %12.3f kb %12.3f %8i %s"
281
@classmethod
def isData(cls, name):
    """True for names that neither start with '##' nor denote a DataHeader."""
    if name.startswith("##"):
        return False
    return not cls.isDataHeader(name)
285
@classmethod
def isDataHeader(cls, name):
    """True when ``name`` is one of the known DataHeader container names.

    Accepted are the TTree DataHeader name, that name followed by
    "_DataHeader", and the RNTuple DataHeader name.
    """
    ttree_header = cls.TTreeNames.DataHeader
    known_names = {
        ttree_header,
        ttree_header + "_DataHeader",
        cls.RNTupleNames.DataHeader,
    }
    return name in known_names
291
@classmethod
def isEventData(cls, name):
    """True when ``name`` starts with ``PoolOpts.EVENT_DATA``."""
    prefix = PoolOpts.EVENT_DATA
    return name.startswith(prefix)
295
@classmethod
def isAugmentation(cls, name):
    """True when ``name`` contains the "_DAOD_" marker."""
    marker = "_DAOD_"
    return marker in name
299
@classmethod
def augmentationName(cls, name):
    """Return the third '_'-separated field of ``name``, minus a "Form" suffix.

    Two underscores are appended before splitting, so a name with fewer
    than three fields yields an empty string.
    """
    suffix = "Form"
    fields = (name+"__").split('_')
    stream = fields[2]
    return stream[:-len(suffix)] if stream.endswith(suffix) else stream
306
@classmethod
def isAugmentedHeader(cls, name):
    """True for names that start with ``PoolOpts.POOL_HEADER`` and are augmentations."""
    if not name.startswith(PoolOpts.POOL_HEADER):
        return False
    return cls.isAugmentation(name)
310
311 pass # class PoolOpts
312
def _get_total_size (branch):
    """Return the in-memory size of ``branch``.

    Returns -1. when ``PoolOpts.FAST_MODE`` is set.  Otherwise returns
    ``branch.GetTotalSize()``, unless ``PoolOpts.SUPER_DETAILED_BRANCH_SZ``
    is set, in which case the baskets are loaded and the object length of
    each one (minus 8) is summed up.
    """
    if PoolOpts.FAST_MODE:
        return -1.
    if PoolOpts.SUPER_DETAILED_BRANCH_SZ:
        branch.LoadBaskets()
        n_baskets = branch.GetWriteBasket()
        return sum(branch.GetBasket(idx).GetObjlen() - 8
                   for idx in range(0, n_baskets))
    return branch.GetTotalSize()
324
def file_name(fname):
    """Split a file name into the pair (protocol, 'real' file name).

    Environment variables and ``~`` are expanded first.  Plain paths and
    ``file:``/``pfn:`` urls give an empty protocol and a normalised path
    (``/castor/`` paths are turned into ``rfio:`` ones); ``lfn:`` and
    ``fid:`` names are resolved through the PoolFileCatalog.  An unknown
    protocol triggers a warning and the input is handed back.
    """
    from posixpath import normpath
    from urllib.parse import urlsplit

    fname = os.path.expanduser(os.path.expandvars(fname))

    def _clean_path(path):
        path = normpath(path)
        # normpath keeps a leading double slash: reduce it to a single one
        return path[1:] if path.startswith('//') else path

    # absolute paths are parsed as 'file:' uris
    uri = 'file:'+fname if fname.startswith('/') else fname
    url = urlsplit(uri)
    protocol = url.scheme

    if protocol in ('', 'file', 'pfn'):
        protocol = ''
        fname = _clean_path(url.path)
        if fname.startswith('/castor/'):
            protocol = 'rfio'
            fname = protocol + ':' + fname

    elif protocol in ('rfio', 'castor'):
        protocol = 'rfio'
        fname = protocol+':'+_clean_path(url.path)

    elif protocol in ('root','dcap', 'dcache', 'http', 'https', 'dav', 'davs'):
        # handed over untouched
        pass

    elif protocol in ('gsidcap',):
        protocol = 'gfal:gsidcap'

    elif protocol in ('lfn','fid',):
        # percolate through the PoolFileCatalog
        from PyUtils.PoolFile import PoolFileCatalog as pfc
        fname = pfc().pfn(protocol+':'+url.path)

    elif protocol in ('ami',):
        # !! keep order of tokens !
        for token in ('ami:', '//', '/'):
            if fname.startswith(token):
                fname = fname[len(token):]
        fname = 'ami://' + fname

    else:
        print(f'## warning: unknown protocol [{protocol}]. we will just return our input')

    return (protocol, fname)
384
def _setup_ssl(root):
    """Configure ROOT's SSL socket from the ``X509_USER_PROXY`` variable.

    When the variable is unset or empty only a warning is printed.
    """
    proxy = os.environ.get('X509_USER_PROXY', '')
    if not proxy:
        print("## warning: protocol https is requested but no X509_USER_PROXY was found! (opening the file might fail.)")
        return
    # setup proper credentials
    root.TSSLSocket.SetUpSSL(
        proxy,
        "/etc/grid-security/certificates",
        proxy,
        proxy)
    return
398
def _root_open(fname):
    """Open ``fname`` read-only with ROOT and return the file object.

    Raises ``IOError`` (``errno.ENOENT``) when ROOT hands back ``None`` or
    an object that evaluates to false.
    """
    import PyUtils.RootUtils as ru
    root = ru.import_root()
    import re

    # NOTE(review): indentation was lost in this listing; everything up to
    # 'return f' is assumed to run inside the ShutUp context -- confirm.
    with ShutUp(filters=[
        re.compile('TClass::TClass:0: RuntimeWarning: no dictionary for class.*') ]):
        root.gSystem.Load('libRootCollection')
        root_open = root.TFile.Open

        # we need to get back the protocol b/c of the special
        # case of secure-http which needs to open TFiles as TWebFiles...
        protocol, _ = file_name(fname)
        if protocol == 'https':
            _setup_ssl(root)
            root_open = root.TWebFile.Open

        f = root_open(fname, 'READ')
        # a failed open may give an object that is not None but tests false
        if f is None or not f:
            import errno
            raise IOError(errno.ENOENT,
                          'No such file or directory',fname)
        return f
    return
423
def retrieveBranchInfos( branch, poolRecord, ident = "" ):
    """Add the sizes of all sub-branches of ``branch`` to ``poolRecord``.

    Walks the sub-branches recursively, accumulating (in kb) the memory
    size, the disk size and, for sub-branches whose zipped size is below
    0.001, the uncompressed memory size.  Returns the updated record.
    """
    child_ident = ident+" "
    for sub in branch.GetListOfBranches():
        poolRecord.memSize += _get_total_size (sub) / Units.kb
        if sub.GetZipBytes() < 0.001:
            poolRecord.memSizeNoZip += _get_total_size (sub) / Units.kb
        poolRecord.diskSize += sub.GetZipBytes() / Units.kb
        poolRecord = retrieveBranchInfos ( sub, poolRecord, child_ident )

    return poolRecord
444
def make_pool_record (branch, dirType):
    """Build a ``PoolRecord`` describing ``branch``.

    Sizes are expressed in kb.  The type name is the branch class name
    or, when that is empty, the type name of the branch's first leaf.
    """
    mem_kb = _get_total_size (branch) / Units.kb
    # a branch with (almost) no zipped bytes counts fully as uncompressed
    unzipped_kb = mem_kb if branch.GetZipBytes() < 0.001 else 0.
    disk_kb = branch.GetZipBytes() / Units.kb
    type_name = branch.GetClassName()
    if not type_name and (first_leaf := branch.GetListOfLeaves().At(0)):
        type_name = first_leaf.GetTypeName()
    return PoolRecord(branch.GetName(), mem_kb, disk_kb, unzipped_kb,
                      branch.GetEntries(),
                      dirType=dirType,
                      typeName=type_name)
457
459 """
460 """
461 class Sorter:
462 DiskSize = "diskSize"
463 MemSize = "memSize"
464 ContainerName = "name"
465
466 @staticmethod
468 return [ PoolRecord.Sorter.DiskSize,
469 PoolRecord.Sorter.MemSize,
470 PoolRecord.Sorter.ContainerName ]
471 pass
def __init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType,
             detailedInfos = "", typeName = None):
    """Initialize PoolRecord instance.

    dirType first letter of object type name that may distinguish the types:
    "T" for TTree, "B" for TBranch,
    "N" for RNTuple, "F" for RField

    ``detailedInfos`` is stored as ``details``; ``augName`` starts out empty.
    """
    object.__init__(self)
    initial_state = (
        ("name", name),
        ("memSize", memSize),
        ("diskSize", diskSize),
        ("memSizeNoZip", memSizeNoZip),
        ("nEntries", nEntries),
        ("dirType", dirType),
        ("details", detailedInfos),
        ("augName", ''),
        ("typeName", typeName),
    )
    for attribute, value in initial_state:
        setattr(self, attribute, value)
    return
491
"""
A simple class to retrieve information about the content of a POOL file.
It should be abstracted from the underlying technology used to create this
POOL file (Db, ROOT,...).
Right now, we are using the easy but crude solution: going straight to the
ROOT 'API'.
"""
500
def __init__(self, fileName, verbose=True):
    """Gather the content summary of ``fileName``.

    ``fileName`` is first resolved with ``file_name()``.  If the result is
    recognised as a dbm database it is read as a shelve report (as written
    by ``saveReport``); otherwise it is opened with ROOT and scanned.
    ``verbose`` controls the progress messages.
    """
    object.__init__(self)

    self._fileInfos = None
    self.keys = None
    self.dataHeader = PoolRecord("DataHeader", 0, 0, 0,
                                 nEntries = 0,
                                 dirType = "T")
    self.augNames = set()
    self.dataHeaderA = {}
    self.data = []
    self.verbose = verbose
    # assigned before anything can fail so that __del__ always finds it
    self.poolFile = None

    # get the "final" file name (handles all kind of protocols)
    try:
        _protocol, fileName = file_name(fileName)
    except Exception as err:
        print("## warning: problem opening PoolFileCatalog:\n%s"%err)
        import traceback
        # print_exc() takes no exception argument: it reports the one
        # currently being handled
        traceback.print_exc()

    dbFileName = whichdb( fileName )
    if dbFileName not in ( None, '' ):
        if self.verbose is True:
            print("## opening file [%s]..." % str(fileName))
        # NOTE(review): shelve unpickles the stored objects; only open
        # reports that come from a trusted source.
        with shelve.open( fileName, 'r' ) as db:
            if self.verbose is True:
                print("## opening file [OK]")
            report = db['report']
        self._fileInfos = report['fileInfos']
        self.dataHeader = report['dataHeader']
        self.data = report['data']
    else:
        if self.verbose is True:
            print("## opening file [%s]..." % str(fileName))
        self.__openPoolFile( fileName )
        if self.verbose is True:
            print("## opening file [OK]")
        self.__processFile()

    return
544
def __openPoolFile(self, fileName):
    """Open ``fileName`` with ROOT and record its name and size.

    Sets ``self.ROOT``, ``self.poolFile`` and ``self._fileInfos``.
    Raises ``IOError`` when ROOT cannot open the file.
    """
    # hack to prevent ROOT from loading graphic libraries and hence bother
    # our fellow Mac users
    if self.verbose is True:
        print("## importing ROOT...")
    import PyUtils.RootUtils as ru
    ROOT = ru.import_root()
    self.ROOT = ROOT
    if self.verbose is True:
        print("## importing ROOT... [DONE]")
    # prevent ROOT from being too verbose
    rootMsg = ShutUp()
    rootMsg.mute()
    ROOT.gErrorIgnoreLevel = ROOT.kFatal

    poolFile = None
    try:
        poolFile = ROOT.TFile.Open( fileName, PoolOpts.READ_MODE )
    except Exception as e:
        # un-mute first, so that the messages below are visible
        rootMsg.unMute()
        print("## Failed to open file [%s] !!" % fileName)
        print("## Reason:")
        print(e)
        print("## Bailing out...")
        raise IOError("Could not open file [%s]" % fileName) from e

    rootMsg.unMute()

    # same test as in _root_open(): a failed open may yield an object that
    # is not None but evaluates to false
    if poolFile is None or not poolFile:
        print("## Failed to open file [%s] !!" % fileName)
        msg = "Could not open file [%s]" % fileName
        raise IOError(msg)

    self.poolFile = poolFile
    assert self.poolFile.IsOpen() and not self.poolFile.IsZombie(), \
        "Invalid POOL file or a Zombie one"
    self._fileInfos = {
        'name' : self.poolFile.GetName(),
        'size' : self.poolFile.GetSize(),
        }
    return
586
def __processFile(self):
    """Scan the keys of the opened file and fill the per-container records.

    Fills ``self.keys``, ``self.dataHeaderA``, ``self.dataHeader``,
    ``self.data`` and ``self.augNames``.  Only ``TTree`` and ``RNTuple``
    objects are handled; any other key type raises ``NotImplementedError``.
    """

    # look for the DataHeader container under its TTree or RNTuple name
    # NOTE(review): indentation was lost in this listing; the 'else' below
    # is assumed to belong to the 'for' loop -- confirm.
    for name in {PoolOpts.TTreeNames.DataHeader, PoolOpts.RNTupleNames.DataHeader}:
        dhKey = self.poolFile.FindKey( name )
        if dhKey:
            obj = self.poolFile.Get( name )
            if isinstance(obj, self.ROOT.TTree):
                nEntries = obj.GetEntries()
            elif isRNTuple(obj):
                try:
                    nEntries = self.ROOT.Experimental.RNTupleReader.Open(obj).GetNEntries()
                except AttributeError:
                    # ROOT 6.36 and later
                    nEntries = self.ROOT.RNTupleReader.Open(obj).GetNEntries()
            else:
                raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
            break
    else:
        nEntries = 0

    # first pass: keep one key per container name and create a DataHeader
    # record for every key whose name starts with POOL_HEADER (except the
    # '...Form' ones)
    keys = []
    containers = []
    for k in self.poolFile.GetListOfKeys():
        keyname = k.GetName()
        obj = self.poolFile.Get( keyname )
        if isinstance(obj, self.ROOT.TTree):
            containerName = obj.GetName()
            nEntries = obj.GetEntries()
            dirType = "T"
        elif isRNTuple(obj):
            try:
                reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
            except AttributeError:
                # ROOT 6.36 and later
                reader = self.ROOT.RNTupleReader.Open(obj)
            containerName = reader.GetDescriptor().GetName()
            nEntries = reader.GetNEntries()
            dirType = "N"
        else:
            raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
        if containerName not in containers:
            keys.append(k)
            containers.append(containerName)
            pass
        if keyname.startswith(PoolOpts.POOL_HEADER) and not keyname.endswith('Form'):
            self.dataHeaderA[PoolOpts.augmentationName(keyname)] = \
                PoolRecord("DataHeader", 0, 0, 0,
                           nEntries = nEntries,
                           dirType = dirType)

    keys.sort (key = lambda x: x.GetName())
    self.keys = keys
    del containers

    # second pass: build the size records of every retained container
    for k in keys:
        obj = self.poolFile.Get( k.GetName() )
        if isinstance(obj, self.ROOT.TTree):
            name = obj.GetName()
        elif isRNTuple(obj):
            try:
                inspector = self.ROOT.Experimental.RNTupleInspector.Create(obj)
            except AttributeError:
                inspector = self.ROOT.RNTupleInspector.Create(obj)
            name = inspector.GetDescriptor().GetName()

        if PoolOpts.isDataHeader(name):
            contName = "DataHeader"
            if isinstance(obj, self.ROOT.TTree):
                memSize = obj.GetTotBytes() / Units.kb
                diskSize = obj.GetZipBytes() / Units.kb
                memSizeNoZip = 0.0
                if diskSize < 0.001:
                    memSizeNoZip = memSize
                nEntries = obj.GetEntries()

                # if exactly one branch name contains "DataHeader_p", the
                # record is built from that branch's sub-branches instead
                # of the whole-tree totals
                dhBranchNames = [
                    br.GetName() for br in obj.GetListOfBranches()
                    if br.GetName().count("DataHeader_p") > 0
                    ]
                if len(dhBranchNames) == 1:
                    dhBranch = obj.GetBranch(dhBranchNames[0])
                    typeName = dhBranch.GetClassName()
                    if not typeName and (leaf := dhBranch.GetListOfLeaves().At(0)):
                        typeName = leaf.GetTypeName()
                    poolRecord = retrieveBranchInfos(
                        dhBranch,
                        PoolRecord( contName, 0., 0., 0.,
                                    nEntries,
                                    dirType = "T",
                                    typeName = typeName ),
                        ident = " "
                        )
                else:
                    poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
                                            nEntries,
                                            dirType = "T")

                self.dataHeader = poolRecord
            elif isRNTuple(obj):
                diskSize = inspector.GetCompressedSize() / Units.kb
                memSize = inspector.GetUncompressedSize() / Units.kb

                memSizeNoZip = 0.0
                if diskSize < 0.001:
                    memSizeNoZip = memSize
                nEntries = inspector.GetDescriptor().GetNEntries()
                poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
                                        nEntries,
                                        dirType = "N")
                self.dataHeader = poolRecord
        elif PoolOpts.isData(name):
            if isinstance(obj, self.ROOT.TTree):
                if not hasattr(obj, 'GetListOfBranches'):
                    continue
                branches = obj.GetListOfBranches()
                # "B" for the branches of the event-data and metadata trees,
                # "T" for the branches of any other tree
                dirType = "T"
                if name in (PoolOpts.EVENT_DATA, PoolOpts.META_DATA):
                    dirType = "B"
                for branch in branches:
                    poolRecord = retrieveBranchInfos(
                        branch,
                        make_pool_record(branch, dirType),
                        ident = " "
                        )
                    poolRecord.augName = PoolOpts.augmentationName(name)
                    self.augNames.add(poolRecord.augName)
                    self.data += [ poolRecord ]
            elif isRNTuple(obj):
                descriptor = inspector.GetDescriptor()
                # "F" for the fields of the event-data and metadata ntuples,
                # "N" for the fields of any other ntuple
                dirType = "N"
                if name in {PoolOpts.RNTupleNames.EventData, PoolOpts.RNTupleNames.MetaData}:
                    dirType = "F"
                fieldZeroId = descriptor.GetFieldZeroId()
                for fieldDescriptor in descriptor.GetFieldIterable(fieldZeroId):
                    fieldId = fieldDescriptor.GetId()
                    fieldTreeInspector = inspector.GetFieldTreeInspector(fieldId)
                    diskSize = fieldTreeInspector.GetCompressedSize() / Units.kb
                    memSize = fieldTreeInspector.GetUncompressedSize() / Units.kb
                    typeName = fieldDescriptor.GetTypeName()
                    fieldName = fieldDescriptor.GetFieldName()
                    # NOTE(review): memSize is also passed as memSizeNoZip
                    # here -- confirm this is intended.
                    poolRecord = PoolRecord(fieldName, memSize, diskSize, memSize,
                                            descriptor.GetNEntries(),
                                            dirType=dirType,
                                            typeName=typeName)
                    poolRecord.augName = PoolOpts.augmentationName(name)
                    self.augNames.add(poolRecord.augName)
                    self.data += [ poolRecord ]
        # loop over keys

    return
738
def fileInfos(self):
    """Return a three-line summary: file name, size in kb and event count."""
    infos = self._fileInfos
    summary_lines = [
        "File:" + infos['name'],
        "Size: %12.3f kb" % (infos['size'] / Units.kb),
        "Nbr Events: %i" % self.dataHeader.nEntries,
    ]
    return os.linesep.join(summary_lines)
745
746
def checkFile(self, sorting = PoolRecord.Sorter.DiskSize):
    """Print the size table of the file (when verbose) after sorting the records.

    ``sorting`` names the record attribute to sort by; it is applied only
    if it is one of ``PoolRecord.Sorter.allowedValues()``.  ``self.data``
    is sorted in place.
    """
    # NOTE(review): indentation was lost in this listing; all printing is
    # assumed to be guarded by 'self.verbose' -- confirm.
    if self.verbose is True:
        print(self.fileInfos())
        if len(self.augNames) > 1:
            for aug in self.augNames:
                if len(aug) > 0:
                    print( "Nbr %s Events: %i" % (aug, self.dataHeaderA[aug].nEntries) )


    data = self.data
    if sorting in PoolRecord.Sorter.allowedValues():
        import operator
        data.sort(key = operator.attrgetter(sorting) )

    def _get_val(x, dflt=-999.):
        # in FAST_MODE the memory sizes were not computed: show a placeholder
        if PoolOpts.FAST_MODE:
            return dflt
        return x

    totMemSize = _get_val(self.dataHeader.memSize, dflt=0.)
    totDiskSize = self.dataHeader.diskSize

    def _safe_div(num,den):
        # a zero denominator yields 0. instead of raising
        if float(den) == 0.:
            return 0.
        return num/den

    if self.verbose is True:
        print("")
        print("="*80)
        print(PoolOpts.HDR_FORMAT % ( "Mem Size", "Disk Size","Size/Evt",
                                      "MissZip/Mem","items",
                                      "(X) Container Name (X=Tree|Branch)" ))
        print("="*80)

        # the DataHeader row comes first, separated from the data rows
        print(PoolOpts.ROW_FORMAT % (
            _get_val (self.dataHeader.memSize),
            self.dataHeader.diskSize,
            _safe_div(self.dataHeader.diskSize,float(self.dataHeader.nEntries)),
            _get_val (_safe_div(self.dataHeader.memSizeNoZip,
                                self.dataHeader.memSize)),
            self.dataHeader.nEntries,
            "("+self.dataHeader.dirType+") "+self.dataHeader.name
            ))
        print("-"*80)

    # running totals, overall and per augmentation stream
    totMemSizeA = {}
    totDiskSizeA = {}
    for d in data:
        totMemSize += 0. if PoolOpts.FAST_MODE else d.memSize
        totDiskSize += d.diskSize
        memSizeNoZip = d.memSizeNoZip/d.memSize if d.memSize != 0. else 0.
        aug = d.augName
        totMemSizeA[aug] = totMemSizeA.get(aug,0.) + d.memSize
        totDiskSizeA[aug] = totDiskSizeA.get(aug,0.) + d.diskSize
        if self.verbose is True:
            print(PoolOpts.ROW_FORMAT % (
                _get_val (d.memSize),
                d.diskSize,
                _safe_div(d.diskSize, float(self.dataHeader.nEntries)),
                _get_val (memSizeNoZip),
                d.nEntries,
                "("+d.dirType+") "+d.name
                ))

    if self.verbose is True:
        print("="*80)
        if len(self.augNames) > 1:
            # one summary row per augmentation stream ('' is shown as MAIN)
            augs = sorted(self.augNames)
            for a in augs:
                print(PoolOpts.ROW_FORMAT % (
                    totMemSizeA[a], totDiskSizeA[a],
                    _safe_div(totDiskSizeA[a], float(self.dataHeaderA[a].nEntries)),
                    0.0,
                    self.dataHeaderA[a].nEntries,
                    "Aug Stream: " + ('MAIN' if a=='' else a)
                    ))
            print("-"*80)
        print(PoolOpts.ROW_FORMAT % (
            totMemSize, totDiskSize,
            _safe_div(totDiskSize, float(self.dataHeader.nEntries)),
            0.0, self.dataHeader.nEntries,
            "TOTAL (POOL containers)"
            ))
        print("="*80)
        if PoolOpts.FAST_MODE:
            print("::: warning: FAST_MODE was enabled: some columns' content ",)
            print("is meaningless...")
    return
836
def detailedDump(self, bufferName = None ):
    """Call ``Print()`` on every DataHeader/data object, sending stdout to a file.

    ``bufferName`` is the file receiving the output; it defaults to
    "/dev/stdout".  Nothing is dumped when the instance was loaded from a
    shelve report (no ROOT file, no keys).
    """
    if self.poolFile is None or \
       self.keys is None:
        print("Can't perform a detailedDump with a shelve file as input !")
        return

    if bufferName is None:
        bufferName = "/dev/stdout"
    out = open( bufferName, "w" )
    sys.stdout.flush()
    # keep a duplicate of the current stdout descriptor, then make the
    # stdout descriptor refer to 'out'
    save_stdout_fileno = os.dup (sys.stdout.fileno())
    os.dup2( out.fileno(), sys.stdout.fileno() )

    out.write( "#" * 80 + os.linesep )
    out.write( "## detailed dump" + os.linesep )
    out.flush()

    for key in self.keys:
        tree = key.ReadObj()
        name = tree.GetName()

        if PoolOpts.isDataHeader(name) or \
           PoolOpts.isData(name):
            try:
                # progress and errors go to stderr, which is not redirected
                print ("=== [%s] ===" % name, file=sys.stderr)
                tree.Print()
            except Exception as err:
                print ("Caught:",err, file=sys.stderr)
                print (sys.exc_info()[0], file=sys.stderr)
                print (sys.exc_info()[1], file=sys.stderr)
                pass
            pass
        pass
    out.write( "#" * 80 + os.linesep )
    out.flush()
    out.write( "#" * 80 + os.linesep )

    out.flush()
    # NOTE(review): the default buffer name is "/dev/stdout", never
    # "<stdout>", so this branch is taken for the default too; also the
    # original stdout descriptor is not restored with os.dup2() -- confirm
    # both are intended.
    if bufferName != "<stdout>":
        out.close()
        sys.stdout.close()
        sys.stdout = open (save_stdout_fileno, 'a')
    return
883
def poolRecord(self, name):
    """
    Return the first PoolRecord whose (branch) name equals `name`
    Raise KeyError if no match is found
    """
    candidates = (record for record in self.data if record.name == name)
    try:
        return next(candidates)
    except StopIteration:
        raise KeyError("No PoolRecord with name [%s]" % name) from None
893
def saveReport (self, fileName):
    """
    Save all the gathered information into a CSV file when `fileName` ends
    with the '.csv' extension, and into a python shelve otherwise
    """
    import os
    _, extension = os.path.splitext(fileName)
    writer = (self._save_csv_report if extension == '.csv'
              else self._save_shelve_report)
    return writer (fileName)
903
def _save_shelve_report(self, fileName):
    """
    Save all the gathered information into a python shelve
    (an existing file called `fileName` is removed first)
    Data can then be read like so:
    >>> import shelve
    >>> db = shelve.open( 'myfile.dat', 'r' )
    >>> report = db['report']
    >>> print ('fileInfos:',report['fileInfos'])
    >>> print ('dataHeader/memSize:',report['dataHeader'].memSize)
    >>> for d in report['data']:
    ...   print ('data:',d.name,d.nEntries,d.memSize)
    """
    import os
    import shelve
    if os.path.exists (fileName):
        os.unlink (fileName)
    # the context manager closes the shelf even if storing the report fails
    with shelve.open (fileName) as db:
        db['report'] = {
            'fileInfos' : self._fileInfos,
            'nbrEvts' : self.dataHeader.nEntries,
            'dataHeader' : self.dataHeader,
            'data' : self.data
            }
    return
928
def _save_csv_report(self, fileName):
    """
    Save all the gathered information into a CSV file
    (an existing file called `fileName` is removed first)

    The file starts with the file name, file size and number of events,
    followed by a column header and one row per data record.
    """
    import csv
    import os
    if os.path.exists (fileName):
        os.unlink (fileName)
    # newline='' lets the csv module control the line endings; the 'with'
    # block closes the file even if a row cannot be written
    with open (fileName, 'w', newline='') as f:
        o = csv.writer (f)
        o.writerow (['file name', self._fileInfos['name']])
        o.writerow (['file size', self._fileInfos['size']])
        o.writerow (['nbr evts', self.dataHeader.nEntries])
        o.writerow (['mem size', 'disk size', 'mem size nozip', 'items',
                     'container name', 'branch type'])

        for d in self.data:
            o.writerow ([d.memSize, d.diskSize, d.memSizeNoZip,
                         d.nEntries, d.name, d.dirType])
    return
950
def __del__(self):
    """Close the underlying file, if there is one, when the object goes away."""
    # the attribute is missing if __init__ failed before assigning it
    poolFile = getattr(self, 'poolFile', None)
    if poolFile and hasattr(poolFile, 'Close'):
        try:
            poolFile.Close()
            self.poolFile = None
        except Exception as err:
            # a finalizer must not raise: report and carry on
            print("WARNING:",err)
959
960 pass # class PoolFile
961
963 """
964 A helper class to compare 2 POOL files and check that they match, both in
965 terms of containers' content and containers' sizes
966 """
967
def __init__(self, refFileName, chkFileName, verbose = False, ignoreList = None, strict = False):
    """Open both POOL files and compare them right away.

    ``refFileName`` / ``chkFileName`` are the reference file and the file
    to check (environment variables and ``~`` are expanded).
    ``ignoreList`` holds container names left out of the size comparison.
    With ``strict`` set, differing disk sizes count as errors too.
    ``verbose`` adds the matching containers to the summary.

    Raises ``Exception`` when either file cannot be opened; the original
    error is attached as its cause.
    """
    object.__init__(self)

    self.verbose = verbose
    self.strict = strict
    refFileName = os.path.expandvars( os.path.expanduser( refFileName ) )
    chkFileName = os.path.expandvars( os.path.expanduser( chkFileName ) )

    if ignoreList is None:
        ignoreList = []

    try:
        self.refFile = PoolFile( refFileName )
        self.chkFile = PoolFile( chkFileName )
        self.ignList = sorted( ignoreList )
    except Exception as err:
        print("## Caught exception [%s] !!" % str(err.__class__))
        print("## What:",err)
        print(sys.exc_info()[0])
        print(sys.exc_info()[1])
        msg = "Error while opening POOL files !" + os.linesep
        msg += " chk : %s%s" % ( chkFileName, os.linesep )
        msg += " ref : %s%s" % ( refFileName, os.linesep )
        # keep the original error reachable through __cause__
        raise Exception(msg) from err

    self.allGood = True
    self.summary = []

    self.__checkDiff()
    return
998
def __checkDiff(self):
    """Compare the two files, fill ``self.summary`` and return ``self.allGood``.

    The files differ when their container names differ, or when a common,
    non-ignored container has a different memory size (or, in strict mode,
    a different disk size).  A different number of entries is only
    reported as a warning.
    """

    self.summary += [
        "=" * 80,
        "::: Comparing POOL files...",
        " ref : %s" % self.refFile._fileInfos['name'],
        " chk : %s" % self.chkFile._fileInfos['name'],
        "-" * 80,
        ]

    if self.chkFile.dataHeader.nEntries != \
       self.refFile.dataHeader.nEntries :
        self.summary += [
            "## WARNING: files don't have the same number of entries !!",
            " ref : %r" % self.refFile.dataHeader.nEntries,
            " chk : %r" % self.chkFile.dataHeader.nEntries,
            ]

    refNames = sorted( [d.name for d in self.refFile.data] )
    chkNames = sorted( [d.name for d in self.chkFile.data] )
    # sets give constant-time membership tests in the loops below
    refNameSet = set( refNames )
    chkNameSet = set( chkNames )

    if chkNames != refNames:
        self.summary += [
            "## ERROR: files don't have the same content !!",
            ]
        addNames = [ n for n in chkNames if n not in refNameSet ]
        if len( addNames ) > 0:
            self.summary += [ "## collections in 'chk' and not in 'ref'" ]
            for n in addNames:
                self.summary += [ " + %s" % n ]
        subNames = [ n for n in refNames if n not in chkNameSet ]
        if len( subNames ) > 0:
            self.summary += [ "## collections in 'ref' and not in 'chk'" ]
            for n in subNames:
                self.summary += [ " - %s" % n ]
        self.allGood = False

    if len(self.ignList) > 0:
        self.summary += [ "## Ignoring the following:" ]
        for n in self.ignList:
            self.summary += [ " %s" % n ]

    commonContent = [ d for d in chkNames if (d in refNameSet and d not in self.ignList)]

    if not self.allGood:
        self.summary += [ "=" * 80 ]
    self.summary += [ "::: comparing common content (mem-size / disk-size)..." ]

    def _first_by_name(records):
        """Map each name to its first record, like poolRecord() would find it."""
        table = {}
        for rec in records:
            table.setdefault(rec.name, rec)
        return table

    chkRecords = _first_by_name(self.chkFile.data)
    refRecords = _first_by_name(self.refFile.data)

    for name in commonContent:
        chkMemSize = chkRecords[name].memSize
        refMemSize = refRecords[name].memSize
        chkDiskSize = chkRecords[name].diskSize
        refDiskSize = refRecords[name].diskSize

        if chkMemSize != refMemSize or (self.strict and chkDiskSize != refDiskSize):
            self.summary += [
                "[ERR] %12.3f / %12.3f kb (ref) ==> %12.3f / %12.3f kb (chk) | %s" % \
                ( refMemSize,refDiskSize,chkMemSize,chkDiskSize, name )
                ]
            self.allGood = False
        elif self.verbose:
            self.summary += [
                " [OK] %12.3f/%12.3f kb | %s" % \
                ( chkMemSize, chkDiskSize, name )
                ]

    self.summary += [ "=" * 80 ]

    if self.allGood:
        self.summary += [ "## Comparison : [OK]" ]
    else:
        self.summary += [ "## Comparison : [ERR]" ]

    return self.allGood
1073
def status(self):
    """Exit-code style result: 0 when the comparison succeeded, 1 otherwise."""
    return 0 if self.allGood else 1
1077
def printSummary(self, out = sys.stdout):
    """Write every summary line, followed by a line separator, to ``out``."""
    for entry in self.summary:
        terminated = entry + os.linesep
        out.writelines( terminated )
    return
1083
1085 """
1086 A counter just contains an item list (pairs class-name/sg-key) and the size
1087 """
1088 size = 0
def __init__(self, name, itemList):
    """Store the counter's name and its item list."""
    object.__init__(self)
    self.name, self.itemList = name, itemList
1093 pass # Counter
1094
1095
void print(char *figname, TCanvas *c1)
__init__(self, name, itemList)
Definition PoolFile.py:1089
printSummary(self, out=sys.stdout)
Definition PoolFile.py:1078
bool allGood
final decision
Definition PoolFile.py:993
__init__(self, refFileName, chkFileName, verbose=False, ignoreList=None, strict=False)
Definition PoolFile.py:968
__call__(self, url_or_fid)
Definition PoolFile.py:247
__init__(self, catalog=None)
Definition PoolFile.py:57
__openPoolFile(self, fileName)
Definition PoolFile.py:545
detailedDump(self, bufferName=None)
Definition PoolFile.py:837
__init__(self, fileName, verbose=True)
Definition PoolFile.py:501
isAugmentation(cls, name)
Definition PoolFile.py:297
augmentationName(cls, name)
Definition PoolFile.py:301
isAugmentedHeader(cls, name)
Definition PoolFile.py:308
__init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType, detailedInfos="", typeName=None)
Definition PoolFile.py:473
— data ---------------------------------------------------------------—
Definition PoolFile.py:30
STL class.
T * Get(TFile &f, const std::string &n, const std::string &dir="", const chainmap_t *chainmap=0, std::vector< std::string > *saved=0)
get a histogram given a path, and an optional initial directory if histogram is not found,...
bool add(const std::string &hname, TKey *tobj)
Definition fastadd.cxx:55
int count(std::string s, const std::string &regx)
count how many occurances of a regx are in a string
Definition hcg.cxx:146
std::vector< std::string > split(const std::string &s, const std::string &t=":")
Definition hcg.cxx:177
make_pool_record(branch, dirType)
Definition PoolFile.py:445
file_name(fname)
Definition PoolFile.py:325
retrieveBranchInfos(branch, poolRecord, ident="")
Definition PoolFile.py:424
_root_open(fname)
Definition PoolFile.py:399
_save_shelve_report(self, fileName)
Definition PoolFile.py:904
_get_total_size(branch)
Definition PoolFile.py:313
poolRecord(self, name)
Definition PoolFile.py:884
_save_csv_report(self, fileName)
Definition PoolFile.py:929
saveReport(self, fileName)
Definition PoolFile.py:894
void handler(int sig)
signal handler
Definition rmain.cxx:99