ATLAS Offline Software
PoolFile.py
1 # Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
2 
3 # @author: Sebastien Binet <binet@cern.ch>
4 # @date: March 2007
5 #
6 #
7 
8 __author__ = "Sebastien Binet <binet@cern.ch>"
9 
10 
11 __all__ = [
12  'PoolFileCatalog',
13  'PoolOpts',
14  'extract_items',
15  'isRNTuple',
16  'PoolRecord',
17  'PoolFile',
18  'DiffFiles',
19  ]
20 
21 
22 import sys
23 import os
24 import shelve
25 
26 from dbm import whichdb
27 
28 from .Helpers import ShutUp
29 
30 
31 class Units (object):
32  kb = 1024.
33  Mb = 1024.*1024.
34 
35 
36 def isRNTuple(obj):
37  # MN: remove the "try" after migration to ROOT 6.34
38  try: from ROOT import RNTuple
39  except ImportError: from ROOT.Experimental import RNTuple
40  return isinstance( obj, RNTuple )
41 
42 
43 class PoolFileCatalog(object):
44  """ reverse-engineering of the POOL FileCatalog.
45  allows one to retrieve the physical filename from a logical one, provided
46  that the file-id is known to the (real) PoolFileCatalog
47  """
48  DefaultCatalog = "xmlcatalog_file:PoolFileCatalog.xml"
49  AllowedProtocols = (
50  # see: PoolSvc::createCatalog
51  # http://alxr.usatlas.bnl.gov/lxr/source/atlas/Database/AthenaPOOL/PoolSvc/src/PoolSvc.cxx?v=head#736
52  "xmlcatalog_file:", # POOL default
53  "apcfile:", # ATLAS_POOLCOND_PATH
54  "prfile:", # file via PathResolver
55  "file:", # simple file on local FS
56  )
57 
58  def __init__ (self, catalog=None):
59  super (PoolFileCatalog, self).__init__()
60  self.catalog = None
61 
62  if catalog is None:
63  # chase poolfilecatalog location
64  catalog = os.environ.get("POOL_CATALOG", self.DefaultCatalog)
65 
66  if isinstance(catalog, str):
67  catalog = [catalog]
68 
69  if not isinstance (catalog, (str, list)):
70  raise TypeError(
71  "catalog contact string should be a string or a list thereof! (got %r)"%
72  type(catalog))
73 
74  osp = os.path
75  def osp_exp(x):
76  return osp.expanduser(osp.expandvars(x))
77 
78  def _handle_apcfile_old(x):
79  """ return $ATLAS_POOLCOND_PATH/poolcond/x
80  """
81  if 'ATLAS_POOLCOND_PATH' not in os.environ:
82  return osp_exp(x)
83  pcp = os.environ["ATLAS_POOLCOND_PATH"]
84  if x.startswith("apcfile:"):
85  x = x[len("apcfile:"):]
86  return osp_exp(osp.join(pcp, 'poolcond', x))
87 
88  def _handle_apcfile(x):
89  """ return $ATLAS_POOLCOND_PATH/x
90  """
91  if 'ATLAS_POOLCOND_PATH' not in os.environ:
92  return osp_exp(x)
93  pcp = os.environ["ATLAS_POOLCOND_PATH"]
94  if x.startswith("apcfile:"):
95  x = x[len("apcfile:"):]
96  return osp_exp(osp.join(pcp, x))
97 
98  def _handle_xmlcatalog_file(x):
99  return osp_exp(x[len("xmlcatalog_file:"):])
100 
101  def _handle_prfile(x):
102  x = x[len("prfile:"):]
103  x = osp_exp(x)
104  try:
105  import AthenaCommon.Utils.unixtools as u
106  return u.FindFile(x,
107  os.environ['DATAPATH'].split(os.pathsep),
108  os.R_OK)
109  except ImportError:
110  return x
111 
112  def _handle_file(x):
113  x = x[len("file:"):]
114  x = osp_exp(x)
115  return x
116 
117  cat_dispatch = {
118  "xmlcatalog_file:": _handle_xmlcatalog_file,
119  "apcfile:": _handle_apcfile,
120  "prfile:": _handle_prfile,
121  "file:": _handle_file,
122  }
123  assert sorted(cat_dispatch.keys()) == sorted(self.AllowedProtocols), \
124  "catalog dispatch keys do not match AllowedProtocols:" \
125  "\n%s\n%s" % (sorted(cat_dispatch.keys()),
126  sorted(self.AllowedProtocols))
127 
128  from . import xmldict
129  def _build_catalog(catalog):
130  if not catalog.startswith(self.AllowedProtocols):
131  raise ValueError(
132  "sorry PoolFile:PoolFileCatalog only supports %s"
133  " as a protocol for the POOL file catalog (got: '%s')"
134  % (self.AllowedProtocols, catalog)
135  )
136  for protocol, handler in cat_dispatch.items():
137  if catalog.startswith(protocol):
138  catalog = handler(catalog)
139  break
140  # make sure the catalog exists...
141  import os
142 
143  if not os.path.exists (catalog):
144  return {}
145  # raise RuntimeError(
146  # 'could not find any PoolFileCatalog in [%s]' % catalog
147  # )
148 
149 
150  root = xmldict.ElementTree.parse (catalog).getroot()
151  return dict(xmldict.xml2dict(root))
152 
153  errors = []
154  cat = {'POOLFILECATALOG':{'File':[]}}
155  for c in catalog:
156  try:
157  bc = _build_catalog(c)
158  pc = bc.get('POOLFILECATALOG',{})
159  files = []
160  if pc:
161  files = pc.get('File',[])
162  if isinstance(files, dict):
163  files = [files]
164  cat['POOLFILECATALOG']['File'].extend(files)
165  except Exception as err:
166  errors.append(err)
167 
168  if errors:
169  raise errors[0] # FIXME : should we customize this a bit ?
170 
171  self.catalog = cat
172  pass
173 
174  def pfn (self, url_or_fid):
175  """find the physical file name given a url or a file-id"""
176  import os.path as osp
177  url_or_fid = osp.expanduser(osp.expandvars(url_or_fid))
178  # accept either a single URL/FID string or a list of them
179  if isinstance (url_or_fid, list):
180  return [self._pfn(f) for f in url_or_fid]
181  else:
182  return self._pfn(url_or_fid)
183 
184  def _pfn (self, url_or_fid):
185  """find the physical file name given a url or a file-id"""
186  if not ('POOLFILECATALOG' in self.catalog):
187  return None
188  if not ('File' in self.catalog['POOLFILECATALOG']):
189  return None
190 
191  PFN_IDX = 0 # take this pfn when alternates exist
192 
193  files = self.catalog['POOLFILECATALOG']['File']
194  if isinstance(files, dict):
195  # in case there is only one entry in the catalog
196  files = [files]
197  import re
198  if url_or_fid.lower().startswith('fid:'):
199  url_or_fid = url_or_fid[len('fid:'):]
200  if re.compile (r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$').match (url_or_fid):
201  fid = url_or_fid.lower()
202  # better to check consistency of catalog over all entries
203  # than declare success on first match...
204  match = {}
205  for f in files:
206  if f.ID.lower() == fid:
207  match[fid] = []
208  pfn = f.physical.pfn
209  if isinstance(pfn, (list,tuple)):
210  match[fid].append([i.name for i in pfn])
211  else:
212  match[fid].append([pfn.name])
213  if len(match[fid])==1:
214  return match[fid][0][PFN_IDX]
215  if len(match[fid])>1:
216  raise LookupError (
217  "more than one match for FID='%s'!\n%r"%(fid,match)
218  )
219  raise KeyError ("no entry with FID='%s' in catalog" % fid)
220  else:
221  url = url_or_fid
222  if url.lower().startswith("lfn:"):
223  url = url[len("lfn:"):]
224  # better to check consistency of catalog over all entries
225  # than declare success on first match...
226  match = {}
227  for f in files:
228  if (f.logical != '' # no LFN for this entry
229  and f.logical.lfn.name == url):
230  match[url] = []
231  pfn = f.physical.pfn
232  if isinstance(pfn, (list,tuple)):
233  match[url].append([i.name for i in pfn])
234  else:
235  match[url].append([pfn.name])
236  if len(match[url])==1:
237  return match[url][0][PFN_IDX]
238  if len(match[url])>1:
239  raise LookupError (
240  "more than one match for LFN='%s'!\n%r"%(url,match)
241  )
242  raise KeyError ("no entry with LFN='%s' in catalog" % url)
243  # assume that if not LFN: then PFN:, no matter what...
244  if url.lower().startswith("pfn:"):
245  url = url[len("pfn:"):]
246  return url
247 
248  def __call__ (self, url_or_fid):
249  return self.pfn (url_or_fid)
250 
251  pass
252 
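# --- illustrative usage (not part of the original source) --------------------
# A minimal sketch of resolving a physical file name with the PoolFileCatalog
# class above; the catalog location and the LFN/FID values are made-up:
#
#   >>> from PyUtils.PoolFile import PoolFileCatalog
#   >>> cat = PoolFileCatalog("xmlcatalog_file:PoolFileCatalog.xml")
#   >>> cat.pfn("lfn:my.logical.file.name")                    # logical file name
#   >>> cat.pfn("FID:01234567-89ab-cdef-0123-456789abcdef")    # GUID lookup
#   >>> cat("some.other.lfn")                                  # __call__ forwards to pfn()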
253 class PoolOpts(object):
254  # default names of APR file storage elements
255  # copied here from RootUtils/APRDefaults.h for performance (as the first dictionary access takes 7 sec)
256  # see ATEAM-973 for a more detailed discussion
257  # the definitions here should be kept in sync with those!
258  class TTreeNames:
259  EventData = "CollectionTree"
260  EventTag = "POOLCollectionTree"
261  DataHeader = "POOLContainer"
262  MetaData = "MetaData"
263  class RNTupleNames:
264  EventData = "EventData"
265  EventTag = "EventTag"
266  DataHeader = "DataHeader"
267  MetaData = "MetaData"
268 
269  FAST_MODE = False
270  SUPER_DETAILED_BRANCH_SZ = False
271  READ_MODE = "READ"
272  POOL_HEADER = TTreeNames.DataHeader
273  EVENT_DATA = TTreeNames.EventData
274  META_DATA = TTreeNames.MetaData
275  HDR_FORMAT = " %11s %11s %11s %11s %5s %s"
276  ROW_FORMAT = "%12.3f kb %12.3f kb %12.3f kb %12.3f %8i %s"
277 
278  @classmethod
279  def isData(cls, name):
280  return not name.startswith("##") and not cls.isDataHeader(name)
281 
282  @classmethod
283  def isDataHeader(cls, name):
284  return name in {cls.TTreeNames.DataHeader
285  , cls.TTreeNames.DataHeader+"_DataHeader"
286  , cls.RNTupleNames.DataHeader}
287 
288  @classmethod
289  def isEventData(cls, name):
290  return name.startswith(PoolOpts.EVENT_DATA)
291 
292  @classmethod
293  def isAugmentation(cls, name):
294  return "_DAOD_" in name
295 
296  @classmethod
297  def augmentationName(cls, name):
298  s = (name+"__").split('_')[2]
299  if s.endswith("Form"):
300  s = s[:-4]
301  return s
302 
303  @classmethod
304  def isAugmentedHeader(cls, name):
305  return name.startswith(PoolOpts.POOL_HEADER) and cls.isAugmentation(name)
306 
307  pass # class PoolOpts
308 
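# --- illustrative examples (not part of the original source) -----------------
# How the PoolOpts helpers above classify persistent container names; the
# results follow directly from the code, the names themselves are made-up:
#
#   >>> PoolOpts.isDataHeader("POOLContainer")                # True
#   >>> PoolOpts.isData("CollectionTree")                     # True
#   >>> PoolOpts.isAugmentation("POOLContainer_DAOD_PHYS")    # True
#   >>> PoolOpts.augmentationName("POOLContainer_DAOD_PHYS")  # 'PHYS'
#   >>> PoolOpts.augmentationName("CollectionTree")           # '' (not augmented)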
309 def _get_total_size (branch):
310  if PoolOpts.FAST_MODE:
311  return -1.
312  if not PoolOpts.SUPER_DETAILED_BRANCH_SZ:
313  return branch.GetTotalSize()
314  brSize = 0
315  branch.LoadBaskets()
316  for bnum in range(0, branch.GetWriteBasket()):
317  basket = branch.GetBasket(bnum)
318  brSize += basket.GetObjlen() - 8
319  return brSize
320 
321 def file_name(fname):
322  """take a file name, return the pair (protocol, 'real' file name)
323  """
324  fname = os.path.expanduser(os.path.expandvars(fname))
325 
326  def _normalize_uri(uri):
327  if uri.startswith('/'):
328  return 'file:'+uri
329  return uri
330 
331  from urllib.parse import urlsplit
332  url = urlsplit(_normalize_uri(fname))
333  protocol = url.scheme
334  def _normalize(fname):
335  from posixpath import normpath
336  fname = normpath(fname)
337  if fname.startswith('//'): fname = fname[1:]
338  return fname
339 
340  if protocol in ('', 'file', 'pfn'):
341  protocol = ''
342  fname = _normalize(url.path)
343 
344 
345  if fname.startswith('/castor/'):
346  protocol = 'rfio'
347  fname = protocol + ':' + fname
348 
349  elif protocol in ('rfio', 'castor'):
350  protocol = 'rfio'
351  fname = _normalize(url.path)
352  fname = protocol+':'+fname
353 
354  elif protocol in ('root','dcap', 'dcache', 'http', 'https', 'dav', 'davs'):
355  pass
356 
357  elif protocol in ('gsidcap',):
358  protocol = 'gfal:gsidcap'
359  pass
360 
361  elif protocol in ('lfn','fid',):
362  # percolate through the PoolFileCatalog
363  from PyUtils.PoolFile import PoolFileCatalog as pfc
364  fname = pfc().pfn(protocol+':'+url.path)
365  pass
366 
367  elif protocol in ('ami',):
368  # !! keep order of tokens !
369  for token in ('ami:', '//', '/'):
370  if fname.startswith(token):
371  fname = fname[len(token):]
372  fname = 'ami://' + fname
373  pass
374 
375  else:
376  print(f'## warning: unknown protocol [{protocol}]. we will just return our input')
377  pass
378 
379  return (protocol, fname)
380 
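# --- illustrative examples (not part of the original source) -----------------
# file_name() splits a URL-like name into (protocol, 'real' file name); the
# paths below are made-up:
#
#   >>> file_name("/tmp/some.file.root")
#   ('', '/tmp/some.file.root')
#   >>> file_name("root://eosatlas.cern.ch//eos/atlas/some.file.root")
#   ('root', 'root://eosatlas.cern.ch//eos/atlas/some.file.root')
#   >>> file_name("/castor/cern.ch/some.file.root")
#   ('rfio', 'rfio:/castor/cern.ch/some.file.root')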
381 def _setup_ssl(root):
382  x509_proxy = os.environ.get('X509_USER_PROXY', '')
383  if x509_proxy:
384  # setup proper credentials
385  root.TSSLSocket.SetUpSSL(
386  x509_proxy,
387  "/etc/grid-security/certificates",
388  x509_proxy,
389  x509_proxy)
390  else:
391  print("## warning: protocol https is requested but no X509_USER_PROXY was found! (opening the file might fail.)")
392  pass
393  return
394 
395 def _root_open(fname):
396  import PyUtils.RootUtils as ru
397  root = ru.import_root()
398  import re
399 
400  with ShutUp(filters=[
401  re.compile('TClass::TClass:0: RuntimeWarning: no dictionary for class.*') ]):
402  root.gSystem.Load('libRootCollection')
403  root_open = root.TFile.Open
404 
405  # we need to get back the protocol b/c of the special
406  # case of secure-http which needs to open TFiles as TWebFiles...
407  protocol, _ = file_name(fname)
408  if protocol == 'https':
409  _setup_ssl(root)
410  root_open = root.TWebFile.Open
411 
412  f = root_open(fname, 'READ')
413  if f is None or not f:
414  import errno
415  raise IOError(errno.ENOENT,
416  'No such file or directory',fname)
417  return f
418  return
419 
420 def retrieveBranchInfos( branch, poolRecord, ident = "" ):
421  fmt = "%s %3i %8.3f %8.3f %8.3f %s"
422  if 0:
423  out = fmt % ( ident,
424  branch.GetListOfBranches().GetSize(),
425  _get_total_size (branch),
426  branch.GetTotBytes(),
427  branch.GetZipBytes(),
428  branch.GetName() )
429  print(out)
430 
431  branches = branch.GetListOfBranches()
432  for b in branches:
433  poolRecord.memSize += _get_total_size (b) / Units.kb
434  if (b.GetZipBytes() < 0.001):
435  poolRecord.memSizeNoZip += _get_total_size (b) / Units.kb
436  poolRecord.diskSize += b.GetZipBytes() / Units.kb
437  poolRecord = retrieveBranchInfos ( b, poolRecord, ident+" " )
438 
439  return poolRecord
440 
441 def make_pool_record (branch, dirType):
442  memSize = _get_total_size (branch) / Units.kb
443  zipBytes = branch.GetZipBytes()
444  memSizeNoZip = memSize if zipBytes < 0.001 else 0.
445  diskSize = branch.GetZipBytes() / Units.kb
446  typeName = branch.GetClassName()
447  if not typeName and (leaf := branch.GetListOfLeaves().At(0)):
448  typeName = leaf.GetTypeName()
449  return PoolRecord(branch.GetName(), memSize, diskSize, memSizeNoZip,
450  branch.GetEntries(),
451  dirType=dirType,
452  typeName=typeName)
453 
454 def extract_items(pool_file, verbose=True, items_type='eventdata'):
455  """Helper function to read a POOL file and extract the item-list from the
456  DataHeader content.
457  @params
458  `pool_file` the name of the pool file to inspect
459  `verbose` self-explanatory
460  `items_type` what kind of items one is interested in
461  allowed values: 'eventdata' 'metadata'
462  Note: there is no `fork` argument; this function runs directly in the
463  calling process.
464  """
465  _allowed_values = ('eventdata',
466  'metadata',)
467  if items_type not in _allowed_values:
468  err = "".join([
469  "invalid argument for 'items_type'. ",
470  "got: [%s] " % items_type,
471  "(allowed values: %r)" % _allowed_values
472  ])
473  raise ValueError(err)
474 
475  key = '%s_items' % items_type
476  f_root = _root_open(pool_file)
477  import PyUtils.FilePeekerTool as fpt
478  fp = fpt.FilePeekerTool(f_root)
479  items = fp.getPeekedData(key)
480 
481  if items is None:
482  items = []
483  return items
484 
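# --- illustrative usage (not part of the original source) --------------------
# A sketch of calling extract_items() above; the file name is made-up and the
# returned value is whatever item-list the file's peeked DataHeader metadata
# declares:
#
#   >>> from PyUtils.PoolFile import extract_items
#   >>> items = extract_items("AOD.pool.root", items_type='eventdata')
#   >>> meta  = extract_items("AOD.pool.root", items_type='metadata')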
485 class PoolRecord(object):
486  """A lightweight record of one POOL container (name, sizes, entries, type).
487  """
488  class Sorter:
489  DiskSize = "diskSize"
490  MemSize = "memSize"
491  ContainerName = "name"
492 
493  @staticmethod
494  def allowedValues():
495  return [ PoolRecord.Sorter.DiskSize,
496  PoolRecord.Sorter.MemSize,
497  PoolRecord.Sorter.ContainerName ]
498  pass
499  def __init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType,
500  detailedInfos = "", typeName = None):
501  """Initialize PoolRecord instance.
502 
503  dirType first letter of object type name that may distinguish the types:
504  "T" for TTree, "B" for TBranch,
505  "N" for RNTuple, "F" for RField
506  """
507  object.__init__(self)
508  self.name = name
509  self.memSize = memSize
510  self.diskSize = diskSize
511  self.memSizeNoZip = memSizeNoZip
512  self.nEntries = nEntries
513  self.dirType = dirType
514  self.details = detailedInfos
515  self.augName = ''
516  self.typeName = typeName
517  return
518 
519 class PoolFile(object):
520  """
521  A simple class to retrieve information about the content of a POOL file.
522  It should be abstracted from the underlying technology used to create this
523  POOL file (Db, ROOT,...).
524  Right now, we are using the easy and lousy solution: going straight to the
525  ROOT 'API'.
526  """
527 
528  def __init__(self, fileName, verbose=True):
529  object.__init__(self)
530 
531  self._fileInfos = None
532  self.keys = None
533  self.dataHeader = PoolRecord("DataHeader", 0, 0, 0,
534  nEntries = 0,
535  dirType = "T")
536  self.augNames = set()
537  self.dataHeaderA = {}
538  self.data = []
539  self.verbose = verbose
540 
541  # get the "final" file name (handles all kind of protocols)
542  try:
543  protocol, fileName = file_name(fileName)
544  except Exception as err:
545  print("## warning: problem opening PoolFileCatalog:\n%s"%err)
546  import traceback
547  traceback.print_exc()
548  pass
549 
550  self.poolFile = None
551  dbFileName = whichdb( fileName )
552  if dbFileName not in ( None, '' ):
553  if self.verbose is True:
554  print("## opening file [%s]..." % str(fileName))
555  db = shelve.open( fileName, 'r' )
556  if self.verbose is True:
557  print("## opening file [OK]")
558  report = db['report']
559  self._fileInfos = report['fileInfos']
560  self.dataHeader = report['dataHeader']
561  self.data = report['data']
562  else:
563  if self.verbose is True:
564  print("## opening file [%s]..." % str(fileName))
565  self.__openPoolFile( fileName )
566  if self.verbose is True:
567  print("## opening file [OK]")
568  self.__processFile()
569 
570  return
571 
572  def __openPoolFile(self, fileName):
573  # hack to prevent ROOT from loading graphic libraries and hence bother
574  # our fellow Mac users
575  if self.verbose is True:
576  print("## importing ROOT...")
577  import PyUtils.RootUtils as ru
578  ROOT = ru.import_root()
579  self.ROOT = ROOT
580  if self.verbose is True:
581  print("## importing ROOT... [DONE]")
582  # prevent ROOT from being too verbose
583  rootMsg = ShutUp()
584  rootMsg.mute()
585  ROOT.gErrorIgnoreLevel = ROOT.kFatal
586 
587  poolFile = None
588  try:
589  poolFile = ROOT.TFile.Open( fileName, PoolOpts.READ_MODE )
590  except Exception as e:
591  rootMsg.unMute()
592  print("## Failed to open file [%s] !!" % fileName)
593  print("## Reason:")
594  print(e)
595  print("## Bailing out...")
596  raise IOError("Could not open file [%s]" % fileName)
597 
598  rootMsg.unMute()
599 
600  if poolFile is None:
601  print("## Failed to open file [%s] !!" % fileName)
602  msg = "Could not open file [%s]" % fileName
603  raise IOError(msg)
604 
605  self.poolFile = poolFile
606  assert self.poolFile.IsOpen() and not self.poolFile.IsZombie(), \
607  "Invalid POOL file or a Zombie one"
608  self._fileInfos = {
609  'name' : self.poolFile.GetName(),
610  'size' : self.poolFile.GetSize(),
611  }
612  return
613 
614  def __processFile(self):
615 
616  for name in {PoolOpts.TTreeNames.DataHeader, PoolOpts.RNTupleNames.DataHeader}:
617  dhKey = self.poolFile.FindKey( name )
618  if dhKey:
619  obj = self.poolFile.Get( name )
620  if isinstance(obj, self.ROOT.TTree):
621  nEntries = obj.GetEntries()
622  elif isRNTuple(obj):
623  nEntries = self.ROOT.Experimental.RNTupleReader.Open(obj).GetNEntries()
624  else:
625  raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
626  break
627  else:
628  nEntries = 0
629 
630  keys = []
631  containers = []
632  for k in self.poolFile.GetListOfKeys():
633  keyname = k.GetName()
634  obj = self.poolFile.Get( keyname )
635  if isinstance(obj, self.ROOT.TTree):
636  containerName = obj.GetName()
637  nEntries = obj.GetEntries()
638  dirType = "T"
639  elif isRNTuple(obj):
640  reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
641  containerName = reader.GetDescriptor().GetName()
642  nEntries = reader.GetNEntries()
643  dirType = "N"
644  else:
645  raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
646  if containerName not in containers:
647  keys.append(k)
648  containers.append(containerName)
649  pass
650  if keyname.startswith(PoolOpts.POOL_HEADER) and not keyname.endswith('Form'):
651  self.dataHeaderA[PoolOpts.augmentationName(keyname)] = \
652  PoolRecord("DataHeader", 0, 0, 0,
653  nEntries = nEntries,
654  dirType = dirType)
655 
656  keys.sort (key = lambda x: x.GetName())
657  self.keys = keys
658  del containers
659 
660  for k in keys:
661  obj = self.poolFile.Get( k.GetName() )
662  if isinstance(obj, self.ROOT.TTree):
663  name = obj.GetName()
664  elif isRNTuple(obj):
665  inspector = self.ROOT.Experimental.RNTupleInspector.Create(obj)
666  name = inspector.GetDescriptor().GetName()
667 
668  if PoolOpts.isDataHeader(name):
669  contName = "DataHeader"
670  if isinstance(obj, self.ROOT.TTree):
671  memSize = obj.GetTotBytes() / Units.kb
672  diskSize = obj.GetZipBytes() / Units.kb
673  memSizeNoZip = 0.0
674  if diskSize < 0.001:
675  memSizeNoZip = memSize
676  nEntries = obj.GetEntries()
677 
678  # try to also handle non-T/P separated DataHeaders (from old files)...
679  dhBranchNames = [
680  br.GetName() for br in obj.GetListOfBranches()
681  if br.GetName().count("DataHeader_p") > 0
682  ]
683  if len(dhBranchNames) == 1:
684  dhBranch = obj.GetBranch(dhBranchNames[0])
685  typeName = dhBranch.GetClassName()
686  if not typeName and (leaf := dhBranch.GetListOfLeaves().At(0)):
687  typeName = leaf.GetTypeName()
688  poolRecord = retrieveBranchInfos(
689  dhBranch,
690  PoolRecord( contName, 0., 0., 0.,
691  nEntries,
692  dirType = "T",
693  typeName = typeName ),
694  ident = " "
695  )
696  else:
697  poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
698  nEntries,
699  dirType = "T")
700 
701  self.dataHeader = poolRecord
702  elif isRNTuple(obj):
703  diskSize = inspector.GetCompressedSize() / Units.kb
704  memSize = inspector.GetUncompressedSize() / Units.kb
705 
706  memSizeNoZip = 0.0
707  if diskSize < 0.001:
708  memSizeNoZip = memSize
709  nEntries = inspector.GetDescriptor().GetNEntries()
710  poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
711  nEntries,
712  dirType = "N")
713  self.dataHeader = poolRecord
714  elif PoolOpts.isData(name):
715  if isinstance(obj, self.ROOT.TTree):
716  if not hasattr(obj, 'GetListOfBranches'):
717  continue
718  branches = obj.GetListOfBranches()
719  dirType = "T"
720  if name in (PoolOpts.EVENT_DATA, PoolOpts.META_DATA):
721  dirType = "B"
722  for branch in branches:
723  poolRecord = retrieveBranchInfos(
724  branch,
725  make_pool_record(branch, dirType),
726  ident = " "
727  )
728  poolRecord.augName = PoolOpts.augmentationName(name)
729  self.augNames.add(poolRecord.augName)
730  self.data += [ poolRecord ]
731  elif isRNTuple(obj):
732  descriptor = inspector.GetDescriptor()
733  dirType = "N"
734  if name in {PoolOpts.RNTupleNames.EventData, PoolOpts.RNTupleNames.MetaData}:
735  dirType = "F"
736  fieldZeroId = descriptor.GetFieldZeroId()
737  for fieldDescriptor in descriptor.GetFieldIterable(fieldZeroId):
738  fieldId = fieldDescriptor.GetId()
739  fieldTreeInspector = inspector.GetFieldTreeInspector(fieldId)
740  diskSize = fieldTreeInspector.GetCompressedSize() / Units.kb
741  memSize = fieldTreeInspector.GetUncompressedSize() / Units.kb
742  typeName = fieldDescriptor.GetTypeName()
743  fieldName = fieldDescriptor.GetFieldName()
744  poolRecord = PoolRecord(fieldName, memSize, diskSize, memSize,
745  descriptor.GetNEntries(),
746  dirType=dirType,
747  typeName=typeName)
748  poolRecord.augName = PoolOpts.augmentationName(name)
749  self.augNames.add(poolRecord.augName)
750  self.data += [ poolRecord ]
751  # loop over keys
752 
753  return
754 
755  def fileInfos(self):
756  return os.linesep.join( [
757  "File:" + self._fileInfos['name'],
758  "Size: %12.3f kb" % (self._fileInfos['size'] / Units.kb),
759  "Nbr Events: %i" % self.dataHeader.nEntries
760  ] )
761 
762 
763  def checkFile(self, sorting = PoolRecord.Sorter.DiskSize):
764  if self.verbose is True:
765  print(self.fileInfos())
766  if len(self.augNames) > 1:
767  for aug in self.augNames:
768  if len(aug) > 0:
769  print( "Nbr %s Events: %i" % (aug, self.dataHeaderA[aug].nEntries) )
770 
771 
772  data = self.data
773  if sorting in PoolRecord.Sorter.allowedValues():
774  import operator
775  data.sort(key = operator.attrgetter(sorting) )
776 
777  def _get_val(x, dflt=-999.):
778  if PoolOpts.FAST_MODE:
779  return dflt
780  return x
781 
782  totMemSize = _get_val(self.dataHeader.memSize, dflt=0.)
783  totDiskSize = self.dataHeader.diskSize
784 
785  def _safe_div(num,den):
786  if float(den) == 0.:
787  return 0.
788  return num/den
789 
790  if self.verbose is True:
791  print("")
792  print("="*80)
793  print(PoolOpts.HDR_FORMAT % ( "Mem Size", "Disk Size","Size/Evt",
794  "MissZip/Mem","items",
795  "(X) Container Name (X=Tree|Branch)" ))
796  print("="*80)
797 
798  print(PoolOpts.ROW_FORMAT % (
799  _get_val (self.dataHeader.memSize),
800  self.dataHeader.diskSize,
801  _safe_div(self.dataHeader.diskSize,float(self.dataHeader.nEntries)),
802  _get_val (_safe_div(self.dataHeader.memSizeNoZip,
803  self.dataHeader.memSize)),
804  self.dataHeader.nEntries,
805  "("+self.dataHeader.dirType+") "+self.dataHeader.name
806  ))
807  print("-"*80)
808 
809  totMemSizeA = {}
810  totDiskSizeA = {}
811  for d in data:
812  totMemSize += 0. if PoolOpts.FAST_MODE else d.memSize
813  totDiskSize += d.diskSize
814  memSizeNoZip = d.memSizeNoZip/d.memSize if d.memSize != 0. else 0.
815  aug = d.augName
816  totMemSizeA[aug] = totMemSizeA.get(aug,0.) + d.memSize
817  totDiskSizeA[aug] = totDiskSizeA.get(aug,0.) + d.diskSize
818  if self.verbose is True:
819  print(PoolOpts.ROW_FORMAT % (
820  _get_val (d.memSize),
821  d.diskSize,
822  _safe_div(d.diskSize, float(self.dataHeader.nEntries)),
823  _get_val (memSizeNoZip),
824  d.nEntries,
825  "("+d.dirType+") "+d.name
826  ))
827 
828  if self.verbose is True:
829  print("="*80)
830  if len(self.augNames) > 1:
831  augs = sorted(self.augNames)
832  for a in augs:
833  print(PoolOpts.ROW_FORMAT % (
834  totMemSizeA[a], totDiskSizeA[a],
835  _safe_div(totDiskSizeA[a], float(self.dataHeaderA[a].nEntries)),
836  0.0,
837  self.dataHeaderA[a].nEntries,
838  "Aug Stream: " + ('MAIN' if a=='' else a)
839  ))
840  print("-"*80)
841  print(PoolOpts.ROW_FORMAT % (
842  totMemSize, totDiskSize,
843  _safe_div(totDiskSize, float(self.dataHeader.nEntries)),
844  0.0, self.dataHeader.nEntries,
845  "TOTAL (POOL containers)"
846  ))
847  print("="*80)
848  if PoolOpts.FAST_MODE:
849  print("::: warning: FAST_MODE was enabled: some columns' content ", end='')
850  print("is meaningless...")
851  return
852 
853  def detailedDump(self, bufferName = None ):
854  if self.poolFile is None or \
855  self.keys is None:
856  print("Can't perform a detailedDump with a shelve file as input !")
857  return
858 
859  if bufferName is None:
860  bufferName = "/dev/stdout"
861  out = open( bufferName, "w" )
862  sys.stdout.flush()
863  save_stdout_fileno = os.dup (sys.stdout.fileno())
864  os.dup2( out.fileno(), sys.stdout.fileno() )
865 
866  out.write( "#" * 80 + os.linesep )
867  out.write( "## detailed dump" + os.linesep )
868  out.flush()
869 
870  for key in self.keys:
871  tree = key.ReadObj()
872  name = tree.GetName()
873 
874  if PoolOpts.isDataHeader(name) or \
875  PoolOpts.isData(name):
876  try:
877  print ("=== [%s] ===" % name, file=sys.stderr)
878  tree.Print()
879  except Exception as err:
880  print ("Caught:",err, file=sys.stderr)
881  print (sys.exc_info()[0], file=sys.stderr)
882  print (sys.exc_info()[1], file=sys.stderr)
883  pass
884  pass
885  pass
886  out.write( "#" * 80 + os.linesep )
887  out.flush()
888  out.write( "#" * 80 + os.linesep )
889 
893  out.flush()
894  if bufferName != "<stdout>":
895  out.close()
896  sys.stdout.close()
897  sys.stdout = open (save_stdout_fileno, 'a')
898  return
899 
900  def poolRecord(self, name):
901  """
902  Return a PoolRecord according to its (branch) name
903  Raise KeyError if no match is found
904  """
905  for data in self.data:
906  if data.name == name:
907  return data
908  raise KeyError("No PoolRecord with name [%s]" % name)
909 
910  def saveReport (self, fileName):
911  """
912  Save all the gathered information into a python shelve or a CSV file
913  (depending on the @param `fileName` extension)
914  """
915  import os
916  if os.path.splitext(fileName)[-1] == '.csv':
917  return self._save_csv_report (fileName)
918  return self._save_shelve_report (fileName)
919 
920  def _save_shelve_report(self, fileName):
921  """
922  Save all the gathered information into a python shelve
923  Data can then be read like so:
924  >>> import shelve
925  >>> db = shelve.open( 'myfile.dat', 'r' )
926  >>> report = db['report']
927  >>> print ('fileInfos:',report['fileInfos'])
928  >>> print ('dataHeader/memSize:',report['dataHeader'].memSize)
929  >>> for d in report['data']:
930  ... print ('data:',d.name,d.nEntries,d.memSize)
931  """
932  import shelve, os
933  if os.path.exists (fileName):
934  os.unlink (fileName)
935  db = shelve.open (fileName)
936  db['report'] = {
937  'fileInfos' : self._fileInfos,
938  'nbrEvts' : self.dataHeader.nEntries,
939  'dataHeader' : self.dataHeader,
940  'data' : self.data
941  }
942  db.close()
943  return
944 
945  def _save_csv_report(self, fileName):
946  """
947  Save all the gathered information into a CSV file
948  """
949  import csv, os
950  if os.path.exists (fileName):
951  os.unlink (fileName)
952  args = {'newline' : ''}
953  f = open (fileName, 'w', **args)
954  o = csv.writer (f)
955  o.writerow (['file name', self._fileInfos['name']])
956  o.writerow (['file size', self._fileInfos['size']])
957  o.writerow (['nbr evts', self.dataHeader.nEntries])
958  o.writerow (['mem size', 'disk size', 'mem size nozip', 'items',
959  'container name', 'branch type'])
960 
961  for d in self.data:
962  o.writerow ([d.memSize, d.diskSize, d.memSizeNoZip,
963  d.nEntries, d.name, d.dirType])
964  f.close()
965  return
966 
967  def __del__(self):
968  if self.poolFile and hasattr(self.poolFile, 'Close'):
969  try:
970  self.poolFile.Close()
971  self.poolFile = None
972  except Exception as err:
973  print("WARNING:",err)
974  pass
975 
976  pass # class PoolFile
977 
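# --- illustrative usage (not part of the original source) --------------------
# A minimal sketch of inspecting a POOL file with the PoolFile class above;
# the file names are made-up:
#
#   >>> from PyUtils.PoolFile import PoolFile, PoolRecord
#   >>> pf = PoolFile("AOD.pool.root")
#   >>> pf.checkFile(sorting=PoolRecord.Sorter.DiskSize)  # per-container size table
#   >>> rec = pf.poolRecord("DataHeader")
#   >>> rec.diskSize, rec.memSize, rec.nEntries
#   >>> pf.saveReport("aod.report.csv")   # '.csv' -> CSV, anything else -> shelve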
978 class DiffFiles(object):
979  """
980  A helper class to compare 2 POOL files and check that they match, both in
981  terms of containers' content and containers' sizes
982  """
983 
984  def __init__(self, refFileName, chkFileName, verbose = False, ignoreList = None, strict = False):
985  object.__init__(self)
986 
987  self.verbose = verbose
988  self.strict = strict
989  refFileName = os.path.expandvars( os.path.expanduser( refFileName ) )
990  chkFileName = os.path.expandvars( os.path.expanduser( chkFileName ) )
991 
992  if ignoreList is None:
993  ignoreList = []
994 
995  try:
996  self.refFile = PoolFile( refFileName )
997  self.chkFile = PoolFile( chkFileName )
998  self.ignList = sorted( ignoreList )
999  except Exception as err:
1000  print("## Caught exception [%s] !!" % str(err.__class__))
1001  print("## What:",err)
1002  print(sys.exc_info()[0])
1003  print(sys.exc_info()[1])
1004  err = "Error while opening POOL files !"
1005  err += " chk : %s%s" % ( chkFileName, os.linesep )
1006  err += " ref : %s%s" % ( refFileName, os.linesep )
1007  raise Exception(err)
1008 
1009  self.allGood = True
1010  self.summary = []
1011 
1012  self.__checkDiff()
1013  return
1014 
1015  def __checkDiff(self):
1016 
1017  self.summary += [
1018  "=" * 80,
1019  "::: Comparing POOL files...",
1020  " ref : %s" % self.refFile._fileInfos['name'],
1021  " chk : %s" % self.chkFile._fileInfos['name'],
1022  "-" * 80,
1023  ]
1024 
1025  if self.chkFile.dataHeader.nEntries != \
1026  self.refFile.dataHeader.nEntries :
1027  self.summary += [
1028  "## WARNING: files don't have the same number of entries !!",
1029  " ref : %r" % self.refFile.dataHeader.nEntries,
1030  " chk : %r" % self.chkFile.dataHeader.nEntries,
1031  ]
1032 
1033  refNames = sorted( [d.name for d in self.refFile.data] )
1034  chkNames = sorted( [d.name for d in self.chkFile.data] )
1035 
1036  if chkNames != refNames:
1037  self.summary += [
1038  "## ERROR: files don't have the same content !!",
1039  ]
1040  addNames = [ n for n in chkNames if n not in refNames ]
1041  if len( addNames ) > 0:
1042  self.summary += [ "## collections in 'chk' and not in 'ref'" ]
1043  for n in addNames:
1044  self.summary += [ " + %s" % n ]
1045  subNames = [ n for n in refNames if n not in chkNames ]
1046  if len( subNames ) > 0:
1047  self.summary += [ "## collections in 'ref' and not in 'chk'" ]
1048  for n in subNames:
1049  self.summary += [ " - %s" % n ]
1050  self.allGood = False
1051  pass
1052 
1053  if len(self.ignList) > 0:
1054  self.summary += [ "## Ignoring the following:" ]
1055  for n in self.ignList:
1056  self.summary += [ " %s" % n ]
1057 
1058  commonContent = [ d for d in chkNames if (d in refNames and d not in self.ignList)]
1059 
1060  if not self.allGood:
1061  self.summary += [ "=" * 80 ]
1062  self.summary += [ "::: comparing common content (mem-size / disk-size)..." ]
1063 
1064  for name in commonContent:
1065  chkMemSize = self.chkFile.poolRecord(name).memSize
1066  refMemSize = self.refFile.poolRecord(name).memSize
1067  chkDiskSize = self.chkFile.poolRecord(name).diskSize
1068  refDiskSize = self.refFile.poolRecord(name).diskSize
1069 
1070  if chkMemSize != refMemSize or (self.strict and chkDiskSize != refDiskSize):
1071  self.summary += [
1072  "[ERR] %12.3f / %12.3f kb (ref) ==> %12.3f / %12.3f kb (chk) | %s" % \
1073  ( refMemSize,refDiskSize,chkMemSize,chkDiskSize, name )
1074  ]
1075  self.allGood = False
1076  elif self.verbose:
1077  self.summary += [
1078  " [OK] %12.3f/%12.3f kb | %s" % \
1079  ( chkMemSize, chkDiskSize, name )
1080  ]
1081 
1082  self.summary += [ "=" * 80 ]
1083 
1084 
1085  if self.allGood: self.summary += [ "## Comparison : [OK]" ]
1086  else: self.summary += [ "## Comparison : [ERR]" ]
1087 
1088  return self.allGood
1089 
1090  def status(self):
1091  if self.allGood: return 0
1092  else: return 1
1093 
1094  def printSummary(self, out = sys.stdout):
1095  for i in self.summary:
1096  out.writelines( i + os.linesep )
1097  pass
1098  return
1099 
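# --- illustrative usage (not part of the original source) --------------------
# A sketch of comparing two POOL files with the DiffFiles class above; the
# file names are made-up:
#
#   >>> from PyUtils.PoolFile import DiffFiles
#   >>> diff = DiffFiles(refFileName="ref.AOD.pool.root",
#   ...                  chkFileName="chk.AOD.pool.root",
#   ...                  strict=False)
#   >>> diff.printSummary()
#   >>> diff.status()   # 0 if the two files match, 1 otherwise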
1100 class Counter(object):
1101  """
1102  A counter just contains an item list (pairs class-name/sg-key) and the size
1103  """
1104  size = 0
1105  def __init__(self, name, itemList):
1106  object.__init__(self)
1107  self.name = name
1108  self.itemList = itemList
1109  pass # Counter
1110 
1111 