ATLAS Offline Software
PoolFile.py
Go to the documentation of this file.
1 # Copyright (C) 2002-2024 CERN for the benefit of the ATLAS collaboration
2 
3 # @author: Sebastien Binet <binet@cern.ch>
4 # @date: March 2007
5 #
6 #
7 
8 __author__ = "Sebastien Binet <binet@cern.ch>"
9 
10 
11 __all__ = [
12  'PoolFileCatalog',
13  'PoolOpts',
14  'extract_items',
15  'PoolRecord',
16  'PoolFile',
17  'DiffFiles',
18  ]
19 
20 
21 import sys
22 import os
23 import shelve
24 
25 from dbm import whichdb
26 
27 from .Helpers import ShutUp
28 
29 
class Units (object):
    # Byte-size conversion constants (binary units: 1 kb = 1024 bytes).
    kb = 1024.
    Mb = 1024.*1024.
33 
34 
36  """ reverse-engineering of the POOL FileCatalog.
37  allows to retrieve the physical filename from a logical one, provided
38  that the file-id is known to the (real) PoolFileCatalog
39  """
40  DefaultCatalog = "xmlcatalog_file:PoolFileCatalog.xml"
41  AllowedProtocols = (
42  # see: PoolSvc::createCatalog
43  # http://alxr.usatlas.bnl.gov/lxr/source/atlas/Database/AthenaPOOL/PoolSvc/src/PoolSvc.cxx?v=head#736
44  "xmlcatalog_file:", # POOL default
45  "apcfile:", # ATLAS_POOLCOND_PATH
46  "prfile:", # file via PathResolver
47  "file:", # simple file on local FS
48  )
49 
50  def __init__ (self, catalog=None):
51  super (PoolFileCatalog, self).__init__()
52  self.catalog = None
53 
54  if catalog is None:
55  # chase poolfilecatalog location
56  catalog = os.environ.get("POOL_CATALOG", self.DefaultCatalog)
57 
58  if isinstance(catalog, str):
59  catalog = [catalog]
60 
61  if not isinstance (catalog, (str, list)):
62  raise TypeError(
63  "catalog contact string should be a string or a list thereof! (got %r)"%
64  type(catalog))
65 
66  osp = os.path
67  def osp_exp(x):
68  return osp.expanduser(osp.expandvars(x))
69 
70  def _handle_apcfile_old(x):
71  """ return $ATLAS_POOLCOND_PATH/poolcond/x
72  """
73  if 'ATLAS_POOLCOND_PATH' not in os.environ:
74  return osp_exp(x)
75  pcp = os.environ["ATLAS_POOLCOND_PATH"]
76  if x.startswith("apcfile:"):
77  x = x[len("apcfile:"):]
78  return osp_exp(osp.join(pcp, 'poolcond', x))
79 
80  def _handle_apcfile(x):
81  """ return $ATLAS_POOLCOND_PATH/x
82  """
83  if 'ATLAS_POOLCOND_PATH' not in os.environ:
84  return osp_exp(x)
85  pcp = os.environ["ATLAS_POOLCOND_PATH"]
86  if x.startswith("apcfile:"):
87  x = x[len("apcfile:"):]
88  return osp_exp(osp.join(pcp, x))
89 
90  def _handle_xmlcatalog_file(x):
91  return osp_exp(x[len("xmlcatalog_file:"):])
92 
93  def _handle_prfile(x):
94  x = x[len("prfile:"):]
95  x = osp_exp(x)
96  try:
97  import AthenaCommon.Utils.unixtools as u
98  return u.FindFile(x,
99  os.environ['DATAPATH'].split(os.pathsep),
100  os.R_OK)
101  except ImportError:
102  return x
103 
104  def _handle_file(x):
105  x = x[len("file:"):]
106  x = osp_exp(x)
107  return x
108 
109  cat_dispatch = {
110  "xmlcatalog_file:": _handle_xmlcatalog_file,
111  "apcfile:": _handle_apcfile,
112  "prfile:": _handle_prfile,
113  "file:": _handle_file,
114  }
115  assert sorted(cat_dispatch.keys()) == sorted(self.AllowedProtocols), \
116  "catalog dispatch keys does not match AllowedProtocols:" \
117  "\n%s\n%s" % (sorted(cat_dispatch.keys()),
118  sorted(self.AllowedProtocols))
119 
120  from . import xmldict
121  def _build_catalog(catalog):
122  if not catalog.startswith(self.AllowedProtocols):
123  raise ValueError(
124  "sorry PoolFile:PoolFileCatalog only supports %s"
125  " as a protocol for the POOL file catalog (got: '%s')"
126  % (self.AllowedProtocols, catalog)
127  )
128  for protocol, handler in cat_dispatch.iteritems():
129  if catalog.startswith(protocol):
130  catalog = handler(catalog)
131  break
132  # make sure the catalog exists...
133  import os
134 
135  if not os.path.exists (catalog):
136  return {}
137  # raise RuntimeError(
138  # 'could not find any PoolFileCatalog in [%s]' % catalog
139  # )
140 
141 
142  root = xmldict.ElementTree.parse (catalog).getroot()
143  return dict(xmldict.xml2dict(root))
144 
145  errors = []
146  cat = {'POOLFILECATALOG':{'File':[]}}
147  for c in catalog:
148  try:
149  bc = _build_catalog(c)
150  pc = bc.get('POOLFILECATALOG',{})
151  files = []
152  if pc:
153  files = pc.get('File',[])
154  if isinstance(files, dict):
155  files = [files]
156  cat['POOLFILECATALOG']['File'].extend(files)
157  except Exception as err:
158  errors.append(err)
159 
160  if errors:
161  raise errors[0] # FIXME : should we customize this a bit ?
162 
163  self.catalog = cat
164  pass
165 
166  def pfn (self, url_or_fid):
167  """find the physical file name given a url or a file-id"""
168  import os.path as osp
169  url_or_fid = osp.expanduser(osp.expandvars(url_or_fid))
170  import types
171  if isinstance (url_or_fid, types.ListType):
172  return [self._pfn(f) for f in url_or_fid]
173  else:
174  return self._pfn(url_or_fid)
175 
    def _pfn (self, url_or_fid):
        """find the physical file name given a url or a file-id"""
        # guard: an empty or partial catalog cannot resolve anything
        if not ('POOLFILECATALOG' in self.catalog):
            return None
        if not ('File' in self.catalog['POOLFILECATALOG']):
            return None

        PFN_IDX = 0 # take this pfn when alternates exist

        files = self.catalog['POOLFILECATALOG']['File']
        if isinstance(files, dict):
            # in case there where only one entry in the catalog
            files = [files]
        import re
        if url_or_fid.lower().startswith('fid:'):
            url_or_fid = url_or_fid[len('fid:'):]
        # a bare GUID (8-4-4-4-12 groups of word chars) is treated as a file-id
        if re.compile (r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$').match (url_or_fid):
            fid = url_or_fid.lower()
            # better to check consistency of catalog over all entries
            # than declare success on first match...
            match = {}
            for f in files:
                if f.ID.lower() == fid:
                    match[fid] = []
                    pfn = f.physical.pfn
                    if isinstance(pfn, (list,tuple)):
                        match[fid].append([i.name for i in pfn])
                    else:
                        match[fid].append([pfn.name])
            # NOTE(review): if nothing matched, match[fid] below raises a
            # bare KeyError before the explicit one with the nicer message
            if len(match[fid])==1:
                return match[fid][0][PFN_IDX]
            if len(match[fid])>1:
                raise LookupError (
                    "more than one match for FID='%s'!\n%r"%(fid,match)
                    )
            raise KeyError ("no entry with FID='%s' in catalog" % fid)
        else:
            url = url_or_fid
            if url.lower().startswith("lfn:"):
                url = url[len("lfn:"):]
                # better to check consistency of catalog over all entries
                # than declare success on first match...
                match = {}
                for f in files:
                    if (f.logical != '' # no LFN for this entry
                        and f.logical.lfn.name == url):
                        match[url] = []
                        pfn = f.physical.pfn
                        if isinstance(pfn, (list,tuple)):
                            match[url].append([i.name for i in pfn])
                        else:
                            match[url].append([pfn.name])
                # NOTE(review): same bare-KeyError caveat as in the FID branch
                if len(match[url])==1:
                    return match[url][0][PFN_IDX]
                if len(match[url])>1:
                    raise LookupError (
                        "more than one match for LFN='%s'!\n%r"%(url,match)
                        )
                raise KeyError ("no entry with LFN='%s' in catalog" % url)
            # assume that if not LFN: then PFN:, no matter what...
            if url.lower().startswith("pfn:"):
                url = url[len("pfn:"):]
            return url
239 
240  def __call__ (self, url_or_fid):
241  return self.pfn (url_or_fid)
242 
243  pass
244 
246  # default names of APR file storage elements
247  # copied here from RootUtils/APRDefaults.h for performance (as the first dictionary access takes 7 sec)
248  # see ATEAM-973 for a more detailed discussion
249  # the definitions here should be kept in sync with those!
250  class TTreeNames:
251  EventData = "CollectionTree"
252  EventTag = "POOLCollectionTree"
253  DataHeader = "POOLContainer"
254  MetaData = "MetaData"
256  EventData = "EventData"
257  EventTag = "EventTag"
258  DataHeader = "DataHeader"
259  MetaData = "MetaData"
260 
    # When True, skip the expensive per-branch size computations.
    FAST_MODE = False
    # When True, compute branch sizes by summing individual basket payloads.
    SUPER_DETAILED_BRANCH_SZ = False
    # mode string passed to TFile.Open
    READ_MODE = "READ"
    POOL_HEADER = TTreeNames.DataHeader
    EVENT_DATA = TTreeNames.EventData
    META_DATA = TTreeNames.MetaData
    # header / row formats of the checkFile() summary table
    HDR_FORMAT = " %11s %11s %11s %11s %5s %s"
    ROW_FORMAT = "%12.3f kb %12.3f kb %12.3f kb %12.3f %8i %s"
269 
270  @classmethod
271  def isData(cls, name):
272  return not name.startswith("##") and not cls.isDataHeader(name)
273 
274  @classmethod
275  def isDataHeader(cls, name):
276  return name in {cls.TTreeNames.DataHeader
277  , cls.TTreeNames.DataHeader+"_DataHeader"
278  , cls.RNTupleNames.DataHeader}
279 
280  @classmethod
281  def isEventData(cls, name):
282  return name.startswith(PoolOpts.EVENT_DATA)
283 
284  @classmethod
285  def isAugmentation(cls, name):
286  return "_DAOD_" in name
287 
288  @classmethod
289  def augmentationName(cls, name):
290  s = (name+"__").split('_')[2]
291  if s.endswith("Form"):
292  s = s[:-4]
293  return s
294 
295  @classmethod
296  def isAugmentedHeader(cls, name):
297  return name.startswith(PoolOpts.POOL_HEADER) and cls.isAugmentation(name)
298 
299  pass # class PoolOpts
300 
def _get_total_size (branch):
    """Return the in-memory size of `branch` in bytes.

    In FAST_MODE no accounting is done (-1. is returned); by default the
    branch's own GetTotalSize() is used, unless SUPER_DETAILED_BRANCH_SZ
    asks for a basket-by-basket sum.
    """
    if PoolOpts.FAST_MODE:
        return -1.
    if not PoolOpts.SUPER_DETAILED_BRANCH_SZ:
        return branch.GetTotalSize()
    # sum the object length of every written basket (8 bytes of basket
    # header are subtracted each time)
    branch.LoadBaskets()
    total = 0
    for ibasket in range(branch.GetWriteBasket()):
        total += branch.GetBasket(ibasket).GetObjlen() - 8
    return total
312 
def file_name(fname):
    """take a file name, return the pair (protocol, 'real' file name)
    """
    fname = os.path.expanduser(os.path.expandvars(fname))

    def _normalize_uri(uri):
        # urlsplit needs an explicit scheme: absolute paths get 'file:'
        return 'file:' + uri if uri.startswith('/') else uri

    from urllib.parse import urlsplit
    url = urlsplit(_normalize_uri(fname))
    protocol = url.scheme

    def _normalize(path):
        from posixpath import normpath
        path = normpath(path)
        # normpath keeps a leading double slash -- collapse it to one
        return path[1:] if path.startswith('//') else path

    if protocol in ('', 'file', 'pfn'):
        protocol = ''
        fname = _normalize(url.path)

        # castor paths are implicitly rfio, even without a scheme
        if fname.startswith('/castor/'):
            protocol = 'rfio'
            fname = protocol + ':' + fname

    elif protocol in ('rfio', 'castor'):
        protocol = 'rfio'
        fname = protocol + ':' + _normalize(url.path)

    elif protocol in ('root', 'dcap', 'dcache', 'http', 'https', 'dav', 'davs'):
        # already in a form ROOT understands
        pass

    elif protocol in ('gsidcap',):
        protocol = 'gfal:gsidcap'

    elif protocol in ('lfn', 'fid',):
        # percolate through the PoolFileCatalog
        from PyUtils.PoolFile import PoolFileCatalog as pfc
        fname = pfc().pfn(protocol + ':' + url.path)

    elif protocol in ('ami',):
        # !! keep order of tokens !
        for token in ('ami:', '//', '/'):
            if fname.startswith(token):
                fname = fname[len(token):]
        fname = 'ami://' + fname

    else:
        print(f'## warning: unknown protocol [{protocol}]. we will just return our input')

    return (protocol, fname)
372 
373 def _setup_ssl(root):
374  x509_proxy = os.environ.get('X509_USER_PROXY', '')
375  if x509_proxy:
376  # setup proper credentials
377  root.TSSLSocket.SetUpSSL(
378  x509_proxy,
379  "/etc/grid-security/certificates",
380  x509_proxy,
381  x509_proxy)
382  else:
383  print("## warning: protocol https is requested but no X509_USER_PROXY was found! (opening the file might fail.)")
384  pass
385  return
386 
def _root_open(fname):
    """Open `fname` with ROOT in READ mode and return the TFile.

    https URLs are opened through TWebFile (after setting up SSL
    credentials from the user's grid proxy).  Raises IOError(ENOENT)
    when the file cannot be opened.
    """
    import PyUtils.RootUtils as ru
    root = ru.import_root()
    import re

    # silence the usual 'no dictionary for class' warnings while loading
    with ShutUp(filters=[
        re.compile('TClass::TClass:0: RuntimeWarning: no dictionary for class.*') ]):
        root.gSystem.Load('libRootCollection')
        root_open = root.TFile.Open

        # we need to get back the protocol b/c of the special
        # case of secure-http which needs to open TFiles as TWebFiles...
        protocol, _ = file_name(fname)
        if protocol == 'https':
            _setup_ssl(root)
            root_open = root.TWebFile.Open

        f = root_open(fname, 'READ')
        if f is None or not f:
            import errno
            raise IOError(errno.ENOENT,
                          'No such file or directory', fname)
        return f
    # FIX: removed an unreachable trailing 'return' that followed 'return f'
411 
def retrieveBranchInfos( branch, poolRecord, ident = "" ):
    """Recursively accumulate the size statistics of `branch` (and all of
    its sub-branches) into `poolRecord`.

    Sizes are accounted in kilobytes: `memSize` from the in-memory size,
    `diskSize` from the zipped size, and `memSizeNoZip` for branches that
    are effectively not compressed (zipped size below 1 byte/1000).
    Returns the updated `poolRecord`.
    """
    # FIX: removed a dead 'if 0:' debug-print block and its unused 'fmt'
    for sub in branch.GetListOfBranches():
        poolRecord.memSize += _get_total_size (sub) / Units.kb
        if (sub.GetZipBytes() < 0.001):
            poolRecord.memSizeNoZip += _get_total_size (sub) / Units.kb
        poolRecord.diskSize += sub.GetZipBytes() / Units.kb
        # recurse into nested branches, deepening the (debug) indent
        poolRecord = retrieveBranchInfos ( sub, poolRecord, ident+" " )

    return poolRecord
432 
def make_pool_record (branch, dirType):
    """Build a PoolRecord for a top-level `branch` of kind `dirType`.

    Sizes are converted to kilobytes; the type name is taken from the
    branch's class name, falling back to its first leaf's type name.
    """
    mem_kb = _get_total_size (branch) / Units.kb
    disk_kb = branch.GetZipBytes() / Units.kb
    nozip_kb = mem_kb if branch.GetZipBytes() < 0.001 else 0.
    type_name = branch.GetClassName()
    if not type_name:
        first_leaf = branch.GetListOfLeaves().At(0)
        if first_leaf:
            type_name = first_leaf.GetTypeName()
    return PoolRecord(branch.GetName(), mem_kb, disk_kb, nozip_kb,
                      branch.GetEntries(),
                      dirType=dirType,
                      typeName=type_name)
445 
def extract_items(pool_file, verbose=True, items_type='eventdata'):
    """Helper function to read a POOL file and extract the item-list from the
    DataHeader content.
    @params
     `pool_file` the name of the pool file to inspect
     `verbose` self-explanatory (NOTE(review): currently unused here)
     `items_type` what kind of items one is interested in
       allowed values: 'eventdata' 'metadata'
    Returns the (possibly empty) list of items peeked from the file's
    metadata; raises ValueError for an invalid `items_type`.
    """
    _allowed_values = ('eventdata',
                       'metadata',)
    if items_type not in _allowed_values:
        err = "".join([
            "invalid argument for 'items_type'. ",
            "got: [%s] " % items_type,
            "(allowed values: %r)" % _allowed_values
            ])
        raise ValueError(err)

    # peeked metadata are stored under the '<items_type>_items' key
    key = '%s_items' % items_type
    f_root = _root_open(pool_file)
    import PyUtils.FilePeekerTool as fpt
    fp = fpt.FilePeekerTool(f_root)
    items = fp.getPeekedData(key)

    if items is None:
        items = []
    return items
476 
478  """
479  """
480  class Sorter:
481  DiskSize = "diskSize"
482  MemSize = "memSize"
483  ContainerName = "name"
484 
485  @staticmethod
487  return [ PoolRecord.Sorter.DiskSize,
488  PoolRecord.Sorter.MemSize,
489  PoolRecord.Sorter.ContainerName ]
490  pass
    def __init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType,
                 detailedInfos = "", typeName = None):
        """Initialize PoolRecord instance.

        Sizes are expressed in kilobytes.
        dirType first letter of object type name that may distinguish the types:
        "T" for TTree, "B" for TBranch,
        "N" for RNTuple, "F" for RField
        """
        object.__init__(self)
        self.name = name
        self.memSize = memSize
        self.diskSize = diskSize
        self.memSizeNoZip = memSizeNoZip
        self.nEntries = nEntries
        self.dirType = dirType
        self.details = detailedInfos
        # augmentation-stream name; '' for the main stream (filled in later)
        self.augName = ''
        self.typeName = typeName
        return
510 
512  """
513  A simple class to retrieve informations about the content of a POOL file.
514  It should be abstracted from the underlying technology used to create this
515  POOL file (Db, ROOT,...).
516  Right now, we are using the easy and loosy solution: going straight to the
517  ROOT 'API'.
518  """
519 
520  def __init__(self, fileName, verbose=True):
521  object.__init__(self)
522 
523  self._fileInfos = None
524  self.keys = None
525  self.dataHeader = PoolRecord("DataHeader", 0, 0, 0,
526  nEntries = 0,
527  dirType = "T")
528  self.augNames = set()
529  self.dataHeaderA = {}
530  self.data = []
531  self.verbose = verbose
532 
533  # get the "final" file name (handles all kind of protocols)
534  try:
535  protocol, fileName = file_name(fileName)
536  except Exception as err:
537  print("## warning: problem opening PoolFileCatalog:\n%s"%err)
538  import traceback
539  traceback.print_exc(err)
540  pass
541 
542  self.poolFile = None
543  dbFileName = whichdb( fileName )
544  if dbFileName not in ( None, '' ):
545  if self.verbose is True:
546  print("## opening file [%s]..." % str(fileName))
547  db = shelve.open( fileName, 'r' )
548  if self.verbose is True:
549  print("## opening file [OK]")
550  report = db['report']
551  self._fileInfos = report['fileInfos']
552  self.dataHeader = report['dataHeader']
553  self.data = report['data']
554  else:
555  if self.verbose is True:
556  print("## opening file [%s]..." % str(fileName))
557  self.__openPoolFile( fileName )
558  if self.verbose is True:
559  print("## opening file [OK]")
560  self.__processFile()
561 
562  return
563 
    def __openPoolFile(self, fileName):
        """Open `fileName` via ROOT (muting ROOT's error chatter) and record
        the basic file infos; raises IOError when the file cannot be opened."""
        # hack to prevent ROOT from loading graphic libraries and hence bother
        # our fellow Mac users
        if self.verbose is True:
            print("## importing ROOT...")
        import PyUtils.RootUtils as ru
        ROOT = ru.import_root()
        self.ROOT = ROOT
        if self.verbose is True:
            print("## importing ROOT... [DONE]")
        # prevent ROOT from being too verbose
        rootMsg = ShutUp()
        rootMsg.mute()
        ROOT.gErrorIgnoreLevel = ROOT.kFatal

        poolFile = None
        try:
            poolFile = ROOT.TFile.Open( fileName, PoolOpts.READ_MODE )
        except Exception as e:
            # restore ROOT's output before reporting the failure
            rootMsg.unMute()
            print("## Failed to open file [%s] !!" % fileName)
            print("## Reason:")
            print(e)
            print("## Bailing out...")
            raise IOError("Could not open file [%s]" % fileName)

        rootMsg.unMute()

        if poolFile is None:
            print("## Failed to open file [%s] !!" % fileName)
            msg = "Could not open file [%s]" % fileName
            raise IOError(msg)

        self.poolFile = poolFile
        assert self.poolFile.IsOpen() and not self.poolFile.IsZombie(), \
               "Invalid POOL file or a Zombie one"
        self._fileInfos = {
            'name' : self.poolFile.GetName(),
            'size' : self.poolFile.GetSize(),
            }
        return
605 
    def __processFile(self):
        """Walk all keys of the opened ROOT file and fill the PoolRecord
        statistics (self.keys, self.dataHeader, self.dataHeaderA, self.data)."""

        # first, determine the overall number of events from the DataHeader
        # container (either TTree- or RNTuple-flavoured)
        for name in {PoolOpts.TTreeNames.DataHeader, PoolOpts.RNTupleNames.DataHeader}:
            dhKey = self.poolFile.FindKey( name )
            if dhKey:
                obj = self.poolFile.Get( name )
                if isinstance(obj, self.ROOT.TTree):
                    nEntries = obj.GetEntries()
                elif isinstance(obj, self.ROOT.Experimental.RNTuple):
                    nEntries = self.ROOT.Experimental.RNTupleReader.Open(obj).GetNEntries()
                else:
                    raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
                break
        else:
            # no DataHeader at all
            nEntries = 0

        keys = []
        containers = []
        for k in self.poolFile.GetListOfKeys():
            keyname = k.GetName()
            obj = self.poolFile.Get( keyname )
            if isinstance(obj, self.ROOT.TTree):
                containerName = obj.GetName()
                nEntries = obj.GetEntries()
                dirType = "T"
            elif isinstance(obj, self.ROOT.Experimental.RNTuple):
                reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
                containerName = reader.GetDescriptor().GetName()
                nEntries = reader.GetNEntries()
                dirType = "N"
            else:
                raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
            # keep a single key per container (a file may hold several cycles)
            if containerName not in containers:
                keys.append(k)
                containers.append(containerName)
                pass
            # record per-augmentation-stream DataHeader statistics
            if keyname.startswith(PoolOpts.POOL_HEADER) and not keyname.endswith('Form'):
                self.dataHeaderA[PoolOpts.augmentationName(keyname)] = \
                    PoolRecord("DataHeader", 0, 0, 0,
                               nEntries = nEntries,
                               dirType = dirType)

        keys.sort (key = lambda x: x.GetName())
        self.keys = keys
        del containers

        for k in keys:
            obj = self.poolFile.Get( k.GetName() )
            if isinstance(obj, self.ROOT.TTree):
                name = obj.GetName()
            elif isinstance(obj, self.ROOT.Experimental.RNTuple):
                reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
                name = reader.GetDescriptor().GetName()

            if PoolOpts.isDataHeader(name):
                contName = "DataHeader"
                if isinstance(obj, self.ROOT.TTree):
                    memSize = obj.GetTotBytes() / Units.kb
                    diskSize = obj.GetZipBytes() / Units.kb
                    memSizeNoZip = 0.0
                    if diskSize < 0.001:
                        memSizeNoZip = memSize
                    nEntries = obj.GetEntries()

                    # when a single 'DataHeader_p*' branch exists, derive the
                    # statistics from that branch (more precise) instead of
                    # the whole tree
                    dhBranchNames = [
                        br.GetName() for br in obj.GetListOfBranches()
                        if br.GetName().count("DataHeader_p") > 0
                        ]
                    if len(dhBranchNames) == 1:
                        dhBranch = obj.GetBranch(dhBranchNames[0])
                        typeName = dhBranch.GetClassName()
                        if not typeName and (leaf := dhBranch.GetListOfLeaves().At(0)):
                            typeName = leaf.GetTypeName()
                        poolRecord = retrieveBranchInfos(
                            dhBranch,
                            PoolRecord( contName, 0., 0., 0.,
                                        nEntries,
                                        dirType = "T",
                                        typeName = typeName ),
                            ident = " "
                            )
                    else:
                        poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
                                                nEntries,
                                                dirType = "T")

                    self.dataHeader = poolRecord
                elif isinstance(obj, self.ROOT.Experimental.RNTuple):
                    reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
                    inspector = self.ROOT.Experimental.RNTupleInspector.Create(obj)
                    diskSize = inspector.GetCompressedSize() / Units.kb
                    memSize = inspector.GetUncompressedSize() / Units.kb

                    memSizeNoZip = 0.0
                    if diskSize < 0.001:
                        memSizeNoZip = memSize
                    nEntries = reader.GetNEntries()
                    poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
                                            nEntries,
                                            dirType = "N")
                    self.dataHeader = poolRecord
            elif PoolOpts.isData(name):
                if isinstance(obj, self.ROOT.TTree):
                    if not hasattr(obj, 'GetListOfBranches'):
                        continue
                    branches = obj.GetListOfBranches()
                    dirType = "T"
                    if name in (PoolOpts.EVENT_DATA, PoolOpts.META_DATA):
                        # account per-branch for the big payload trees
                        dirType = "B"
                    for branch in branches:
                        poolRecord = retrieveBranchInfos(
                            branch,
                            make_pool_record(branch, dirType),
                            ident = " "
                            )
                        poolRecord.augName = PoolOpts.augmentationName(name)
                        self.augNames.add(poolRecord.augName)
                        self.data += [ poolRecord ]
                elif isinstance(obj, self.ROOT.Experimental.RNTuple):
                    reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
                    descriptor = reader.GetDescriptor()
                    inspector = self.ROOT.Experimental.RNTupleInspector.Create(obj)
                    dirType = "N"
                    if name in {PoolOpts.RNTupleNames.EventData, PoolOpts.RNTupleNames.MetaData}:
                        # account per-field for the big payload ntuples
                        dirType = "F"
                    fieldZeroId = descriptor.GetFieldZeroId()
                    for fieldDescriptor in descriptor.GetFieldIterable(fieldZeroId):
                        fieldId = fieldDescriptor.GetId()
                        fieldTreeInspector = inspector.GetFieldTreeInspector(fieldId)
                        diskSize = fieldTreeInspector.GetCompressedSize() / Units.kb
                        memSize = fieldTreeInspector.GetUncompressedSize() / Units.kb
                        fieldDescriptor = fieldTreeInspector.GetDescriptor()
                        typeName = fieldDescriptor.GetTypeName()
                        fieldName = fieldDescriptor.GetFieldName()
                        poolRecord = PoolRecord(fieldName, memSize, diskSize, memSize,
                                                descriptor.GetNEntries(),
                                                dirType=dirType,
                                                typeName=typeName)
                        poolRecord.augName = PoolOpts.augmentationName(name)
                        self.augNames.add(poolRecord.augName)
                        self.data += [ poolRecord ]
        # loop over keys

        return
751 
752  def fileInfos(self):
753  return os.linesep.join( [
754  "File:" + self._fileInfos['name'],
755  "Size: %12.3f kb" % (self._fileInfos['size'] / Units.kb),
756  "Nbr Events: %i" % self.dataHeader.nEntries
757  ] )
758 
759 
    def checkFile(self, sorting = PoolRecord.Sorter.DiskSize):
        """Print a summary table of the file's content, sorted by `sorting`
        (one of PoolRecord.Sorter.allowedValues())."""
        if self.verbose is True:
            print(self.fileInfos())
        # per-augmentation-stream event counts (only when augmentations exist)
        if len(self.augNames) > 1:
            for aug in self.augNames:
                if len(aug) > 0:
                    print( "Nbr %s Events: %i" % (aug, self.dataHeaderA[aug].nEntries) )

        data = self.data
        if sorting in PoolRecord.Sorter.allowedValues():
            import operator
            data.sort(key = operator.attrgetter(sorting) )

        # in FAST_MODE sizes were not computed: report the fallback instead
        def _get_val(x, dflt=-999.):
            if PoolOpts.FAST_MODE:
                return dflt
            return x

        totMemSize = _get_val(self.dataHeader.memSize, dflt=0.)
        totDiskSize = self.dataHeader.diskSize

        def _safe_div(num,den):
            if float(den) == 0.:
                return 0.
            return num/den

        if self.verbose is True:
            print("")
            print("="*80)
            print(PoolOpts.HDR_FORMAT % ( "Mem Size", "Disk Size","Size/Evt",
                                          "MissZip/Mem","items",
                                          "(X) Container Name (X=Tree|Branch)" ))
            print("="*80)

            # DataHeader row first
            print(PoolOpts.ROW_FORMAT % (
                _get_val (self.dataHeader.memSize),
                self.dataHeader.diskSize,
                _safe_div(self.dataHeader.diskSize,float(self.dataHeader.nEntries)),
                _get_val (_safe_div(self.dataHeader.memSizeNoZip,
                                    self.dataHeader.memSize)),
                self.dataHeader.nEntries,
                "("+self.dataHeader.dirType+") "+self.dataHeader.name
                ))
            print("-"*80)

        # accumulate totals (overall and per augmentation stream) while
        # optionally printing one row per container
        totMemSizeA = {}
        totDiskSizeA = {}
        for d in data:
            totMemSize += 0. if PoolOpts.FAST_MODE else d.memSize
            totDiskSize += d.diskSize
            memSizeNoZip = d.memSizeNoZip/d.memSize if d.memSize != 0. else 0.
            aug = d.augName
            totMemSizeA[aug] = totMemSizeA.get(aug,0.) + d.memSize
            totDiskSizeA[aug] = totDiskSizeA.get(aug,0.) + d.diskSize
            if self.verbose is True:
                print(PoolOpts.ROW_FORMAT % (
                    _get_val (d.memSize),
                    d.diskSize,
                    _safe_div(d.diskSize, float(self.dataHeader.nEntries)),
                    _get_val (memSizeNoZip),
                    d.nEntries,
                    "("+d.dirType+") "+d.name
                    ))

        if self.verbose is True:
            print("="*80)
            # per-augmentation-stream totals, then the grand total
            if len(self.augNames) > 1:
                augs = sorted(self.augNames)
                for a in augs:
                    print(PoolOpts.ROW_FORMAT % (
                        totMemSizeA[a], totDiskSizeA[a],
                        _safe_div(totDiskSizeA[a], float(self.dataHeaderA[a].nEntries)),
                        0.0,
                        self.dataHeaderA[a].nEntries,
                        "Aug Stream: " + ('MAIN' if a=='' else a)
                        ))
                print("-"*80)
            print(PoolOpts.ROW_FORMAT % (
                totMemSize, totDiskSize,
                _safe_div(totDiskSize, float(self.dataHeader.nEntries)),
                0.0, self.dataHeader.nEntries,
                "TOTAL (POOL containers)"
                ))
            print("="*80)
            if PoolOpts.FAST_MODE:
                print("::: warning: FAST_MODE was enabled: some columns' content ",)
                print("is meaningless...")
        return
849 
    def detailedDump(self, bufferName = sys.stdout.name ):
        """Dump (via TTree.Print) every data key to `bufferName`.

        Only works on a genuine ROOT file (not a reloaded shelve report).
        NOTE(review): the default argument captures sys.stdout.name once,
        at import time -- confirm this is intended.
        """
        if self.poolFile is None or \
           self.keys is None:
            print("Can't perform a detailedDump with a shelve file as input !")
            return

        if bufferName == sys.stdout.name:
            bufferName = "/dev/stdout"
        out = open( bufferName, "w" )
        sys.stdout.flush()
        # redirect the OS-level stdout into `out` so that ROOT's own
        # Print() output lands there as well
        save_stdout_fileno = os.dup (sys.stdout.fileno())
        os.dup2( out.fileno(), sys.stdout.fileno() )

        out.write( "#" * 80 + os.linesep )
        out.write( "## detailed dump" + os.linesep )
        out.flush()

        for key in self.keys:
            tree = key.ReadObj()
            name = tree.GetName()

            if PoolOpts.isDataHeader(name) or \
               PoolOpts.isData(name):
                try:
                    print ("=== [%s] ===" % name, file=sys.stderr)
                    tree.Print()
                except Exception as err:
                    print ("Caught:",err, file=sys.stderr)
                    print (sys.exc_info()[0], file=sys.stderr)
                    print (sys.exc_info()[1], file=sys.stderr)
                    pass
                pass
            pass
        out.write( "#" * 80 + os.linesep )
        out.flush()
        out.write( "#" * 80 + os.linesep )

        # NOTE(review): a few lines of the original file (887-889) are not
        # visible in this extraction -- the stdout restoration below may be
        # incomplete; verify against the upstream source.
        out.flush()
        if bufferName != "<stdout>":
            out.close()
            sys.stdout.close()
            sys.stdout = open (save_stdout_fileno, 'a')
        return
896 
897  def poolRecord(self, name):
898  """
899  Return a PoolRecord according to its (branch) name
900  Raise KeyError if no match is found
901  """
902  for data in self.data:
903  if data.name == name:
904  return data
905  raise KeyError("No PoolRecord with name [%s]" % name)
906 
907  def saveReport (self, fileName):
908  """
909  Save all the gathered informations into a python shelve or a CSV file
910  (depending on the @param `fileName` extension)
911  """
912  import os
913  if os.path.splitext(fileName)[-1] == '.csv':
914  return self._save_csv_report (fileName)
915  return self._save_shelve_report (fileName)
916 
917  def _save_shelve_report(self, fileName):
918  """
919  Save all the gathered informations into a python shelve
920  Data can then be read like so:
921  >>> import shelve
922  >>> db = shelve.open( 'myfile.dat', 'r' )
923  >>> report = db['report']
924  >>> print ('fileSize:',report['fileSize'])
925  >>> print ('dataHeader/memSize:',report['dataHeader'].memSize)
926  >>> for d in report['data']:
927  ... print ('data:',d.name,d.nEntries,d.memSize)
928  """
929  import shelve, os
930  if os.path.exists (fileName):
931  os.unlink (fileName)
932  db = shelve.open (fileName)
933  db['report'] = {
934  'fileInfos' : self._fileInfos,
935  'nbrEvts' : self.dataHeader.nEntries,
936  'dataHeader' : self.dataHeader,
937  'data' : self.data
938  }
939  db.close()
940  return
941 
942  def _save_csv_report(self, fileName):
943  """
944  Save all the gathered informations into a CSV file
945  """
946  import csv, os
947  if os.path.exists (fileName):
948  os.unlink (fileName)
949  args = {'newline' : ''}
950  f = open (fileName, 'w', **args)
951  o = csv.writer (f)
952  o.writerow (['file name', self._fileInfos['name']])
953  o.writerow (['file size', self._fileInfos['size']])
954  o.writerow (['nbr evts', self.dataHeader.nEntries])
955  o.writerow (['mem size', 'disk size', 'mem size nozip', 'items',
956  'container name', 'branch type'])
957 
958  for d in self.data:
959  o.writerow ([d.memSize, d.diskSize, d.memSizeNoZip,
960  d.nEntries, d.name, d.dirType])
961  f.close()
962  return
963 
    def __del__(self):
        # best-effort close of the underlying ROOT file; errors during
        # interpreter teardown are only reported, never raised
        if self.poolFile and hasattr(self.poolFile, 'Close'):
            try:
                self.poolFile.Close()
                self.poolFile = None
            except Exception as err:
                print("WARNING:",err)
                pass
972 
973  pass # class PoolFile
974 
976  """
977  A helper class to compare 2 POOL files and check that they match, both in
978  terms of containers' content and containers' sizes
979  """
980 
    def __init__(self, refFileName, chkFileName, verbose = False, ignoreList = None, strict = False):
        """Open the reference and check POOL files and immediately run the diff.

        `ignoreList` names containers to exclude from the comparison;
        `strict` additionally compares on-disk (compressed) sizes.
        """
        object.__init__(self)

        self.verbose = verbose
        self.strict = strict
        refFileName = os.path.expandvars( os.path.expanduser( refFileName ) )
        chkFileName = os.path.expandvars( os.path.expanduser( chkFileName ) )

        if ignoreList is None:
            ignoreList = []

        try:
            self.refFile = PoolFile( refFileName )
            self.chkFile = PoolFile( chkFileName )
            self.ignList = sorted( ignoreList )
        except Exception as err:
            # report the original failure, then re-raise with both file names
            print("## Caught exception [%s] !!" % str(err.__class__))
            print("## What:",err)
            print(sys.exc_info()[0])
            print(sys.exc_info()[1])
            err = "Error while opening POOL files !"
            err += " chk : %s%s" % ( chkFileName, os.linesep )
            err += " ref : %s%s" % ( refFileName, os.linesep )
            raise Exception(err)

        self.allGood = True
        self.summary = []

        self.__checkDiff()
        return
1011 
    def __checkDiff(self):
        """Compare the two files' container lists and sizes; fill
        `self.summary` and update `self.allGood`.  Returns `allGood`."""

        self.summary += [
            "=" * 80,
            "::: Comparing POOL files...",
            " ref : %s" % self.refFile._fileInfos['name'],
            " chk : %s" % self.chkFile._fileInfos['name'],
            "-" * 80,
            ]

        # differing event counts are only a warning, not a failure
        if self.chkFile.dataHeader.nEntries != \
           self.refFile.dataHeader.nEntries :
            self.summary += [
                "## WARNING: files don't have the same number of entries !!",
                " ref : %r" % self.refFile.dataHeader.nEntries,
                " chk : %r" % self.chkFile.dataHeader.nEntries,
                ]

        refNames = sorted( [d.name for d in self.refFile.data] )
        chkNames = sorted( [d.name for d in self.chkFile.data] )

        # container lists must match exactly; report additions/removals
        if chkNames != refNames:
            self.summary += [
                "## ERROR: files don't have the same content !!",
                ]
            addNames = [ n for n in chkNames if n not in refNames ]
            if len( addNames ) > 0:
                self.summary += [ "## collections in 'chk' and not in 'ref'" ]
                for n in addNames:
                    self.summary += [ " + %s" % n ]
            subNames = [ n for n in refNames if n not in chkNames ]
            if len( subNames ) > 0:
                self.summary += [ "## collections in 'ref' and not in 'chk'" ]
                for n in subNames:
                    self.summary += [ " - %s" % n ]
            self.allGood = False
            pass

        if len(self.ignList) > 0:
            self.summary += [ "## Ignoring the following:" ]
            for n in self.ignList:
                self.summary += [ " %s" % n ]

        commonContent = [ d for d in chkNames if (d in refNames and d not in self.ignList)]

        if not self.allGood:
            self.summary += [ "=" * 80 ]
        self.summary += [ "::: comparing common content (mem-size / disk-size)..." ]

        for name in commonContent:
            chkMemSize = self.chkFile.poolRecord(name).memSize
            refMemSize = self.refFile.poolRecord(name).memSize
            chkDiskSize = self.chkFile.poolRecord(name).diskSize
            refDiskSize = self.refFile.poolRecord(name).diskSize

            # mem-size must always match; disk-size only in strict mode
            if chkMemSize != refMemSize or (self.strict and chkDiskSize != refDiskSize):
                self.summary += [
                    "[ERR] %12.3f / %12.3f kb (ref) ==> %12.3f / %12.3f kb (chk) | %s" % \
                    ( refMemSize,refDiskSize,chkMemSize,chkDiskSize, name )
                    ]
                self.allGood = False
            elif self.verbose:
                self.summary += [
                    " [OK] %12.3f/%12.3f kb | %s" % \
                    ( chkMemSize, chkDiskSize, name )
                    ]

        self.summary += [ "=" * 80 ]

        if self.allGood: self.summary += [ "## Comparison : [OK]" ]
        else: self.summary += [ "## Comparison : [ERR]" ]

        return self.allGood
1086 
1087  def status(self):
1088  if self.allGood: return 0
1089  else: return 1
1090 
1091  def printSummary(self, out = sys.stdout):
1092  for i in self.summary:
1093  out.writelines( i + os.linesep )
1094  pass
1095  return
1096 
1098  """
1099  A counter just contains an item list (pairs class-name/sg-key) and the size
1100  """
    # cumulated size of the items in this counter (class-level default)
    size = 0
    def __init__(self, name, itemList):
        """Store the counter's `name` and its item list (class-name/sg-key pairs)."""
        object.__init__(self)
        self.name = name
        self.itemList = itemList
    pass # Counter
1107 
1108 
python.PoolFile.PoolRecord.diskSize
diskSize
Definition: PoolFile.py:501
python.PoolFile.DiffFiles.summary
summary
Definition: PoolFile.py:1007
python.PoolFile.PoolFile.augNames
augNames
Definition: PoolFile.py:528
python.PoolFile.DiffFiles.verbose
verbose
Definition: PoolFile.py:984
python.PoolFile.DiffFiles.__checkDiff
def __checkDiff(self)
Definition: PoolFile.py:1012
python.PoolFile.PoolFile.poolFile
poolFile
Definition: PoolFile.py:542
python.PoolFile.DiffFiles.status
def status(self)
Definition: PoolFile.py:1087
python.PoolFile.PoolFile.keys
keys
first we try to fetch the DataHeader
Definition: PoolFile.py:524
python.PoolFile.PoolRecord.memSize
memSize
Definition: PoolFile.py:500
python.PoolFile.PoolFileCatalog.__init__
def __init__(self, catalog=None)
Definition: PoolFile.py:50
python.PoolFile.PoolOpts
Definition: PoolFile.py:245
python.PoolFile.PoolFile.checkFile
def checkFile(self, sorting=PoolRecord.Sorter.DiskSize)
Definition: PoolFile.py:760
python.PoolFile.PoolRecord.__init__
def __init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType, detailedInfos="", typeName=None)
Definition: PoolFile.py:491
python.PoolFile._save_shelve_report
def _save_shelve_report(self, fileName)
Definition: PoolFile.py:917
python.PoolFile.PoolOpts.RNTupleNames
Definition: PoolFile.py:255
python.PoolFile.PoolFile.detailedDump
def detailedDump(self, bufferName=sys.stdout.name)
Definition: PoolFile.py:850
python.PoolFile.__del__
def __del__(self)
Definition: PoolFile.py:964
python.PoolFile.PoolFile.dataHeaderA
dataHeaderA
Definition: PoolFile.py:529
python.PoolFile.PoolRecord.dirType
dirType
Definition: PoolFile.py:504
python.PoolFile.Counter.name
name
Definition: PoolFile.py:1104
python.PoolFile.poolRecord
def poolRecord(self, name)
Definition: PoolFile.py:897
python.PoolFile.PoolFile.__openPoolFile
def __openPoolFile(self, fileName)
Definition: PoolFile.py:564
python.PoolFile.PoolFileCatalog._pfn
def _pfn(self, url_or_fid)
Definition: PoolFile.py:176
python.PoolFile.saveReport
def saveReport(self, fileName)
Definition: PoolFile.py:907
python.PoolFile.PoolRecord.nEntries
nEntries
Definition: PoolFile.py:503
python.PoolFile.PoolRecord
Definition: PoolFile.py:477
python.PoolFile.file_name
def file_name(fname)
Definition: PoolFile.py:313
python.PoolFile.PoolRecord.Sorter
Definition: PoolFile.py:480
python.PoolFile.PoolFile.data
data
Definition: PoolFile.py:530
python.PoolFile.PoolFileCatalog.__call__
def __call__(self, url_or_fid)
Definition: PoolFile.py:240
dumpHVPathFromNtuple.append
bool append
Definition: dumpHVPathFromNtuple.py:91
python.PoolFile.PoolFile.ROOT
ROOT
Definition: PoolFile.py:571
python.PoolFile.retrieveBranchInfos
def retrieveBranchInfos(branch, poolRecord, ident="")
Definition: PoolFile.py:412
XMLtoHeader.count
count
Definition: XMLtoHeader.py:85
python.PoolFile.DiffFiles.refFile
refFile
Definition: PoolFile.py:993
handler
void handler(int sig)
signal handler
Definition: rmain.cxx:98
python.PoolFile.PoolOpts.isAugmentedHeader
def isAugmentedHeader(cls, name)
Definition: PoolFile.py:296
python.PoolFile.PoolFileCatalog.AllowedProtocols
AllowedProtocols
Definition: PoolFile.py:41
python.Helpers.ShutUp
Definition: Tools/PyUtils/python/Helpers.py:63
python.PoolFile.PoolFile.fileInfos
def fileInfos(self)
Definition: PoolFile.py:752
python.PoolFile.PoolOpts.isData
def isData(cls, name)
Definition: PoolFile.py:271
Get
T * Get(TFile &f, const std::string &n, const std::string &dir="", const chainmap_t *chainmap=0, std::vector< std::string > *saved=0)
get a histogram given a path, and an optional initial directory if histogram is not found,...
Definition: comparitor.cxx:178
python.PoolFile.DiffFiles.chkFile
chkFile
Definition: PoolFile.py:994
python.PoolFile.PoolFile.__processFile
def __processFile(self)
Definition: PoolFile.py:606
python.PoolFile.PoolOpts.augmentationName
def augmentationName(cls, name)
Definition: PoolFile.py:289
python.PoolFile.PoolOpts.TTreeNames
Definition: PoolFile.py:250
python.PoolFile.PoolFileCatalog.pfn
def pfn(self, url_or_fid)
Definition: PoolFile.py:166
python.PoolFile.Counter.__init__
def __init__(self, name, itemList)
Definition: PoolFile.py:1102
python.PoolFile.PoolFile.dataHeader
dataHeader
try to also handle non-T/P separated DataHeaders (from old files)...
Definition: PoolFile.py:525
python.PoolFile.PoolRecord.augName
augName
Definition: PoolFile.py:506
plotBeamSpotVxVal.range
range
Definition: plotBeamSpotVxVal.py:195
python.PoolFile.DiffFiles.strict
strict
Definition: PoolFile.py:985
add
bool add(const std::string &hname, TKey *tobj)
Definition: fastadd.cxx:55
python.PoolFile.PoolFile._fileInfos
_fileInfos
Definition: PoolFile.py:523
python.PoolFile._save_csv_report
def _save_csv_report(self, fileName)
Definition: PoolFile.py:942
python.PoolFile.Counter.itemList
itemList
Definition: PoolFile.py:1105
DerivationFramework::TriggerMatchingUtils::sorted
std::vector< typename T::value_type > sorted(T begin, T end)
Helper function to create a sorted vector from an unsorted one.
CxxUtils::set
constexpr std::enable_if_t< is_bitmask_v< E >, E & > set(E &lhs, E rhs)
Convenience function to set bits in a class enum bitmask.
Definition: bitmask.h:224
TCS::join
std::string join(const std::vector< std::string > &v, const char c=',')
Definition: Trigger/TrigT1/L1Topo/L1TopoCommon/Root/StringUtils.cxx:10
python.PoolFile.PoolRecord.Sorter.allowedValues
def allowedValues()
Definition: PoolFile.py:486
python.PoolFile.make_pool_record
def make_pool_record(branch, dirType)
Definition: PoolFile.py:433
python.PoolFile._setup_ssl
def _setup_ssl(root)
Definition: PoolFile.py:373
python.PoolFile.PoolFile.__init__
def __init__(self, fileName, verbose=True)
Definition: PoolFile.py:520
python.PoolFile.PoolOpts.isDataHeader
def isDataHeader(cls, name)
Definition: PoolFile.py:275
python.PoolFile.PoolFileCatalog
Definition: PoolFile.py:35
python.PoolFile.PoolFile.verbose
verbose
Definition: PoolFile.py:531
Trk::open
@ open
Definition: BinningType.h:40
python.PoolFile.DiffFiles.printSummary
def printSummary(self, out=sys.stdout)
Definition: PoolFile.py:1091
python.PoolFile.DiffFiles.allGood
allGood
Definition: PoolFile.py:1006
python.PoolFile.PoolRecord.name
name
Definition: PoolFile.py:499
python.PoolFile.PoolRecord.typeName
typeName
Definition: PoolFile.py:507
python.PoolFile.PoolRecord.details
details
Definition: PoolFile.py:505
python.CaloScaleNoiseConfig.type
type
Definition: CaloScaleNoiseConfig.py:78
Muon::print
std::string print(const MuPatSegment &)
Definition: MuonTrackSteering.cxx:28
python.PoolFile.DiffFiles
Definition: PoolFile.py:975
pickleTool.object
object
Definition: pickleTool.py:30
str
Definition: BTagTrackIpAccessor.cxx:11
python.PoolFile.PoolFileCatalog.catalog
catalog
Definition: PoolFile.py:52
python.PoolFile._root_open
def _root_open(fname)
Definition: PoolFile.py:387
python.PoolFile.PoolFile
Definition: PoolFile.py:511
python.PoolFile.extract_items
def extract_items(pool_file, verbose=True, items_type='eventdata')
Definition: PoolFile.py:446
python.PoolFile.PoolRecord.memSizeNoZip
memSizeNoZip
Definition: PoolFile.py:502
python.PoolFile._get_total_size
def _get_total_size(branch)
Definition: PoolFile.py:301
python.PoolFile.DiffFiles.__init__
def __init__(self, refFileName, chkFileName, verbose=False, ignoreList=None, strict=False)
Definition: PoolFile.py:981
python.PoolFile.PoolOpts.isEventData
def isEventData(cls, name)
Definition: PoolFile.py:281
python.PoolFile.Units
— data ---------------------------------------------------------------—
Definition: PoolFile.py:30
python.PoolFile.DiffFiles.ignList
ignList
Definition: PoolFile.py:995
readCCLHist.float
float
Definition: readCCLHist.py:83
Trk::split
@ split
Definition: LayerMaterialProperties.h:38
python.PoolFile.PoolFileCatalog.DefaultCatalog
DefaultCatalog
Definition: PoolFile.py:40
python.PoolFile.PoolOpts.isAugmentation
def isAugmentation(cls, name)
Definition: PoolFile.py:285
python.PoolFile.Counter
Definition: PoolFile.py:1097