__author__ = "Sebastien Binet <binet@cern.ch>"
from dbm import whichdb

from .Helpers import ShutUp
    try:
        from ROOT import RNTuple
    except ImportError:
        from ROOT.Experimental import RNTuple
    return isinstance(obj, RNTuple)
43 """ reverse-engineering of the POOL FileCatalog.
44 allows to retrieve the physical filename from a logical one, provided
45 that the file-id is known to the (real) PoolFileCatalog
47 DefaultCatalog =
"xmlcatalog_file:PoolFileCatalog.xml"
        super(PoolFileCatalog, self).__init__()
        if isinstance(catalog, str):
            catalog = [catalog]
        if not isinstance(catalog, (str, list)):
            raise TypeError(
                "catalog contact string should be a string or a list thereof! (got %r)"
                % type(catalog))
        def osp_exp(x):
            return osp.expanduser(osp.expandvars(x))
        def _handle_apcfile_old(x):
            """return $ATLAS_POOLCOND_PATH/poolcond/x"""
            if 'ATLAS_POOLCOND_PATH' not in os.environ:
                return x
            pcp = os.environ["ATLAS_POOLCOND_PATH"]
            if x.startswith("apcfile:"):
                x = x[len("apcfile:"):]
            return osp_exp(osp.join(pcp, 'poolcond', x))
        def _handle_apcfile(x):
            """return $ATLAS_POOLCOND_PATH/x"""
            if 'ATLAS_POOLCOND_PATH' not in os.environ:
                return x
            pcp = os.environ["ATLAS_POOLCOND_PATH"]
            if x.startswith("apcfile:"):
                x = x[len("apcfile:"):]
            return osp_exp(osp.join(pcp, x))
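        # Illustration of the "apcfile:" expansion above (hypothetical values):
        #   with ATLAS_POOLCOND_PATH=/some/conditions/area
        #   _handle_apcfile("apcfile:sub/x.db")  ->  "/some/conditions/area/sub/x.db"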
        def _handle_xmlcatalog_file(x):
            return osp_exp(x[len("xmlcatalog_file:"):])
        def _handle_prfile(x):
            x = x[len("prfile:"):]
            import AthenaCommon.Utils.unixtools as u
                os.environ['DATAPATH'].split(os.pathsep),
117 "xmlcatalog_file:": _handle_xmlcatalog_file,
118 "apcfile:": _handle_apcfile,
119 "prfile:": _handle_prfile,
120 "file:": _handle_file,
123 "catalog dispatch keys does not match AllowedProtocols:" \
124 "\n%s\n%s" % (
sorted(cat_dispatch.keys()),
        from . import xmldict
        def _build_catalog(catalog):
            if not catalog.startswith(tuple(self.AllowedProtocols)):
                raise ValueError(
                    "sorry PoolFile:PoolFileCatalog only supports %s"
                    " as a protocol for the POOL file catalog (got: '%s')"
                    % (self.AllowedProtocols, catalog))
            for protocol, handler in cat_dispatch.items():
                if catalog.startswith(protocol):
                    catalog = handler(catalog)
                    break
            if not os.path.exists(catalog):
                return {}
            root = xmldict.ElementTree.parse(catalog).getroot()
            return dict(xmldict.xml2dict(root))
        cat = {'POOLFILECATALOG': {'File': []}}
        for c in catalog:
            try:
                bc = _build_catalog(c)
                pc = bc.get('POOLFILECATALOG', {})
                files = pc.get('File', [])
                if isinstance(files, dict):
                    files = [files]
                cat['POOLFILECATALOG']['File'].extend(files)
            except Exception as err:
    def pfn(self, url_or_fid):
        """Find the physical file name given a URL or a file-id."""
        import os.path as osp
        url_or_fid = osp.expanduser(osp.expandvars(url_or_fid))
        if isinstance(url_or_fid, list):
            return [self._pfn(f) for f in url_or_fid]
        return self._pfn(url_or_fid)
184 """find the physical file name given a url or a file-id"""
185 if not (
'POOLFILECATALOG' in self.
catalog):
187 if not (
'File' in self.
catalog[
'POOLFILECATALOG']):
192 files = self.
catalog[
'POOLFILECATALOG'][
'File']
        if isinstance(files, dict):
            files = [files]
        if url_or_fid.lower().startswith('fid:'):
            url_or_fid = url_or_fid[len('fid:'):]
        if re.compile(r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$').match(url_or_fid):
            fid = url_or_fid.lower()
            match = {fid: []}
            for f in files:
                if f.ID.lower() == fid:
                    pfn = f.physical.pfn
                    if isinstance(pfn, (list, tuple)):
                        match[fid].append([i.name for i in pfn])
                    else:
                        match[fid].append([pfn.name])
            if len(match[fid]) == 1:
                return match[fid][0][PFN_IDX]
            if len(match[fid]) > 1:
                raise LookupError(
                    "more than one match for FID='%s'!\n%r" % (fid, match))
            raise KeyError("no entry with FID='%s' in catalog" % fid)
        if url.lower().startswith("lfn:"):
            url = url[len("lfn:"):]
            match = {url: []}
            for f in files:
                if (hasattr(f, 'logical')
                        and f.logical.lfn.name == url):
                    pfn = f.physical.pfn
                    if isinstance(pfn, (list, tuple)):
                        match[url].append([i.name for i in pfn])
                    else:
                        match[url].append([pfn.name])
            if len(match[url]) == 1:
                return match[url][0][PFN_IDX]
            if len(match[url]) > 1:
                raise LookupError(
                    "more than one match for LFN='%s'!\n%r" % (url, match))
            raise KeyError("no entry with LFN='%s' in catalog" % url)
        if url.lower().startswith("pfn:"):
            url = url[len("pfn:"):]
        return self.pfn(url_or_fid)
    class TTreeNames:
        EventData  = "CollectionTree"
        EventTag   = "POOLCollectionTree"
        DataHeader = "POOLContainer"
        MetaData   = "MetaData"

    class RNTupleNames:
        EventData  = "EventData"
        EventTag   = "EventTag"
        DataHeader = "DataHeader"
        MetaData   = "MetaData"

    SUPER_DETAILED_BRANCH_SZ = False

    POOL_HEADER = TTreeNames.DataHeader
    EVENT_DATA  = TTreeNames.EventData
    META_DATA   = TTreeNames.MetaData

    HDR_FORMAT = " %11s %11s %11s %11s %5s %s"
    ROW_FORMAT = "%12.3f kb %12.3f kb %12.3f kb %12.3f %8i %s"
        return not name.startswith("##") and not cls.isDataHeader(name)

        return name.startswith(PoolOpts.EVENT_DATA)

        return "_DAOD_" in name

        s = (name + "__").split('_')[2]
        if s.endswith("Form"):
        return name.startswith(PoolOpts.POOL_HEADER) and cls.isAugmentation(name)
    if PoolOpts.FAST_MODE:
    if not PoolOpts.SUPER_DETAILED_BRANCH_SZ:
        return branch.GetTotalSize()
    for bnum in range(0, branch.GetWriteBasket()):
        basket = branch.GetBasket(bnum)
        brSize += basket.GetObjlen() - 8
321 """take a file name, return the pair (protocol, 'real' file name)
323 fname = os.path.expanduser(os.path.expandvars(fname))
325 def _normalize_uri(uri):
326 if uri.startswith(
'/'):
330 from urllib.parse
import urlsplit
331 url = urlsplit(_normalize_uri(fname))
332 protocol = url.scheme
333 def _normalize(fname):
334 from posixpath
import normpath
335 fname = normpath(fname)
336 if fname.startswith(
'//'): fname = fname[1:]
    if protocol in ('', 'file', 'pfn'):
        fname = _normalize(url.path)
        # hack for castor-like paths given without an explicit protocol
        if fname.startswith('/castor/'):
            protocol = 'rfio'
            fname = protocol + ':' + fname

    elif protocol in ('rfio', 'castor'):
        fname = _normalize(url.path)
        fname = protocol + ':' + fname

    elif protocol in ('root', 'dcap', 'dcache', 'http', 'https', 'dav', 'davs'):
        pass

    elif protocol in ('gsidcap',):
        protocol = 'gfal:gsidcap'

    elif protocol in ('lfn', 'fid',):
        # resolve through the POOL file catalog
        from PyUtils.PoolFile import PoolFileCatalog as pfc
        fname = pfc().pfn(protocol + ':' + url.path)

    elif protocol in ('ami',):
        for token in ('ami:', '//', '/'):
            if fname.startswith(token):
                fname = fname[len(token):]
        fname = 'ami://' + fname

    else:
        print(f'## warning: unknown protocol [{protocol}]. we will just return our input')

    return (protocol, fname)
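# Illustrative expectations for file_name() (hypothetical inputs; the root://
# case assumes the pass-through branch above):
#   file_name('root://host//path/f.root')  ->  ('root', 'root://host//path/f.root')
#   file_name('ami:mydataset')             ->  ('ami',  'ami://mydataset')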
    x509_proxy = os.environ.get('X509_USER_PROXY', '')
    if x509_proxy:
        root.TSSLSocket.SetUpSSL(
            x509_proxy,
            "/etc/grid-security/certificates",
            x509_proxy,
            x509_proxy)
    else:
        print("## warning: protocol https is requested but no X509_USER_PROXY was found! (opening the file might fail.)")
    import PyUtils.RootUtils as ru
    root = ru.import_root()

    with ShutUp(filters=[
            re.compile('TClass::TClass:0: RuntimeWarning: no dictionary for class.*')]):
        root.gSystem.Load('libRootCollection')
        root_open = root.TFile.Open

        # secure-http needs a TWebFile rather than a plain TFile
        if protocol == 'https':
            root_open = root.TWebFile.Open

        f = root_open(fname, 'READ')
        if f is None or not f:
            import errno
            raise IOError(errno.ENOENT,
                          'No such file or directory', fname)
    fmt = "%s %3i %8.3f %8.3f %8.3f %s"
        branch.GetListOfBranches().GetSize(),
        _get_total_size(branch),
        branch.GetTotBytes(),
        branch.GetZipBytes(),
    branches = branch.GetListOfBranches()
    for b in branches:
        poolRecord.memSize += _get_total_size(b) / Units.kb
        if b.GetZipBytes() < 0.001:
            poolRecord.memSizeNoZip += _get_total_size(b) / Units.kb
        poolRecord.diskSize += b.GetZipBytes() / Units.kb
        poolRecord = retrieveBranchInfos(b, poolRecord, ident + "  ")
    memSize = _get_total_size(branch) / Units.kb
    zipBytes = branch.GetZipBytes()
    memSizeNoZip = memSize if zipBytes < 0.001 else 0.
    diskSize = branch.GetZipBytes() / Units.kb
    typeName = branch.GetClassName()
    if not typeName and (leaf := branch.GetListOfLeaves().At(0)):
        typeName = leaf.GetTypeName()
    return PoolRecord(branch.GetName(), memSize, diskSize, memSizeNoZip,
        DiskSize = "diskSize"
        ContainerName = "name"

            return [PoolRecord.Sorter.DiskSize,
                    PoolRecord.Sorter.MemSize,
                    PoolRecord.Sorter.ContainerName]
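        # These constants are plain PoolRecord attribute names; checkFile()
        # hands one of them to operator.attrgetter to sort its size table,
        # e.g. (illustrative): pf.checkFile(sorting=PoolRecord.Sorter.MemSize)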
    def __init__(self, name, memSize, diskSize, memSizeNoZip, nEntries, dirType,
                 detailedInfos="", typeName=None):
        """Initialize a PoolRecord instance.

        dirType is the first letter of the object type name and distinguishes
        between "T" for TTree, "B" for TBranch,
                "N" for RNTuple and "F" for RField.
        """
        object.__init__(self)
    A simple class to retrieve information about the content of a POOL file.
    It should be abstracted from the underlying technology used to create this
    POOL file (Db, ROOT, ...).
    Right now, we are using the easy (and lousy) solution of going straight to the
    ROOT file.
    """
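    # Usage sketch (illustrative; the input file name is hypothetical):
    #   pf = PoolFile('myfile.pool.root')
    #   pf.checkFile(sorting=PoolRecord.Sorter.DiskSize)   # print the size table
    #   pf.saveReport('myfile.report.csv')                 # CSV, or a shelve otherwise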
        object.__init__(self)
        except Exception as err:
            print("## warning: problem opening PoolFileCatalog:\n%s" % err)
            import traceback
            traceback.print_exc()
        dbFileName = whichdb(fileName)
        if dbFileName not in (None, ''):
            # shelve file holding a pre-computed report
            print("## opening file [%s]..." % str(fileName))
            db = shelve.open(fileName, 'r')
            print("## opening file [OK]")
            report = db['report']
            self.data = report['data']
        else:
            print("## opening file [%s]..." % str(fileName))
            print("## opening file [OK]")
        print("## importing ROOT...")
        import PyUtils.RootUtils as ru
        ROOT = ru.import_root()
        print("## importing ROOT... [DONE]")
        # keep ROOT quiet unless something fatal happens
        ROOT.gErrorIgnoreLevel = ROOT.kFatal
            poolFile = ROOT.TFile.Open(fileName, PoolOpts.READ_MODE)
        except Exception as e:
            print("## Failed to open file [%s] !!" % fileName)
            print("## Bailing out...")
            raise IOError("Could not open file [%s]" % fileName)

            print("## Failed to open file [%s] !!" % fileName)
            msg = "Could not open file [%s]" % fileName
            "Invalid POOL file or a Zombie one"
        for name in {PoolOpts.TTreeNames.DataHeader, PoolOpts.RNTupleNames.DataHeader}:
            dhKey = self.poolFile.FindKey(name)
                if isinstance(obj, self.ROOT.TTree):
                    nEntries = obj.GetEntries()
                    try:
                        nEntries = self.ROOT.Experimental.RNTupleReader.Open(obj).GetNEntries()
                    except AttributeError:
                        # newer ROOT versions expose RNTupleReader outside Experimental
                        nEntries = self.ROOT.RNTupleReader.Open(obj).GetNEntries()
                else:
                    raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
        for k in self.poolFile.GetListOfKeys():
            keyname = k.GetName()
            if isinstance(obj, self.ROOT.TTree):
                containerName = obj.GetName()
                nEntries = obj.GetEntries()
                try:
                    reader = self.ROOT.Experimental.RNTupleReader.Open(obj)
                except AttributeError:
                    reader = self.ROOT.RNTupleReader.Open(obj)
                containerName = reader.GetDescriptor().GetName()
                nEntries = reader.GetNEntries()
            else:
                raise NotImplementedError(f"Keys of type {type(obj)!r} not supported")
            if containerName not in containers:
                containers.append(containerName)
            if keyname.startswith(PoolOpts.POOL_HEADER) and not keyname.endswith('Form'):
                self.dataHeaderA[PoolOpts.augmentationName(keyname)] = \
        keys.sort(key=lambda x: x.GetName())
            if isinstance(obj, self.ROOT.TTree):
                try:
                    inspector = self.ROOT.Experimental.RNTupleInspector.Create(obj)
                except AttributeError:
                    inspector = self.ROOT.RNTupleInspector.Create(obj)
                name = inspector.GetDescriptor().GetName()
            if PoolOpts.isDataHeader(name):
                contName = "DataHeader"
                if isinstance(obj, self.ROOT.TTree):
                    memSize = obj.GetTotBytes() / Units.kb
                    diskSize = obj.GetZipBytes() / Units.kb
                    memSizeNoZip = memSize
                    nEntries = obj.GetEntries()
                    dhBranchNames = [
                        br.GetName() for br in obj.GetListOfBranches()
                        if br.GetName().count("DataHeader_p") > 0
                    ]
                    if len(dhBranchNames) == 1:
                        dhBranch = obj.GetBranch(dhBranchNames[0])
                        typeName = dhBranch.GetClassName()
                        if not typeName and (leaf := dhBranch.GetListOfLeaves().At(0)):
                            typeName = leaf.GetTypeName()
                                                typeName = typeName),
                    poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,

                    diskSize = inspector.GetCompressedSize() / Units.kb
                    memSize = inspector.GetUncompressedSize() / Units.kb
                    memSizeNoZip = memSize
                    nEntries = inspector.GetDescriptor().GetNEntries()
                    poolRecord = PoolRecord(contName, memSize, diskSize, memSizeNoZip,
            elif PoolOpts.isData(name):
                if isinstance(obj, self.ROOT.TTree):
                    if not hasattr(obj, 'GetListOfBranches'):
                        continue
                    branches = obj.GetListOfBranches()
                    if name in (PoolOpts.EVENT_DATA, PoolOpts.META_DATA):
                    for branch in branches:
                        poolRecord.augName = PoolOpts.augmentationName(name)
                        self.data += [poolRecord]
                    descriptor = inspector.GetDescriptor()
                    if name in {PoolOpts.RNTupleNames.EventData, PoolOpts.RNTupleNames.MetaData}:
                    fieldZeroId = descriptor.GetFieldZeroId()
                    for fieldDescriptor in descriptor.GetFieldIterable(fieldZeroId):
                        fieldId = fieldDescriptor.GetId()
                        fieldTreeInspector = inspector.GetFieldTreeInspector(fieldId)
                        diskSize = fieldTreeInspector.GetCompressedSize() / Units.kb
                        memSize = fieldTreeInspector.GetUncompressedSize() / Units.kb
                        typeName = fieldDescriptor.GetTypeName()
                        fieldName = fieldDescriptor.GetFieldName()
                        poolRecord = PoolRecord(fieldName, memSize, diskSize, memSize,
                                                descriptor.GetNEntries(),
                        poolRecord.augName = PoolOpts.augmentationName(name)
                        self.data += [poolRecord]
        return os.linesep.join([
            "Size: %12.3f kb" % (self._fileInfos['size'] / Units.kb),
    def checkFile(self, sorting=PoolRecord.Sorter.DiskSize):
        if sorting in PoolRecord.Sorter.allowedValues():
            data.sort(key=operator.attrgetter(sorting))

        def _get_val(x, dflt=-999.):
            if PoolOpts.FAST_MODE:
                return dflt
            return x

        totMemSize = _get_val(self.dataHeader.memSize, dflt=0.)

        def _safe_div(num, den):
            return num / den if den != 0. else 0.

        print(PoolOpts.HDR_FORMAT % ("Mem Size", "Disk Size", "Size/Evt",
                                     "MissZip/Mem", "items",
                                     "(X) Container Name (X=Tree|Branch)"))
        print(PoolOpts.ROW_FORMAT % (
            _get_val(_safe_div(self.dataHeader.memSizeNoZip,
        for d in data:
            totMemSize += 0. if PoolOpts.FAST_MODE else d.memSize
            totDiskSize += d.diskSize
            memSizeNoZip = d.memSizeNoZip / d.memSize if d.memSize != 0. else 0.
            aug = d.augName
            totMemSizeA[aug] = totMemSizeA.get(aug, 0.) + d.memSize
            totDiskSizeA[aug] = totDiskSizeA.get(aug, 0.) + d.diskSize
            print(PoolOpts.ROW_FORMAT % (
                _get_val(d.memSize),
                _get_val(memSizeNoZip),
                "(" + d.dirType + ") " + d.name
            print(PoolOpts.ROW_FORMAT % (
                totMemSizeA[a], totDiskSizeA[a],
                "Aug Stream: " + ('MAIN' if a == '' else a)

        print(PoolOpts.ROW_FORMAT % (
            totMemSize, totDiskSize,
            "TOTAL (POOL containers)"
        if PoolOpts.FAST_MODE:
            print("::: warning: FAST_MODE was enabled: some columns' content "
                  "is meaningless...")
            print("Can't perform a detailedDump with a shelve file as input !")

        if bufferName is None:
            bufferName = "/dev/stdout"
        out = open(bufferName, "w")

        save_stdout_fileno = os.dup(sys.stdout.fileno())
        os.dup2(out.fileno(), sys.stdout.fileno())
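        # The dup/dup2 pair above redirects the process-level stdout file
        # descriptor into 'out', so output produced on the C++ side of ROOT
        # also lands in the dump buffer; the saved descriptor is used further
        # down to restore stdout.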
        out.write("#" * 80 + os.linesep)
        out.write("## detailed dump" + os.linesep)

        for key in self.keys:
            name = tree.GetName()
            if PoolOpts.isDataHeader(name) or \
               PoolOpts.isData(name):
                try:
                    print("=== [%s] ===" % name, file=sys.stderr)
                except Exception as err:
                    print("Caught:", err, file=sys.stderr)
                    print(sys.exc_info()[0], file=sys.stderr)
                    print(sys.exc_info()[1], file=sys.stderr)

        out.write("#" * 80 + os.linesep)
        out.write("#" * 80 + os.linesep)
        if bufferName != "<stdout>":
            sys.stdout = open(save_stdout_fileno, 'a')
        Return the PoolRecord with the given (branch) name.
        Raise KeyError if no match is found.
        """
        for data in self.data:
            if data.name == name:
                return data
        raise KeyError("No PoolRecord with name [%s]" % name)
        Save all the gathered information into a python shelve or a CSV file
        (depending on the `fileName` extension).
        """
        if os.path.splitext(fileName)[-1] == '.csv':
            return self._save_csv_report(fileName)
        return self._save_shelve_report(fileName)
        Save all the gathered information into a python shelve.
        Data can then be read back like so:
         >>> import shelve
         >>> db = shelve.open('myfile.dat', 'r')
         >>> report = db['report']
         >>> print('fileSize:', report['fileSize'])
         >>> print('dataHeader/memSize:', report['dataHeader'].memSize)
         >>> for d in report['data']:
         ...     print('data:', d.name, d.nEntries, d.memSize)
        if os.path.exists(fileName):
            os.unlink(fileName)
        db = shelve.open(fileName)
        db['report'] = {
            'fileInfos' : self._fileInfos,
            'nbrEvts'   : self.dataHeader.nEntries,
            'dataHeader': self.dataHeader,
        Save all the gathered information into a CSV file.
        """
        import csv
        if os.path.exists(fileName):
            os.unlink(fileName)
        args = {'newline': ''}
        f = open(fileName, 'w', **args)
        o = csv.writer(f)
        o.writerow(['file name', self._fileInfos['name']])
        o.writerow(['file size', self._fileInfos['size']])
        o.writerow(['nbr evts', self.dataHeader.nEntries])
        o.writerow(['mem size', 'disk size', 'mem size nozip', 'items',
                    'container name', 'branch type'])
        for d in self.data:
            o.writerow([d.memSize, d.diskSize, d.memSizeNoZip,
                        d.nEntries, d.name, d.dirType])
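        # Read-back sketch for the CSV written above (illustrative file name):
        #   import csv
        #   with open('myfile.report.csv', newline='') as f:
        #       rows = list(csv.reader(f))
        #   # rows[0] is ['file name', ...]; per-container rows follow the
        #   # ['mem size', 'disk size', ...] header row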
        if self.poolFile and hasattr(self.poolFile, 'Close'):
            try:
                self.poolFile.Close()
            except Exception as err:
                print("WARNING:", err)
    A helper class to compare two POOL files and check that they match, both in
    terms of the containers' content and the containers' sizes.
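    # Usage sketch (illustrative; the file names are hypothetical):
    #   diff = DiffFiles(refFileName='ref.pool.root', chkFileName='chk.pool.root',
    #                    strict=True)
    #   # human-readable comparison results accumulate in diff.summary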
    def __init__(self, refFileName, chkFileName, verbose=False, ignoreList=None, strict=False):
        object.__init__(self)

        refFileName = os.path.expandvars(os.path.expanduser(refFileName))
        chkFileName = os.path.expandvars(os.path.expanduser(chkFileName))

        if ignoreList is None:
            ignoreList = []
        except Exception as err:
            print("## Caught exception [%s] !!" % str(err.__class__))
            print("## What:", err)
            print(sys.exc_info()[0])
            print(sys.exc_info()[1])
            err = "Error while opening POOL files !"
            err += " chk : %s%s" % (chkFileName, os.linesep)
            err += " ref : %s%s" % (refFileName, os.linesep)
998 "::: Comparing POOL files...",
999 " ref : %s" % self.
refFile._fileInfos[
'name'],
1000 " chk : %s" % self.
chkFile._fileInfos[
'name'],
1004 if self.
chkFile.dataHeader.nEntries != \
1005 self.
refFile.dataHeader.nEntries :
1007 "## WARNING: files don't have the same number of entries !!",
1008 " ref : %r" % self.
refFile.dataHeader.nEntries,
1009 " chk : %r" % self.
chkFile.dataHeader.nEntries,
        if chkNames != refNames:
                "## ERROR: files don't have the same content !!",
            addNames = [n for n in chkNames if n not in refNames]
            if len(addNames) > 0:
                self.summary += ["## collections in 'chk' and not in 'ref'"]
                for n in addNames:
                    self.summary += [" + %s" % n]
            subNames = [n for n in refNames if n not in chkNames]
            if len(subNames) > 0:
                self.summary += ["## collections in 'ref' and not in 'chk'"]
                for n in subNames:
                    self.summary += [" - %s" % n]
            self.summary += ["## Ignoring the following:"]

        commonContent = [d for d in chkNames
                         if (d in refNames and d not in self.ignList)]

        self.summary += ["::: comparing common content (mem-size / disk-size)..."]
        for name in commonContent:
            if chkMemSize != refMemSize or (self.strict and chkDiskSize != refDiskSize):
                    "[ERR] %12.3f / %12.3f kb (ref) ==> %12.3f / %12.3f kb (chk) | %s" % \
                    (refMemSize, refDiskSize, chkMemSize, chkDiskSize, name)
                    " [OK] %12.3f/%12.3f kb | %s" % \
                    (chkMemSize, chkDiskSize, name)
        else: self.summary += ["## Comparison : [ERR]"]

            out.write(i + os.linesep)
    A counter just contains an item list (pairs of class-name/SG-key) and the size.

        object.__init__(self)