ATLAS Offline Software
copyRootTree.cxx
Go to the documentation of this file.
1 /*
2 Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3 */
4 
5 #include "copyRootTree.h"
6 
7 // Root tree copy function
8 //
9 // NOTE: I don't recommend trying to understand the variable buffer
10 // classes until you see the general workflow.
11 //
12 // The main work is done in copy_root_tree below, skip down to that to
13 // understand what is going on.
14 //
15 
16 #include "HDF5Utils/HdfTuple.h"
17 
18 #include "TFile.h"
19 #include "TTree.h"
20 #include "TLeaf.h"
21 
22 #include <memory>
23 #include <regex>
24 #include <iostream>
25 #include <set>
26 
27 // ______________________________________________________________________
28 // Variable buffer classes
29 //
30 // These classes act as glue between HDF and ROOT: they hold pointers
31 // to the buffers that ROOT will read into, and as such are
32 // "ROOT-side" buffers.
33 //
34 // The constructors are given a reference to a `VariableFillers`
35 // container, which contains the "HDF5-side" logic. Each of the
36 // VariableFiller objects contains a "filler" function that the HDF5
37 // writer will call to read from the ROOT buffer.
38 //
39 // Some of the filler functions close over an index vector. This
40 // vector is incremented by the HDF5 Writer to read out successive
41 // entires in a std::vector stored in the root file.
42 //
43 // If this all seems complicated, it's because it's the deep internals
44 // of the code. Try to understand the copy_root_tree function first.
45 
46 // Base class, just to hold a virtual destructor
47 namespace H5Utils {
48  namespace {
49  class IBuffer
50  {
51  public:
52  virtual ~IBuffer() {}
53  };
54 
55 // Simple variables types are stored in the `Buffer` class.
56  template <typename T>
57  class Buffer: public IBuffer
58  {
59  public:
60  Buffer(VariableFillers& vars, TTree& tt, const std::string& name);
61  private:
63  };
64 
65 // Buffer for vector types
66  template <typename T>
67  class VBuf: public IBuffer
68  {
69  public:
70  // These require an index for the vector (`idx`)
71  VBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
72  const std::string& name, T default_value = T());
73  ~VBuf();
74  private:
75  std::vector<T>* m_buffer;
76  };
77 
78 // Buffer for vectors of vectors
79  template <typename T>
80  class VVBuf: public IBuffer
81  {
82  public:
83  VVBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
84  const std::string& name, T default_value = T());
85  ~VVBuf();
86  private:
87  std::vector<std::vector<T> >* m_buffer;
88  };
89 
90 
91  // Selection function
92  //
93  // Define a bool function that returns false if event doesn't
94  // pass TTreeFormula cut.
95  bool passTTreeCut(TTreeFormula* selection) {
96  // GetNdim looks to see how many valid arguements there are.
97  // Returns false for none.
98  if ( !selection || !selection->GetNdim()) return true;
99  Int_t ndata = selection->GetNdata();
100  for(Int_t current = 0; current < ndata; current++) {
101  if (selection->EvalInstance(current) == 0) return false;
102  }
103  return true;
104  }
105 
106 
107  } // close anonymous namespace
108 
109 // _____________________________________________________________________
110 // main copy root tree function
111 //
112 // The basic workflow is as follows:
113 //
114 // - Define the ROOT-side buffers that we'll read the information
115 // into. At the same time, define HDF-side objects that read out of
116 // these buffers and copy the information into the HDF5 buffer.
117 //
118 // - Build the output HDF5 files
119 //
120 // - Loop over the input tree and fill the output files
121 //
122 // Note that this means the event loop happens last: most of the work
123 // is just setting up the read and write buffers.
124 
125  void copyRootTree(TTree& tt, H5::Group& fg, const TreeCopyOpts& opts) {
126 
127  // define the buffers for root to read into
128  std::vector<std::unique_ptr<IBuffer> > buffers;
129 
130  // this keeps track of the things we couldn't read
131  std::set<std::string> skipped;
132 
133 
134  // Each `VariableFiller` must be constructed with a "filler"
135  // function (or callable object), which takes no arguments and
136  // returns the variable we want to write out. In this case they are
137  // implemented as closures over the buffers that ROOT is reading
138  // into.
139 
140  // This is the 1d variables
141  VariableFillers vars;
142  std::vector<size_t> idx_dummy;
143 
144  // These are 2d variables (i.e. vector<T> in the root file)
145  //
146  // We also need an index which the HDF5 writer increments as it
147  // fills. This is shared with the ROOT buffers to index entries in
148  // std::vectors
149  VariableFillers vars2d;
150  std::vector<size_t> idx(1,0);
151 
152  // 3d variables (index is now 2d)
153  VariableFillers vars3d;
154  std::vector<size_t> idx2(2,0);
155 
156  // Iterate over all the leaf names. There are some duplicates in the
157  // list of keys, so we have to build the set ourselves.
158  std::regex branch_filter(opts.branch_regex);
159  TIter next(tt.GetListOfLeaves());
160  TLeaf* leaf;
161  std::set<std::string> leaf_names;
162  while ((leaf = dynamic_cast<TLeaf*>(next()))) {
163  leaf_names.insert(leaf->GetName());
164  }
165  if (leaf_names.size() == 0) throw std::logic_error("no branches found");
166 
167  // Loop over all the leafs, assign buffers to each
168  //
169  // These `Buffer` classes are defined above. The buffers turn the
170  // branchs on, so we can set them all off to start.
171  tt.SetBranchStatus("*", false);
172  for (const auto& lname: leaf_names) {
173  bool keep = true;
174  if (opts.branch_regex.size() > 0) {
175  keep = std::regex_search(lname, branch_filter);
176  }
177  if (opts.verbose) {
178  std::cout << (keep ? "found " : "rejecting ") << lname << std::endl;
179  }
180  if (!keep) continue;
181 
182  leaf = tt.GetLeaf(lname.c_str());
183  std::string branchName = leaf->GetBranch()->GetName();
184  std::string leaf_type = leaf->GetTypeName();
185  if (leaf_type == "Int_t") {
186  buffers.emplace_back(new Buffer<int>(vars, tt, branchName));
187  } else if (leaf_type == "Float_t") {
188  buffers.emplace_back(new Buffer<float>(vars, tt, branchName));
189  } else if (leaf_type == "Double_t") {
190  buffers.emplace_back(new Buffer<double>(vars, tt, branchName));
191  } else if (leaf_type == "Bool_t") {
192  buffers.emplace_back(new Buffer<bool>(vars, tt, branchName));
193  } else if (leaf_type == "Long64_t") {
194  buffers.emplace_back(new Buffer<long long>(vars, tt, branchName));
195  } else if (leaf_type == "ULong64_t") {
196  buffers.emplace_back(new Buffer<unsigned long long>(vars, tt, branchName));
197  } else if (leaf_type == "UInt_t") {
198  buffers.emplace_back(new Buffer<unsigned int>(vars, tt, branchName));
199  } else if (leaf_type == "UChar_t") {
200  buffers.emplace_back(new Buffer<unsigned char>(vars, tt, branchName));
201  } else if (leaf_type == "vector<float>") {
202  buffers.emplace_back(new VBuf<float>(vars2d, idx, tt, branchName, NAN));
203  } else if (leaf_type == "vector<double>") {
204  buffers.emplace_back(new VBuf<double>(vars2d, idx, tt, branchName, NAN));
205  } else if (leaf_type == "vector<int>") {
206  buffers.emplace_back(new VBuf<int>(vars2d, idx, tt, branchName, 0));
207  } else if (leaf_type == "vector<unsigned int>") {
208  buffers.emplace_back(new VBuf<unsigned int>(vars2d, idx, tt, branchName, 0));
209  } else if (leaf_type == "vector<unsigned char>") {
210  buffers.emplace_back(new VBuf<unsigned char>(vars2d, idx, tt, branchName, 0));
211  } else if (leaf_type == "vector<bool>") {
212  buffers.emplace_back(new VBuf<bool>(vars2d, idx, tt, branchName, 0));
213  } else if (leaf_type == "vector<vector<int> >") {
214  buffers.emplace_back(new VVBuf<int>(vars3d, idx2, tt, branchName, 0));
215  } else if (leaf_type == "vector<vector<unsigned int> >") {
216  buffers.emplace_back(new VVBuf<unsigned int>(vars3d, idx2, tt, branchName, 0));
217  } else if (leaf_type == "vector<vector<unsigned char> >") {
218  buffers.emplace_back(new VVBuf<unsigned char>(vars3d, idx2, tt, branchName, 0));
219  } else if (leaf_type == "vector<vector<float> >") {
220  buffers.emplace_back(new VVBuf<float>(vars3d, idx2, tt, branchName, NAN));
221  } else if (leaf_type == "vector<vector<double> >") {
222  buffers.emplace_back(new VVBuf<double>(vars3d, idx2, tt, branchName, NAN));
223  } else if (leaf_type == "vector<vector<bool> >") {
224  buffers.emplace_back(new VVBuf<bool>(vars3d, idx2, tt, branchName, 0));
225  } else {
226  skipped.insert(leaf_type);
227  }
228  }
229 
230  // Build HDF5 Outputs
231  //
232  // In the simple case where we're not reading vectors, we store one
233  // dataset with the same name as the tree. If there are vectors, we
234  // instead create a group with the same name as the tree, and name
235  // the datasets 1d, 2d, etc.
236  //
237  const std::string tree_name = tt.GetName();
238 
239  std::unique_ptr<WriterXd> writer1d;
240  std::unique_ptr<WriterXd> writer2d;
241  std::unique_ptr<WriterXd> writer3d;
242  std::unique_ptr<H5::Group> top_group;
243  if (opts.vector_lengths.size() > 0) {
244  if (opts.vector_lengths.size() > 2) throw std::logic_error(
245  "we don't support outputs with rank > 3");
246  size_t length = opts.vector_lengths.at(0);
247  top_group.reset(new H5::Group(fg.createGroup(tree_name)));
248  if (opts.vector_lengths.size() > 1) {
249  size_t length2 = opts.vector_lengths.at(1);
250  if (vars3d.size() > 0) {
251  writer3d.reset(new WriterXd(*top_group, "3d", vars3d,
252  {length, length2}, opts.chunk_size));
253  }
254  }
255  if (vars2d.size() > 0) {
256  writer2d.reset(new WriterXd(*top_group, "2d", vars2d,
257  {length}, opts.chunk_size));
258  }
259  if (vars.size() > 0) {
260  writer1d.reset(new WriterXd(*top_group, "1d",
261  vars, {}, opts.chunk_size));
262  }
263  } else {
264  if (vars.size() > 0) {
265  writer1d.reset(new WriterXd(fg, tree_name, vars, {}, opts.chunk_size));
266  }
267  }
268 
269  // Main event loop
270  //
271  // Very little actually happens here since the buffers are already
272  // defined, as are the HDF5 reader functions.
273  //
274 
275  // Get the selection string and build a new TTreeFormula
276  std::string cut_string = opts.selection;
277  const char * cut_char = cut_string.c_str();
278  TTreeFormula *cut =0;
279  if(!cut_string.empty()){
280  // This is so a cut can be applied without requiring the
281  // branch to be output to the hdf5 file.
282  tt.SetBranchStatus("*", true);
283  cut = new TTreeFormula("selection", cut_char, &tt);
284  }
285 
286  size_t n_entries = tt.GetEntries();
287  if (opts.n_entries) n_entries = std::min(n_entries, opts.n_entries);
288  int print_interval = opts.print_interval;
289  if (print_interval == -1) {
290  print_interval = std::max(1UL, n_entries / 100);
291  }
292 
293  for (size_t iii = 0; iii < n_entries; iii++) {
294  if (print_interval && (iii % print_interval == 0)) {
295  std::cout << "events processed: " << iii
296  << " (" << std::round(iii*1e2 / n_entries) << "% of "
297  << n_entries << ")" << std::endl;
298  }
299  tt.GetEntry(iii);
300  if(cut) cut->UpdateFormulaLeaves();
301  if (!passTTreeCut(cut)) continue;
302  if (writer1d) writer1d->fillWhileIncrementing(idx_dummy);
303  if (writer2d) writer2d->fillWhileIncrementing(idx);
304  if (writer3d) writer3d->fillWhileIncrementing(idx2);
305  }
306 
307  // Flush the memory buffers on the HDF5 side. (This is done by the
308  // destructor automatically, but we do it here to make any errors
309  // more explicit.)
310  if (writer1d) writer1d->flush();
311  if (writer2d) writer2d->flush();
312  if (writer3d) writer3d->flush();
313 
314  // Print the names of any classes that we were't able to read from
315  // the root file.
316  if (opts.verbose) {
317  for (const auto& name: skipped) {
318  std::cerr << "could not read branch of type " << name << std::endl;
319  }
320  }
321  } // end copyRootTree
322 
323 // ______________________________________________________________________
324 // Buffer implementation
325 
326  namespace {
327 // 1d buffer
328 
329  template <typename T>
330  Buffer<T>::Buffer(VariableFillers& vars, TTree& tt,
331  const std::string& name)
332  {
333  tt.SetBranchStatus(name.c_str(), true);
334  tt.SetBranchAddress(name.c_str(), &m_buffer);
335  T& buf = m_buffer;
336  vars.add<T>(name, [&buf](){return buf;});
337  }
338 
339 // 2d buffer
340  template <typename T>
341  VBuf<T>::VBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
342  const std::string& name, T default_value):
343  m_buffer(new std::vector<T>)
344  {
345  tt.SetBranchStatus(name.c_str(), true);
346  tt.SetBranchAddress(name.c_str(), &m_buffer);
347  std::vector<T>& buf = *m_buffer;
348  auto filler = [&buf, &idx, default_value]() -> T {
349  if (idx.at(0) < buf.size()) {
350  return buf.at(idx.at(0));
351  } else {
352  return default_value;
353  }
354  };
355  vars.add<T>(name, filler);
356  }
357  template <typename T>
358  VBuf<T>::~VBuf() {
359  delete m_buffer;
360  }
361 
362 
363 // 3d buffers
364  template <typename T>
365  VVBuf<T>::VVBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
366  const std::string& name, T default_value):
367  m_buffer(new std::vector<std::vector<T> >)
368  {
369  tt.SetBranchStatus(name.c_str(), true);
370  tt.SetBranchAddress(name.c_str(), &m_buffer);
371  std::vector<std::vector<T> >& buf = *m_buffer;
372  auto filler = [&buf, &idx, default_value]() -> T {
373  size_t idx1 = idx.at(0);
374  size_t idx2 = idx.at(1);
375  if (idx1 < buf.size()) {
376  if (idx2 < buf.at(idx1).size()) {
377  return buf.at(idx1).at(idx2);
378  }
379  }
380  return default_value;
381  };
382  vars.add<T>(name, filler);
383  }
384  template <typename T>
385  VVBuf<T>::~VVBuf() {
386  delete m_buffer;
387  }
388 
389  } // close anonymous namespace
390 } // close H5 namespace
H5Utils::TreeCopyOpts
Definition: treeCopyOpts.h:18
fillPileUpNoiseLumi.current
current
Definition: fillPileUpNoiseLumi.py:52
TrigDefs::Group
Group
Properties of a chain group.
Definition: GroupProperties.h:13
max
#define max(a, b)
Definition: cfImp.cxx:41
HdfTuple.h
H5Utils::WriterXd::fillWhileIncrementing
void fillWhileIncrementing(std::vector< size_t > &indices)
Definition: HdfTuple.cxx:90
MuonGM::round
float round(const float toRound, const unsigned int decimals)
Definition: Mdt.cxx:27
CSV_InDetExporter.new
new
Definition: CSV_InDetExporter.py:145
H5Utils::WriterXd
WriterXd.
Definition: HdfTuple.h:154
copyRootTree.h
PrepareReferenceFile.regex
regex
Definition: PrepareReferenceFile.py:43
m_buffer
T m_buffer
Definition: copyRootTree.cxx:62
fillPileUpNoiseLumi.next
next
Definition: fillPileUpNoiseLumi.py:52
python.DecayParser.buf
buf
print ("=> [%s]"cmd)
Definition: DecayParser.py:27
ReadCalibFromCool.keep
keep
Definition: ReadCalibFromCool.py:85
vector
Definition: MultiHisto.h:13
BindingsTest.cut
cut
This script demonstrates how to call a C++ class from Python Also how to use PyROOT is shown.
Definition: BindingsTest.py:13
H5Utils::copyRootTree
void copyRootTree(TTree &tt, H5::Group &fg, const TreeCopyOpts &opts)
Definition: copyRootTree.cxx:125
H5Utils
HDF5 Tuple Writer.
Definition: common.h:20
selection
std::string selection
Definition: fbtTestBasics.cxx:75
min
#define min(a, b)
Definition: cfImp.cxx:40
name
std::string name
Definition: Control/AthContainers/Root/debug.cxx:221
Buffer::Buffer
Buffer(std::unique_ptr< EventStorage::DataReader > &&reader, size_t size)
Definition: trigbs_orderedMerge.cxx:140
Buffer
Definition: trigbs_orderedMerge.cxx:114
H5Utils::WriterXd::flush
void flush()
Definition: HdfTuple.cxx:115
egammaEnergyPositionAllSamples::e2
double e2(const xAOD::CaloCluster &cluster)
return the uncorrected cluster energy in 2nd sampling
LArNewCalib_DelayDump_OFC_Cali.idx
idx
Definition: LArNewCalib_DelayDump_OFC_Cali.py:69
H5Utils::VariableFillers
Variable filler arrays.
Definition: HdfTuple.h:118
athena.opts
opts
Definition: athena.py:88
passTTreeCut
bool passTTreeCut(TTreeFormula &cut)
TileDCSDataPlotter.tt
tt
Definition: TileDCSDataPlotter.py:874
length
double length(const pvec &v)
Definition: FPGATrackSimLLPDoubletHoughTransformTool.cxx:26
TSU::T
unsigned long long T
Definition: L1TopoDataTypes.h:35