ATLAS Offline Software
Loading...
Searching...
No Matches
copyRootTree.cxx
Go to the documentation of this file.
1/*
2Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3*/
4
5#include "copyRootTree.h"
6
7// Root tree copy function
8//
9// NOTE: I don't recommend trying to understand the variable buffer
10// classes until you see the general workflow.
11//
12// The main work is done in copy_root_tree below, skip down to that to
13// understand what is going on.
14//
15
16#include "HDF5Utils/HdfTuple.h"
17
18#include "TFile.h"
19#include "TTree.h"
20#include "TLeaf.h"
21
22#include <memory>
23#include <regex>
24#include <iostream>
25#include <set>
26#include <cmath>
27
28// ______________________________________________________________________
29// Variable buffer classes
30//
31// These classes act as glue between HDF and ROOT: they hold pointers
32// to the buffers that ROOT will read into, and as such are
33// "ROOT-side" buffers.
34//
35// The constructors are given a reference to a `VariableFillers`
36// container, which contains the "HDF5-side" logic. Each of the
37// VariableFiller objects contains a "filler" function that the HDF5
38// writer will call to read from the ROOT buffer.
39//
40// Some of the filler functions close over an index vector. This
41// vector is incremented by the HDF5 Writer to read out successive
42// entires in a std::vector stored in the root file.
43//
44// If this all seems complicated, it's because it's the deep internals
45// of the code. Try to understand the copy_root_tree function first.
46
47// Base class, just to hold a virtual destructor
48namespace H5Utils {
49 namespace {
50 class IBuffer
51 {
52 public:
53 virtual ~IBuffer() {}
54 };
55
56// Simple variables types are stored in the `Buffer` class.
57 template <typename T>
58 class Buffer: public IBuffer
59 {
60 public:
61 Buffer(VariableFillers& vars, TTree& tt, const std::string& name);
62 private:
63 T m_buffer;
64 };
65
66// Buffer for vector types
67 template <typename T>
68 class VBuf: public IBuffer
69 {
70 public:
71 // These require an index for the vector (`idx`)
72 VBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
73 const std::string& name, T default_value = T());
74 ~VBuf();
75 private:
76 std::vector<T>* m_buffer;
77 };
78
79// Buffer for vectors of vectors
80 template <typename T>
81 class VVBuf: public IBuffer
82 {
83 public:
84 VVBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
85 const std::string& name, T default_value = T());
86 ~VVBuf();
87 private:
88 std::vector<std::vector<T> >* m_buffer;
89 };
90
91
92 // Selection function
93 //
94 // Define a bool function that returns false if event doesn't
95 // pass TTreeFormula cut.
96 bool passTTreeCut(TTreeFormula* selection) {
97 // GetNdim looks to see how many valid arguements there are.
98 // Returns false for none.
99 if ( !selection || !selection->GetNdim()) return true;
100 Int_t ndata = selection->GetNdata();
101 for(Int_t current = 0; current < ndata; current++) {
102 if (selection->EvalInstance(current) == 0) return false;
103 }
104 return true;
105 }
106
107
108 } // close anonymous namespace
109
110// _____________________________________________________________________
111// main copy root tree function
112//
113// The basic workflow is as follows:
114//
115// - Define the ROOT-side buffers that we'll read the information
116// into. At the same time, define HDF-side objects that read out of
117// these buffers and copy the information into the HDF5 buffer.
118//
119// - Build the output HDF5 files
120//
121// - Loop over the input tree and fill the output files
122//
123// Note that this means the event loop happens last: most of the work
124// is just setting up the read and write buffers.
125
126 void copyRootTree(TTree& tt, H5::Group& fg, const TreeCopyOpts& opts) {
127
128 // define the buffers for root to read into
129 std::vector<std::unique_ptr<IBuffer> > buffers;
130
131 // this keeps track of the things we couldn't read
132 std::set<std::string> skipped;
133
134
135 // Each `VariableFiller` must be constructed with a "filler"
136 // function (or callable object), which takes no arguments and
137 // returns the variable we want to write out. In this case they are
138 // implemented as closures over the buffers that ROOT is reading
139 // into.
140
141 // This is the 1d variables
142 VariableFillers vars;
143 std::vector<size_t> idx_dummy;
144
145 // These are 2d variables (i.e. vector<T> in the root file)
146 //
147 // We also need an index which the HDF5 writer increments as it
148 // fills. This is shared with the ROOT buffers to index entries in
149 // std::vectors
150 VariableFillers vars2d;
151 std::vector<size_t> idx(1,0);
152
153 // 3d variables (index is now 2d)
154 VariableFillers vars3d;
155 std::vector<size_t> idx2(2,0);
156
157 // Iterate over all the leaf names. There are some duplicates in the
158 // list of keys, so we have to build the set ourselves.
159 std::regex branch_filter(opts.branch_regex);
160 TIter next(tt.GetListOfLeaves());
161 TLeaf* leaf;
162 std::set<std::string> leaf_names;
163 while ((leaf = dynamic_cast<TLeaf*>(next()))) {
164 leaf_names.insert(leaf->GetName());
165 }
166 if (leaf_names.size() == 0) throw std::logic_error("no branches found");
167
168 // Loop over all the leafs, assign buffers to each
169 //
170 // These `Buffer` classes are defined above. The buffers turn the
171 // branchs on, so we can set them all off to start.
172 tt.SetBranchStatus("*", false);
173 for (const auto& lname: leaf_names) {
174 bool keep = true;
175 if (opts.branch_regex.size() > 0) {
176 keep = std::regex_search(lname, branch_filter);
177 }
178 if (opts.verbose) {
179 std::cout << (keep ? "found " : "rejecting ") << lname << std::endl;
180 }
181 if (!keep) continue;
182
183 leaf = tt.GetLeaf(lname.c_str());
184 std::string branchName = leaf->GetBranch()->GetName();
185 std::string leaf_type = leaf->GetTypeName();
186 if (leaf_type == "Int_t") {
187 buffers.emplace_back(new Buffer<int>(vars, tt, branchName));
188 } else if (leaf_type == "Float_t") {
189 buffers.emplace_back(new Buffer<float>(vars, tt, branchName));
190 } else if (leaf_type == "Double_t") {
191 buffers.emplace_back(new Buffer<double>(vars, tt, branchName));
192 } else if (leaf_type == "Bool_t") {
193 buffers.emplace_back(new Buffer<bool>(vars, tt, branchName));
194 } else if (leaf_type == "Long64_t") {
195 buffers.emplace_back(new Buffer<long long>(vars, tt, branchName));
196 } else if (leaf_type == "ULong64_t") {
197 buffers.emplace_back(new Buffer<unsigned long long>(vars, tt, branchName));
198 } else if (leaf_type == "UInt_t") {
199 buffers.emplace_back(new Buffer<unsigned int>(vars, tt, branchName));
200 } else if (leaf_type == "UChar_t") {
201 buffers.emplace_back(new Buffer<unsigned char>(vars, tt, branchName));
202 } else if (leaf_type == "vector<float>") {
203 buffers.emplace_back(new VBuf<float>(vars2d, idx, tt, branchName, NAN));
204 } else if (leaf_type == "vector<double>") {
205 buffers.emplace_back(new VBuf<double>(vars2d, idx, tt, branchName, NAN));
206 } else if (leaf_type == "vector<int>") {
207 buffers.emplace_back(new VBuf<int>(vars2d, idx, tt, branchName, 0));
208 } else if (leaf_type == "vector<unsigned int>") {
209 buffers.emplace_back(new VBuf<unsigned int>(vars2d, idx, tt, branchName, 0));
210 } else if (leaf_type == "vector<unsigned char>") {
211 buffers.emplace_back(new VBuf<unsigned char>(vars2d, idx, tt, branchName, 0));
212 } else if (leaf_type == "vector<bool>") {
213 buffers.emplace_back(new VBuf<bool>(vars2d, idx, tt, branchName, 0));
214 } else if (leaf_type == "vector<vector<int> >") {
215 buffers.emplace_back(new VVBuf<int>(vars3d, idx2, tt, branchName, 0));
216 } else if (leaf_type == "vector<vector<unsigned int> >") {
217 buffers.emplace_back(new VVBuf<unsigned int>(vars3d, idx2, tt, branchName, 0));
218 } else if (leaf_type == "vector<vector<unsigned char> >") {
219 buffers.emplace_back(new VVBuf<unsigned char>(vars3d, idx2, tt, branchName, 0));
220 } else if (leaf_type == "vector<vector<float> >") {
221 buffers.emplace_back(new VVBuf<float>(vars3d, idx2, tt, branchName, NAN));
222 } else if (leaf_type == "vector<vector<double> >") {
223 buffers.emplace_back(new VVBuf<double>(vars3d, idx2, tt, branchName, NAN));
224 } else if (leaf_type == "vector<vector<bool> >") {
225 buffers.emplace_back(new VVBuf<bool>(vars3d, idx2, tt, branchName, 0));
226 } else {
227 skipped.insert(std::move(leaf_type));
228 }
229 }
230
231 // Build HDF5 Outputs
232 //
233 // In the simple case where we're not reading vectors, we store one
234 // dataset with the same name as the tree. If there are vectors, we
235 // instead create a group with the same name as the tree, and name
236 // the datasets 1d, 2d, etc.
237 //
238 const std::string tree_name = tt.GetName();
239
240 std::unique_ptr<WriterXd> writer1d;
241 std::unique_ptr<WriterXd> writer2d;
242 std::unique_ptr<WriterXd> writer3d;
243 std::unique_ptr<H5::Group> top_group;
244 if (opts.vector_lengths.size() > 0) {
245 if (opts.vector_lengths.size() > 2) throw std::logic_error(
246 "we don't support outputs with rank > 3");
247 size_t length = opts.vector_lengths.at(0);
248 top_group.reset(new H5::Group(fg.createGroup(tree_name)));
249 if (opts.vector_lengths.size() > 1) {
250 size_t length2 = opts.vector_lengths.at(1);
251 if (vars3d.size() > 0) {
252 writer3d.reset(new WriterXd(*top_group, "3d", std::move(vars3d),
253 {length, length2}, opts.chunk_size));
254 }
255 }
256 if (vars2d.size() > 0) {
257 writer2d.reset(new WriterXd(*top_group, "2d", std::move(vars2d),
258 {length}, opts.chunk_size));
259 }
260 if (vars.size() > 0) {
261 writer1d.reset(new WriterXd(*top_group, "1d",
262 std::move(vars), {}, opts.chunk_size));
263 }
264 } else {
265 if (vars.size() > 0) {
266 writer1d.reset(new WriterXd(fg, tree_name, std::move(vars), {}, opts.chunk_size));
267 }
268 }
269
270 // Main event loop
271 //
272 // Very little actually happens here since the buffers are already
273 // defined, as are the HDF5 reader functions.
274 //
275
276 // Get the selection string and build a new TTreeFormula
277 std::string cut_string = opts.selection;
278 const char * cut_char = cut_string.c_str();
279 TTreeFormula *cut =0;
280 if(!cut_string.empty()){
281 // This is so a cut can be applied without requiring the
282 // branch to be output to the hdf5 file.
283 tt.SetBranchStatus("*", true);
284 cut = new TTreeFormula("selection", cut_char, &tt);
285 }
286
287 size_t n_entries = tt.GetEntries();
288 if (opts.n_entries) n_entries = std::min(n_entries, opts.n_entries);
289 int print_interval = opts.print_interval;
290 if (print_interval == -1) {
291 print_interval = std::max(1UL, n_entries / 100);
292 }
293
294 for (size_t iii = 0; iii < n_entries; iii++) {
295 if (print_interval && (iii % print_interval == 0)) {
296 std::cout << "events processed: " << iii
297 << " (" << std::round(iii*1e2 / n_entries) << "% of "
298 << n_entries << ")" << std::endl;
299 }
300 tt.GetEntry(iii);
301 if(cut) cut->UpdateFormulaLeaves();
302 if (!passTTreeCut(cut)) continue;
303 if (writer1d) writer1d->fillWhileIncrementing(idx_dummy);
304 if (writer2d) writer2d->fillWhileIncrementing(idx);
305 if (writer3d) writer3d->fillWhileIncrementing(idx2);
306 }
307
308 // Flush the memory buffers on the HDF5 side. (This is done by the
309 // destructor automatically, but we do it here to make any errors
310 // more explicit.)
311 if (writer1d) writer1d->flush();
312 if (writer2d) writer2d->flush();
313 if (writer3d) writer3d->flush();
314
315 // Print the names of any classes that we were't able to read from
316 // the root file.
317 if (opts.verbose) {
318 for (const auto& name: skipped) {
319 std::cerr << "could not read branch of type " << name << std::endl;
320 }
321 }
322 } // end copyRootTree
323
324// ______________________________________________________________________
325// Buffer implementation
326
327 namespace {
328// 1d buffer
329
330 template <typename T>
331 Buffer<T>::Buffer(VariableFillers& vars, TTree& tt,
332 const std::string& name)
333 {
334 tt.SetBranchStatus(name.c_str(), true);
335 tt.SetBranchAddress(name.c_str(), &m_buffer);
336 T& buf = m_buffer;
337 vars.add<T>(name, [&buf](){return buf;});
338 }
339
340// 2d buffer
341 template <typename T>
342 VBuf<T>::VBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
343 const std::string& name, T default_value):
344 m_buffer(new std::vector<T>)
345 {
346 tt.SetBranchStatus(name.c_str(), true);
347 tt.SetBranchAddress(name.c_str(), &m_buffer);
348 std::vector<T>& buf = *m_buffer;
349 auto filler = [&buf, &idx, default_value]() -> T {
350 if (idx.at(0) < buf.size()) {
351 return buf.at(idx.at(0));
352 } else {
353 return default_value;
354 }
355 };
356 vars.add<T>(name, filler);
357 }
358 template <typename T>
359 VBuf<T>::~VBuf() {
360 delete m_buffer;
361 }
362
363
364// 3d buffers
365 template <typename T>
366 VVBuf<T>::VVBuf(VariableFillers& vars, std::vector<size_t>& idx, TTree& tt,
367 const std::string& name, T default_value):
368 m_buffer(new std::vector<std::vector<T> >)
369 {
370 tt.SetBranchStatus(name.c_str(), true);
371 tt.SetBranchAddress(name.c_str(), &m_buffer);
372 std::vector<std::vector<T> >& buf = *m_buffer;
373 auto filler = [&buf, &idx, default_value]() -> T {
374 size_t idx1 = idx.at(0);
375 size_t idx2 = idx.at(1);
376 if (idx1 < buf.size()) {
377 if (idx2 < buf.at(idx1).size()) {
378 return buf.at(idx1).at(idx2);
379 }
380 }
381 return default_value;
382 };
383 vars.add<T>(name, filler);
384 }
385 template <typename T>
386 VVBuf<T>::~VVBuf() {
387 delete m_buffer;
388 }
389
390 } // close anonymous namespace
391} // close H5 namespace
double length(const pvec &v)
Buffer(std::unique_ptr< EventStorage::DataReader > &&reader, size_t size)
Variable filler arrays.
Definition HdfTuple.h:118
bool passTTreeCut(TTreeFormula &cut)
const std::string selection
HDF5 Tuple Writer.
Definition common.h:20
void copyRootTree(TTree &tt, H5::Group &fg, const TreeCopyOpts &opts)
unsigned long long T
STL namespace.