ATLAS Offline Software
Loading...
Searching...
No Matches
HistogramMerger.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2026 CERN for the benefit of the ATLAS collaboration
3*/
4
8
9#include "H5Cpp.h"
10
11#include <cstdint>
12#include <memory>
13#include <numeric>
14#include <sstream>
15#include <stdexcept>
16#include <string>
17#include <utility>
18#include <vector>
19
20
21// ============================================================
22// File-local helpers
23// ============================================================
24
25namespace {
26
27using namespace H5Utils::hist::detail;
28
29// ------------------------------------------------------------------
30// Attribute read helpers
31// ------------------------------------------------------------------
32std::string read_str_attr(const H5::H5Object& obj,
33 const std::string& key)
34{
35 H5::Attribute attr = obj.openAttribute(key);
36 std::string val;
37 attr.read(attr.getDataType(), val);
38 return val;
39}
40
41bool read_bool_attr(const H5::H5Object& obj, const std::string& key) {
42 H5::Attribute attr = obj.openAttribute(key);
43 uint8_t val = 0;
44 attr.read(H5::PredType::NATIVE_UINT8, &val);
45 return val != 0;
46}
47
48double read_double_attr(const H5::H5Object& obj, const std::string& key) {
49 H5::Attribute attr = obj.openAttribute(key);
50 double val = 0.0;
51 attr.read(H5::PredType::NATIVE_DOUBLE, &val);
52 return val;
53}
54
55int64_t read_int_attr(const H5::H5Object& obj, const std::string& key) {
56 H5::Attribute attr = obj.openAttribute(key);
57 int64_t val = 0;
58 attr.read(H5::PredType::NATIVE_INT64, &val);
59 return val;
60}
61
62// ------------------------------------------------------------------
63// VL-string category reader (RAII-safe)
64// ------------------------------------------------------------------
65std::vector<std::string> read_string_categories(const H5::DataSet& ds)
66{
67 H5::StrType strtype(H5::PredType::C_S1, H5T_VARIABLE);
68 H5::DataSpace sp = ds.getSpace();
69 hsize_t npts = static_cast<hsize_t>(sp.getSimpleExtentNpoints());
70 std::vector<char*> ptrs(npts, nullptr);
71 // RAII guard must be created before ds.read so that H5Treclaim is
72 // called even if ds.read throws after partial allocation.
73 // H5Treclaim on nullptr entries is a no-op, so early construction
74 // is safe.
75 struct VlGuard {
76 hid_t tid, sid;
77 void* buf;
78 ~VlGuard() { H5Treclaim(tid, sid, H5P_DEFAULT, buf); }
79 } guard{strtype.getId(), sp.getId(), ptrs.data()};
80 ds.read(ptrs.data(), strtype);
81 std::vector<std::string> labels;
82 labels.reserve(npts);
83 for (char* p : ptrs) {
84 labels.emplace_back(p ? p : "");
85 }
86 return labels;
87}
88
89// ------------------------------------------------------------------
90// Axis I/O
91// ------------------------------------------------------------------
92std::vector<Axis> read_axes(const H5::Group& grp) {
93 H5::Group ref_axes = grp.openGroup("ref_axes");
94 std::vector<Axis> axes;
95 for (int n = 0; ; ++n) {
96 std::string ax_name = "axis_" + std::to_string(n);
97 htri_t exists = H5Lexists(ref_axes.getId(), ax_name.c_str(), H5P_DEFAULT);
98 if (exists <= 0) break;
99
100 H5::Group ax_grp = ref_axes.openGroup(ax_name);
101 Axis ax;
102
103 std::string type = read_str_attr(ax_grp, "type");
104 if (type == "regular") {
105 double lower = read_double_attr(ax_grp, "lower");
106 double upper = read_double_attr(ax_grp, "upper");
107 int64_t bins = read_int_attr (ax_grp, "bins");
108 ax.edges = regular_axis_t{lower, upper, static_cast<size_t>(bins)};
109 } else if (type == "variable") {
110 H5::DataSet edge_ds = ax_grp.openDataSet("edges");
111 hsize_t npts = edge_ds.getSpace().getSimpleExtentNpoints();
112 std::vector<double> edges(npts);
113 edge_ds.read(edges.data(), H5::PredType::NATIVE_DOUBLE);
114 ax.edges = std::move(edges);
115 } else if (type == "integer") {
116 int64_t start = read_int_attr(ax_grp, "start");
117 int64_t stop = read_int_attr(ax_grp, "stop");
118 ax.edges = std::pair<int64_t,int64_t>{start, stop};
119 } else if (type == "category") {
120 H5::DataSet cat_ds = ax_grp.openDataSet("categories");
121 H5::DataType dtype = cat_ds.getDataType();
122 hsize_t npts = cat_ds.getSpace().getSimpleExtentNpoints();
123 if (dtype.getClass() == H5T_STRING) {
124 ax.edges = read_string_categories(cat_ds);
125 } else {
126 std::vector<int64_t> int_vals(npts);
127 cat_ds.read(int_vals.data(), H5::PredType::NATIVE_INT64);
128 ax.edges = std::move(int_vals);
129 }
130 }
131
132 ax.underflow = read_bool_attr(ax_grp, "underflow");
133 ax.overflow = read_bool_attr(ax_grp, "overflow");
134
135 // Read optional axis name from metadata subgroup
136 htri_t has_meta = H5Lexists(ax_grp.getId(), "metadata", H5P_DEFAULT);
137 if (has_meta > 0) {
138 H5::Group meta = ax_grp.openGroup("metadata");
139 if (meta.attrExists("metadata")) {
140 ax.name = read_str_attr(meta, "metadata");
141 }
142 }
143
144 axes.push_back(std::move(ax));
145 }
146 return axes;
147}
148
149// ------------------------------------------------------------------
150// Storage dimension helpers
151// ------------------------------------------------------------------
152
153// Read the shape of the values dataset (authoritative dimensions).
154std::vector<hsize_t> read_dims(const H5::Group& grp) {
155 H5::DataSet ds = grp.openGroup("storage").openDataSet("values");
156 H5::DataSpace sp = ds.getSpace();
157 int ndims = sp.getSimpleExtentNdims();
158 std::vector<hsize_t> dims(static_cast<size_t>(ndims));
159 sp.getSimpleExtentDims(dims.data());
160 return dims;
161}
162
163size_t flat_size(const std::vector<hsize_t>& dims) {
164 return std::accumulate(dims.begin(), dims.end(),
165 hsize_t(1), std::multiplies<hsize_t>{});
166}
167
168// ------------------------------------------------------------------
169// Dataset read/write helpers (templated)
170// ------------------------------------------------------------------
171
172// Returns {data, on-disk HDF5 type}. The dtype is an independent copy
173// (HDF5 calls H5Tcopy internally in getDataType()) so the caller owns it.
174template <typename T>
175std::pair<std::vector<T>, H5::DataType>
176read_dataset(const H5::Group& grp, const std::string& name)
177{
178 H5::DataSet ds = grp.openDataSet(name);
179 H5::DataType dtype = ds.getDataType();
180 hsize_t n = ds.getSpace().getSimpleExtentNpoints();
181 std::vector<T> data(n);
182 ds.read(data.data(), hdf5_t<T>);
183 return {std::move(data), dtype};
184}
185
186// Returns a checker lambda that validates p.second == expected and returns p.first.
187auto get_type_checker(const H5::DataType& expected) {
188 return [expected](auto p) {
189 if (p.second != expected)
190 throw std::runtime_error(
191 "histogram datasets have inconsistent HDF5 types");
192 return std::move(p.first);
193 };
194}
195
196// Writes data with memory type T but on-disk type file_dtype.
197// HDF5 performs the conversion automatically (e.g. double→float).
198template <typename T>
199void write_dataset(H5::Group& grp,
200 const std::string& name,
201 const std::vector<T>& data,
202 const std::vector<hsize_t>& dims,
203 const H5::DataType& file_dtype)
204{
205 H5::DataSpace space(static_cast<int>(dims.size()), dims.data());
206 H5::DSetCreatPropList props;
207 props.setChunk(static_cast<int>(dims.size()), dims.data());
208 props.setDeflate(7);
209 chkerr(H5Pset_dset_no_attrs_hint(props.getId(), true), "no attribute hint");
210 if (space.getSelectNpoints() != static_cast<hssize_t>(data.size()))
211 throw std::runtime_error(
212 "write_dataset: dataspace size does not match data size");
213 grp.createDataSet(name, file_dtype, space, props)
214 .write(data.data(), hdf5_t<T>);
215}
216
217// ------------------------------------------------------------------
218// Axes consistency check
219// ------------------------------------------------------------------
220void check_axes_consistent(const std::vector<Axis>& stored,
221 const std::vector<Axis>& incoming)
222{
223 if (stored != incoming)
224 throw std::runtime_error("histogram axes are inconsistent between input files");
225}
226
227// ------------------------------------------------------------------
228// Group navigation/creation helper
229// ------------------------------------------------------------------
230H5::Group open_or_create_group(H5::Group& parent, const std::string& name) {
231 htri_t exists = H5Lexists(parent.getId(), name.c_str(), H5P_DEFAULT);
232 if (exists > 0) return parent.openGroup(name);
233 return parent.createGroup(name);
234}
235
236// ==================================================================
237// Concrete histogram implementations
238// ==================================================================
239
240// ------------------------------------------------------------------
241// SimpleHistogram<T>: "double" and "int" storage types
242// ------------------------------------------------------------------
243template <typename T>
244class SimpleHistogram : public H5Utils::hist::IHistogram {
245public:
246 SimpleHistogram(std::vector<Axis> axes,
247 std::vector<hsize_t> dims)
248 : m_axes(std::move(axes))
249 , m_dims(std::move(dims))
250 , m_values(flat_size(m_dims), T(0))
251 {}
252
253 void add(const H5::Group& src) override {
254 check_axes_consistent(m_axes, read_axes(src));
255 H5::Group storage = src.openGroup("storage");
256 auto [v, dtype] = read_dataset<T>(storage, "values");
257 m_storage_dtype = dtype;
258 if (v.size() != m_values.size())
259 throw std::runtime_error(
260 "histogram bin count in input file does not match expected size");
261 for (size_t i = 0; i < m_values.size(); ++i) m_values[i] += v[i];
262 }
263
264 void write(H5::Group& parent, const std::string& name) const override {
265 H5::Group grp = parent.createGroup(name);
266 write_int_attr(grp, "uhi_schema", 1);
267 grp.createGroup("metadata");
268 grp.createGroup("writer_info");
269 write_axes(grp, m_axes);
270
271 H5::Group storage = grp.createGroup("storage");
272 write_str_attr(storage, "type", std::is_integral_v<T> ? "int" : "double");
273 write_dataset<T>(storage, "values", m_values, m_dims, m_storage_dtype);
274 }
275
276private:
277 std::vector<Axis> m_axes;
278 std::vector<hsize_t> m_dims;
279 std::vector<T> m_values;
280 H5::DataType m_storage_dtype; // captured from "values" in add()
281};
282
283// ------------------------------------------------------------------
284// WeightedHistogram: "weighted" storage type
285// ------------------------------------------------------------------
286class WeightedHistogram : public H5Utils::hist::IHistogram {
287public:
288 WeightedHistogram(std::vector<Axis> axes, std::vector<hsize_t> dims)
289 : m_axes(std::move(axes))
290 , m_dims(std::move(dims))
291 , m_values(flat_size(m_dims), 0.0)
292 , m_variances(flat_size(m_dims), 0.0)
293 {}
294
295 void add(const H5::Group& src) override {
296 check_axes_consistent(m_axes, read_axes(src));
297 H5::Group storage = src.openGroup("storage");
298 auto [v, dtype] = read_dataset<double>(storage, "values");
299 m_storage_dtype = dtype;
300 auto check = get_type_checker(dtype);
301 std::vector<double> w = check(read_dataset<double>(storage, "variances"));
302 if (v.size() != m_values.size() || w.size() != m_variances.size())
303 throw std::runtime_error(
304 "histogram bin count in input file does not match expected size");
305 for (size_t i = 0; i < m_values.size(); ++i) {
306 m_values[i] += v[i];
307 m_variances[i] += w[i];
308 }
309 }
310
311 void write(H5::Group& parent, const std::string& name) const override {
312 H5::Group grp = parent.createGroup(name);
313 write_int_attr(grp, "uhi_schema", 1);
314 grp.createGroup("metadata");
315 grp.createGroup("writer_info");
316 write_axes(grp, m_axes);
317
318 H5::Group storage = grp.createGroup("storage");
319 write_str_attr(storage, "type", "weighted");
320 write_dataset<double>(storage, "values", m_values, m_dims, m_storage_dtype);
321 write_dataset<double>(storage, "variances", m_variances, m_dims, m_storage_dtype);
322 }
323
324private:
325 std::vector<Axis> m_axes;
326 std::vector<hsize_t> m_dims;
327 std::vector<double> m_values;
328 std::vector<double> m_variances;
329 H5::DataType m_storage_dtype; // captured from "values" in add()
330};
331
332// ------------------------------------------------------------------
333// MeanHistogram: "mean" storage type
334//
335// On disk: values = mean, variances = variance, count = n
336// Internal: m_sums = sum(count * mean), m_M2 = sum(count * variance) accumulated
337// with Chan's parallel formula for numerically stable variance.
338// ------------------------------------------------------------------
339class MeanHistogram : public H5Utils::hist::IHistogram {
340public:
341 MeanHistogram(std::vector<Axis> axes, std::vector<hsize_t> dims)
342 : m_axes(std::move(axes))
343 , m_dims(std::move(dims))
344 , m_n(flat_size(m_dims), 0.0)
345 , m_sums(flat_size(m_dims), 0.0)
346 , m_M2(flat_size(m_dims), 0.0)
347 {}
348
349 void add(const H5::Group& src) override {
350 check_axes_consistent(m_axes, read_axes(src));
351 H5::Group storage = src.openGroup("storage");
352 auto [means, dtype] = read_dataset<double>(storage, "values");
353 m_storage_dtype = dtype;
354 auto check = get_type_checker(dtype);
355 std::vector<double> counts = check(read_dataset<double>(storage, "count"));
356 // variances may not be present; default to zero
357 std::vector<double> variances(m_n.size(), 0.0);
358 htri_t has_var = H5Lexists(storage.getId(), "variances", H5P_DEFAULT);
359 if (has_var > 0) {
360 variances = check(read_dataset<double>(storage, "variances"));
361 }
362
363 if (counts.size() != m_n.size() || means.size() != m_n.size())
364 throw std::runtime_error(
365 "histogram bin count in input file does not match expected size");
366
367 for (size_t i = 0; i < m_n.size(); ++i) {
368 double na = m_n[i];
369 double nb = counts[i];
370 double nc = na + nb;
371
372 if (nc == 0.0) continue;
373
374 double mean_b = means[i];
375 double mean_a = (na > 0.0) ? m_sums[i] / na : 0.0;
376 double delta = mean_b - mean_a;
377
378 // Chan's parallel formula for sum of squared deviations.
379 // boost::histogram mean accumulator stores Bessel-corrected (sample)
380 // variance: var = M2 / (n-1), so M2 = var * (n-1).
381 // For n <= 1 the variance is defined as 0, so M2 = 0.
382 double M2_b = (nb > 1.0) ? variances[i] * (nb - 1.0) : 0.0;
383 m_M2[i] += M2_b + delta * delta * na * nb / nc;
384 m_sums[i] = na * mean_a + nb * mean_b; // = nc * new_mean
385 m_n[i] = nc;
386 }
387 }
388
389 void write(H5::Group& parent, const std::string& name) const override {
390 H5::Group grp = parent.createGroup(name);
391 write_int_attr(grp, "uhi_schema", 1);
392 grp.createGroup("metadata");
393 grp.createGroup("writer_info");
394 write_axes(grp, m_axes);
395
396 H5::Group storage = grp.createGroup("storage");
397 write_str_attr(storage, "type", "mean");
398
399 size_t n = m_n.size();
400 std::vector<double> values(n);
401 std::vector<double> variances(n);
402 for (size_t i = 0; i < n; ++i) {
403 if (m_n[i] > 0.0) {
404 values[i] = m_sums[i] / m_n[i];
405 // Write Bessel-corrected (sample) variance to match the input format.
406 variances[i] = (m_n[i] > 1.0) ? m_M2[i] / (m_n[i] - 1.0) : 0.0;
407 }
408 }
409
410 write_dataset<double>(storage, "count", m_n, m_dims, m_storage_dtype);
411 write_dataset<double>(storage, "values", values, m_dims, m_storage_dtype);
412 write_dataset<double>(storage, "variances", variances, m_dims, m_storage_dtype);
413 }
414
415private:
416 std::vector<Axis> m_axes;
417 std::vector<hsize_t> m_dims;
418 std::vector<double> m_n; // accumulated count per bin
419 std::vector<double> m_sums; // accumulated sum(count * mean) per bin
420 std::vector<double> m_M2; // accumulated sum-of-squared-deviations per bin
421 H5::DataType m_storage_dtype; // captured from "values" in add()
422};
423
424// ------------------------------------------------------------------
425// WeightedMeanHistogram: "weighted_mean" storage type
426//
427// On disk: values = weighted mean = sum(w*x)/sum(w),
428// sum_of_weights, sum_of_weights_squared, count
429// Internal: accumulate each sum directly; compute values on write.
430// ------------------------------------------------------------------
431class WeightedMeanHistogram : public H5Utils::hist::IHistogram {
432public:
433 WeightedMeanHistogram(std::vector<Axis> axes, std::vector<hsize_t> dims)
434 : m_axes(std::move(axes))
435 , m_dims(std::move(dims))
436 , m_sum_w(flat_size(m_dims), 0.0)
437 , m_sum_w2(flat_size(m_dims), 0.0)
438 , m_counts(flat_size(m_dims), 0.0)
439 , m_weighted_sums(flat_size(m_dims), 0.0)
440 {}
441
442 void add(const H5::Group& src) override {
443 check_axes_consistent(m_axes, read_axes(src));
444 H5::Group storage = src.openGroup("storage");
445 auto [values, dtype] = read_dataset<double>(storage, "values");
446 m_storage_dtype = dtype;
447 auto check = get_type_checker(dtype);
448 std::vector<double> sum_w = check(
449 read_dataset<double>(storage, "sum_of_weights"));
450 std::vector<double> sum_w2 = check(
451 read_dataset<double>(storage, "sum_of_weights_squared"));
452
453 if (values.size() != m_sum_w.size())
454 throw std::runtime_error(
455 "histogram bin count in input file does not match expected size");
456
457 // count is optional
458 std::vector<double> counts(m_sum_w.size(), 0.0);
459 htri_t has_count = H5Lexists(storage.getId(), "count", H5P_DEFAULT);
460 if (has_count > 0) {
461 counts = check(read_dataset<double>(storage, "count"));
462 }
463
464 for (size_t i = 0; i < m_sum_w.size(); ++i) {
465 m_weighted_sums[i] += sum_w[i] * values[i];
466 m_sum_w[i] += sum_w[i];
467 m_sum_w2[i] += sum_w2[i];
468 m_counts[i] += counts[i];
469 }
470 }
471
472 void write(H5::Group& parent, const std::string& name) const override {
473 H5::Group grp = parent.createGroup(name);
474 write_int_attr(grp, "uhi_schema", 1);
475 grp.createGroup("metadata");
476 grp.createGroup("writer_info");
477 write_axes(grp, m_axes);
478
479 H5::Group storage = grp.createGroup("storage");
480 write_str_attr(storage, "type", "weighted_mean");
481
482 size_t n = m_sum_w.size();
483 std::vector<double> values(n, 0.0);
484 for (size_t i = 0; i < n; ++i) {
485 if (m_sum_w[i] != 0.0) {
486 values[i] = m_weighted_sums[i] / m_sum_w[i];
487 }
488 }
489
490 write_dataset<double>(storage, "values", values, m_dims, m_storage_dtype);
491 write_dataset<double>(storage, "sum_of_weights", m_sum_w, m_dims, m_storage_dtype);
492 write_dataset<double>(storage, "sum_of_weights_squared", m_sum_w2, m_dims, m_storage_dtype);
493 write_dataset<double>(storage, "count", m_counts, m_dims, m_storage_dtype);
494 }
495
496private:
497 std::vector<Axis> m_axes;
498 std::vector<hsize_t> m_dims;
499 std::vector<double> m_sum_w; // sum of weights
500 std::vector<double> m_sum_w2; // sum of weights squared
501 std::vector<double> m_counts; // unweighted fill count
502 std::vector<double> m_weighted_sums; // sum(weight * value)
503 H5::DataType m_storage_dtype; // captured from "values" in add()
504};
505
506} // anonymous namespace
507
508
509// ==================================================================
510// HistogramMerger implementation
511// ==================================================================
512
513namespace H5Utils::hist {
514
516
517std::unique_ptr<IHistogram>
518HistogramMerger::make(const H5::Group& src) {
519 std::vector<Axis> axes = read_axes(src);
520 std::vector<hsize_t> dims = read_dims(src);
521
522 H5::Group storage = src.openGroup("storage");
523 std::string type = read_str_attr(storage, "type");
524
525 if (type == "double") {
526 return std::make_unique<SimpleHistogram<double>>(
527 std::move(axes), std::move(dims));
528 }
529 if (type == "int") {
530 return std::make_unique<SimpleHistogram<int64_t>>(
531 std::move(axes), std::move(dims));
532 }
533 if (type == "weighted") {
534 return std::make_unique<WeightedHistogram>(
535 std::move(axes), std::move(dims));
536 }
537 if (type == "mean") {
538 return std::make_unique<MeanHistogram>(
539 std::move(axes), std::move(dims));
540 }
541 if (type == "weighted_mean") {
542 return std::make_unique<WeightedMeanHistogram>(
543 std::move(axes), std::move(dims));
544 }
545 throw std::runtime_error("HistogramMerger: unknown UHI storage type: " + type);
546}
547
548void HistogramMerger::add(const std::string& path, const H5::Group& src) {
549 if (!m_hists.count(path)) {
550 m_hists.emplace(path, make(src));
551 }
552 m_hists.at(path)->add(src);
553}
554
555void HistogramMerger::write(H5::Group& root) const {
556 for (const auto& [path, hist] : m_hists) {
557 // Split the absolute HDF5 path (e.g. "/a/b/c") into components.
558 std::vector<std::string> parts;
559 std::istringstream ss(path);
560 std::string token;
561 while (std::getline(ss, token, '/')) {
562 if (!token.empty()) parts.push_back(token);
563 }
564 if (parts.empty()) continue;
565
566 // Navigate/create intermediate groups, then write the histogram.
567 H5::Group parent = root;
568 for (size_t i = 0; i + 1 < parts.size(); ++i) {
569 parent = open_or_create_group(parent, parts[i]);
570 }
571 hist->write(parent, parts.back());
572 }
573}
574
575} //> end namespace H5Utils::hist
char data[hepevt_bytes_allocation_ATLAS]
Definition HepEvt.cxx:11
int upper(int c)
static Double_t sp
static Double_t ss
static const std::vector< std::string > bins
std::unique_ptr< IHistogram > make(const H5::Group &src)
Factory: read the storage type from src and return a zero-initialised concrete IHistogram of the appr...
std::map< std::string, std::unique_ptr< IHistogram > > m_hists
void add(const std::string &path, const H5::Group &src)
Accumulate one UHI histogram group.
void write(H5::Group &root) const
Write every accumulated histogram into the output file.
Abstract interface for a mergeable histogram.
Definition IHistogram.h:20
bool exists(const std::string &filename)
does a file exist
bool add(const std::string &hname, TKey *tobj)
Definition fastadd.cxx:55
void chkerr(herr_t code, const std::string &error)
void write_axes(H5::Group &hist_grp, const std::vector< Axis > &axes)
void write_str_attr(H5::H5Object &obj, const std::string &key, const std::string &val)
const H5::DataType hdf5_t
void write_int_attr(H5::H5Object &obj, const std::string &key, int64_t val)
unsigned long long T
-diff