ATLAS Offline Software
Loading...
Searching...
No Matches
buffers.py
Go to the documentation of this file.
1# Copyright (C) 2002-2026 CERN for the benefit of the ATLAS collaboration
2#
3# @author Giordon Stark
4
5# Buffer extraction and reconstruction utilities for PythonToolHandle.
6# Generalizes the pattern from test/muon_eff_sf_example_PHYSLITE.py into
7# reusable functions that work with any columnar CP tool.
8
9import itertools
10
11import awkward as ak
12import numpy as np
13
14from ColumnarToolWrapperPython.python_tool_handle import ColumnAccessMode
15
16
def _inner_most_list_offset_array(array):
    """Return the inner-most singly-jagged ListOffsetArray of ``array``.

    For a ``var * var * T`` input (e.g. NumTrkPt500), returns a view whose
    layout is a ``ListOffsetArray`` with ``NumpyArray`` content — i.e. the
    per-particle list structure collapsed across the full event range.
    Use ``result.layout.offsets.data`` for cumulative per-particle inner
    offsets and ``result.layout.content.data`` for the flat numeric buffer.

    Solution from Peter Fackeldey: traverse the layout with ``ak.transform``
    and capture the deepest singly-jagged node.

    Parameters
    ----------
    array:
        An ak.Array; only its ``layout`` attribute is read.

    Returns
    -------
    ak.Array
        Wrapper around the captured ``ListOffsetArray`` node.

    Raises
    ------
    ValueError
        If the layout is branched, or if no ``ListOffsetArray`` with
        ``NumpyArray`` content is found at depth ``purelist_depth - 1``.
    """
    layout = array.layout
    layout_depth = layout.purelist_depth

    is_branched, _ = layout.branch_depth
    if is_branched:
        raise ValueError(
            f"{layout} has branching, cannot extract inner-most ListOffsetArray"
        )

    # The node we want sits one level above the flat numeric leaf.
    target_depth = layout_depth - 1
    found = None

    def _capture(lay, depth, **_kwargs):
        # Visitor for ak.transform: it never returns a replacement, it only
        # records the node that matches the target depth and shape.
        nonlocal found
        if (
            depth == target_depth
            and isinstance(lay, ak.contents.ListOffsetArray)
            and isinstance(lay.content, ak.contents.NumpyArray)
        ):
            found = lay.materialize()

    ak.transform(_capture, layout, return_value="none")

    if found is None:
        raise ValueError(
            "Did not find a singly-jagged ListOffsetArray at the inner-most depth"
        )
    return ak.Array(found)
58
59
61 """Strip the trailing ``.data`` suffix from a nested-vector column name."""
62 return name[:-5] if name.endswith(".data") else name
63
64
def classify_columns(columns):
    """Group ColumnInfo objects by container and role.

    Parameters
    ----------
    columns:
        Iterable of ColumnInfo objects as returned by PythonToolHandle.columns.
        It is materialized into a list first, so one-shot iterables
        (generators) are supported.

    Returns
    -------
    dict
        Keyed by container offset name (e.g. "EventInfo", "Muons"). Each value
        is a dict with:

        - ``"offset"``: the ColumnInfo for this container's offset column
        - ``"inputs"``: list of input ColumnInfo belonging to this container
        - ``"outputs"``: list of output ColumnInfo belonging to this container
        - ``"nested_offsets"``: dict of nested offset name (e.g.
          "Muons.NumTrkPt500.offset") -> dict with ``"offset"`` (the
          ColumnInfo of the nested offset column), ``"inputs"`` and
          ``"outputs"`` (lists of data ColumnInfo that reference it)

    Notes
    -----
    Container offsets have ``is_offset=True`` and an ``offset_name`` of either
    ``''`` (root, e.g. "EventInfo") or the name of another container offset
    (e.g. "Muons" has ``offset_name="EventInfo"``). Nested-vector offsets also
    have ``is_offset=True`` but their name contains a dot; they are stored under
    ``"nested_offsets"`` of their parent container rather than as top-level keys.

    An offset column whose ``offset_name`` does not match any offset column is
    treated as a container. A data column whose ``offset_name`` matches neither
    a container nor a nested offset attached to a container is skipped.
    """
    # Two passes are made over the columns below; materialize once so a
    # generator is not exhausted by the first pass (which would silently drop
    # every data column).
    columns = list(columns)

    # Separate offset columns from data columns
    offset_cols = {col.name: col for col in columns if col.is_offset}
    data_cols = [col for col in columns if not col.is_offset]

    # A nested-vector offset has a dotted name and a parent that is either ''
    # or another offset column; everything else (including offsets whose
    # parent cannot be resolved) is a container offset.
    container_offsets = {}
    nested_offsets_by_container = {}
    for name, col in offset_cols.items():
        parent = col.offset_name
        parent_is_known = parent == "" or parent in offset_cols
        if parent_is_known and "." in name:
            nested_offsets_by_container.setdefault(parent, {})[name] = {
                "offset": col,
                "inputs": [],
                "outputs": [],
            }
        else:
            container_offsets[name] = col

    # Build the classified dict
    classified = {
        name: {
            "offset": col,
            "inputs": [],
            "outputs": [],
            "nested_offsets": nested_offsets_by_container.get(name, {}),
        }
        for name, col in container_offsets.items()
    }

    # Reverse map: nested_offset_name -> parent container name, for routing
    # data columns whose offset_name points to a nested offset.
    nested_to_container = {
        nested_name: container
        for container, nested_map in nested_offsets_by_container.items()
        for nested_name in nested_map
    }

    # Assign data columns to their container or nested-offset bucket
    for col in data_cols:
        target = col.offset_name
        if target in classified:
            bucket = classified[target]
        elif target in nested_to_container:
            container = nested_to_container[target]
            bucket = classified[container]["nested_offsets"][target]
        else:
            # Shouldn't happen with well-formed tool output
            continue

        role = "outputs" if col.access_mode == ColumnAccessMode.output else "inputs"
        bucket[role].append(col)

    return classified
162
163
def resolve_optional_columns(classified, events):
    """Drop optional input columns that ``events`` does not provide.

    An input column is kept when it is not optional, or when its name is one
    of ``ak.fields(events)``. A new outer dict with a shallow copy of each
    container entry is returned; ``classified`` itself is left untouched.
    Only each container's ``"inputs"`` list is rebuilt — every other value
    (including ``"nested_offsets"``) is shared with the input by reference.

    Parameters
    ----------
    classified:
        Output of ``classify_columns``.
    events:
        An ak.Array whose fields are checked for optional column presence.

    Returns
    -------
    dict
        Same structure as ``classify_columns`` output, with absent optional
        columns removed from each container's ``"inputs"`` list.
    """
    present_fields = set(ak.fields(events))

    def _is_usable(column):
        # Mandatory columns always stay; optional ones need a matching field.
        return not column.is_optional or column.name in present_fields

    return {
        container: {
            **entry,
            "inputs": [column for column in entry["inputs"] if _is_usable(column)],
        }
        for container, entry in classified.items()
    }
194
195
def extract_buffers(events, classified):
    """Extract flat numpy buffers from an awkward array.

    Builds a dict mapping column name -> numpy array that covers container
    offsets, nested-vector offsets, and input data columns. Buffers for
    output columns are not created here (allocate_outputs handles those).

    Parameters
    ----------
    events:
        An ak.Array (real or zero-length after typetracer conversion).
    classified:
        Output of classify_columns or resolve_optional_columns.

    Returns
    -------
    dict
        Column name -> numpy array. Offset buffers are stored as uint64.

    Raises
    ------
    RuntimeError
        If zipping a container's flat inputs yields a form that is neither a
        ``RecordForm`` nor a ``ListOffsetForm``.
    """
    n_events = int(ak.num(events, axis=0))
    extracted = {}

    for container_name, info in classified.items():
        nested_offsets = info["nested_offsets"]

        # Nested-vector inputs first: pull inner offsets and flat data out of
        # the inner-most ListOffsetArray of the corresponding branch.
        for nested_key, nested_info in nested_offsets.items():
            for col in nested_info["inputs"]:
                branch = _branch_name_for_column(col.name)
                innermost = _inner_most_list_offset_array(events[branch])
                offsets = np.asarray(innermost.layout.offsets.data)
                first = int(offsets[0])
                last = int(offsets[-1])
                # ROOT baskets can provide a larger raw buffer than the
                # offsets reference — normalize to [0, last-first] range.
                extracted[nested_key] = np.ascontiguousarray(
                    offsets - first, dtype=np.uint64
                )
                extracted[col.name] = np.ascontiguousarray(
                    innermost.layout.content.data[first:last]
                )

        flat_inputs = info["inputs"]

        if not flat_inputs:
            # Without flat inputs the container offset comes from the outer
            # layout of the first nested-vector input if there is one;
            # otherwise a single [0, n_events] span is synthesized so that
            # outputs can still be sized.
            first_nested_input = next(
                (
                    col
                    for nested_info in nested_offsets.values()
                    for col in nested_info["inputs"]
                ),
                None,
            )
            if first_nested_input is None:
                extracted[container_name] = np.array(
                    [0, n_events], dtype=np.uint64
                )
            else:
                branch = _branch_name_for_column(first_nested_input.name)
                extracted[container_name] = np.ascontiguousarray(
                    events[branch].layout.offsets.data, dtype=np.uint64
                )
            continue

        # Bucket the flat inputs by offset_name (keeping their relative
        # order), then zip + to_buffers each bucket in sorted-key order.
        by_offset = {}
        for col in flat_inputs:
            by_offset.setdefault(col.offset_name, []).append(col)

        for offset_name in sorted(by_offset):
            cols = by_offset[offset_name]
            zipped = ak.zip({col.name: events[col.name] for col in cols})

            # NB: form_key not crucial, but helpful for debugging
            form, length, raw_buffers = ak.to_buffers(
                zipped, form_key=f"{offset_name}{{id}}"
            )

            if isinstance(form, ak.forms.RecordForm):
                # EventInfo-like: one record per event.
                # Convert each field with ak.to_numpy rather than reading
                # raw_buffers: when events is masked/indexed,
                # form.content(field) is an IndexedForm and the "-data" key
                # doesn't exist in raw_buffers.
                extracted[container_name] = np.array(
                    [0, length], dtype=np.uint64
                )
                for col in cols:
                    extracted[col.name] = ak.to_numpy(events[col.name])
            elif isinstance(form, ak.forms.ListOffsetForm):
                # Particle container: the C++ side wants uint64 offsets.
                offsets_key = next(
                    key for key in raw_buffers if key.endswith("-offsets")
                )
                extracted[container_name] = np.asarray(
                    raw_buffers[offsets_key]
                ).astype(np.uint64)

                # Per-field data buffers live under the inner RecordForm.
                record_form = form.content
                for field in record_form.fields:
                    data_key = f"{record_form.content(field).form_key}-data"
                    extracted[field] = np.asarray(raw_buffers[data_key])
            else:
                raise RuntimeError(
                    f"Cannot handle form {type(form)} for "
                    f"container {container_name}"
                )

    return extracted
306
307
def allocate_outputs(classified, buffer_dict):
    """Allocate zero-filled numpy arrays for each output column.

    Sizes each output array using ``offsets[-1]`` of the referenced offset
    buffer. Arrays are added into ``buffer_dict`` in-place and also returned.

    Parameters
    ----------
    classified:
        Output of ``classify_columns`` or ``resolve_optional_columns``.
    buffer_dict:
        Dict of column name -> numpy array, as returned by ``extract_buffers``.
        Modified in-place to include the newly allocated output arrays.

    Returns
    -------
    dict
        Mapping of output column name -> zero-filled numpy array (same objects
        also inserted into ``buffer_dict``).

    Raises
    ------
    NotImplementedError
        If any output column is attached to a nested-vector offset. This is
        checked before anything is allocated.
    RuntimeError
        If the offset buffer an output column refers to is missing from
        ``buffer_dict``.
    """

    def _reject_nested_output(col):
        raise NotImplementedError(
            f"Nested-vector output columns are not supported "
            f"(column '{col.name}' has nested offset '{col.offset_name}')"
        )

    # classify_columns files outputs that reference a nested offset under
    # nested_offsets[<name>]["outputs"], not under the container's own
    # "outputs" list, so those buckets must be inspected explicitly —
    # otherwise such columns would be skipped without any diagnostic.
    nested_offset_names = set()
    for info in classified.values():
        for nested_name, nested in info["nested_offsets"].items():
            nested_offset_names.add(nested_name)
            for col in nested.get("outputs", ()):
                _reject_nested_output(col)

    output_buffers = {}
    for info in classified.values():
        for col in info["outputs"]:
            # Also covers hand-built ``classified`` dicts that list a
            # nested-offset output directly on the container.
            if col.offset_name in nested_offset_names:
                _reject_nested_output(col)
            offset_data = buffer_dict.get(col.offset_name)
            if offset_data is None:
                msg = (
                    f"Cannot find offset buffer '{col.offset_name}' "
                    f"needed for output column '{col.name}'"
                )
                raise RuntimeError(msg)
            # The last offset is the total element count of the container.
            size = int(offset_data[-1])
            arr = np.zeros(size, dtype=col.dtype)
            output_buffers[col.name] = arr
            buffer_dict[col.name] = arr
    return output_buffers
353
354
def reconstruct_output(classified, buffer_dict, num_events):
    """Assemble the output columns into a single awkward record array.

    Every output column becomes one record field described as a
    ``ListOffsetArray`` over a ``NumpyArray``; its data buffer is
    ``buffer_dict[col.name]`` and its offsets buffer is
    ``buffer_dict[col.offset_name]``.

    Parameters
    ----------
    classified:
        Output of ``classify_columns`` or ``resolve_optional_columns``.
    buffer_dict:
        Dict of column name -> numpy array, containing both offset buffers and
        the output arrays populated by ``allocate_outputs`` and ``call()``.
    num_events:
        Number of events (outer axis length of the returned array).

    Returns
    -------
    ak.Array
        Record array with one field per output column, each a variable-length
        list of per-particle values (i.e. ``var * dtype``).
    """
    # Lazily walk the outputs in container order, then column order.
    output_columns = (
        col for info in classified.values() for col in info["outputs"]
    )

    field_names = []
    field_forms = []
    containers = {}

    # node0 is the RecordArray itself; output k (counted from 1) owns the
    # node pair (2k, 2k+1) for its offsets and data respectively.
    for position, col in enumerate(output_columns, start=1):
        offsets_node = f"node{2 * position}"
        data_node = f"node{2 * position + 1}"

        field_names.append(col.name)
        field_forms.append(
            {
                "class": "ListOffsetArray",
                # NOTE(review): offset buffers elsewhere in this file are
                # created as uint64 while the form declares i64 — confirm
                # ak.from_buffers accepts that as intended.
                "offsets": "i64",
                "content": {
                    "class": "NumpyArray",
                    "primitive": col.dtype,
                    "form_key": data_node,
                },
                "form_key": offsets_node,
            }
        )

        containers[f"{data_node}-data"] = buffer_dict[col.name]
        containers[f"{offsets_node}-offsets"] = buffer_dict[col.offset_name]

    record_form = {
        "class": "RecordArray",
        "fields": field_names,
        "contents": field_forms,
        "form_key": "node0",
    }

    return ak.from_buffers(record_form, num_events, containers)
STL class.
reconstruct_output(classified, buffer_dict, num_events)
Definition buffers.py:355
resolve_optional_columns(classified, events)
Definition buffers.py:164
_branch_name_for_column(name)
Definition buffers.py:60
extract_buffers(events, classified)
Definition buffers.py:196
_inner_most_list_offset_array(array)
Definition buffers.py:17
allocate_outputs(classified, buffer_dict)
Definition buffers.py:308
classify_columns(columns)
Definition buffers.py:65