Loading [MathJax]/extensions/tex2jax.js
ATLAS Offline Software
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
BenchmarkAlg.cxx
Go to the documentation of this file.
1 /*
2  Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3 */
4 
13 #include "AthenaKernel/Chrono.h"
14 
16 {
// Body of BenchmarkAlg::initialize() (the signature sits on listing line 17,
// which is not shown here). Retrieves the timing service and the three tools
// used during execute(), and returns StatusCode::SUCCESS when all retrievals
// succeed.
// NOTE(review): listing lines 21, 27, 32 and 35-38 are elided in this view.
// Presumably they call IntegrationBase::initialize(), loadProgram(m_xclbin),
// precheck(...) and the read-handle-key initialize() calls - confirm against
// the real file.
18  {
19  ATH_MSG_INFO("Running on the FPGA accelerator");
20 
22 
// The chrono service must be available before any Athena::Chrono timer below.
23  ATH_CHECK(m_chronoSvc.retrieve());
24 
25  {
// Scoped timer; the timed statement (listing line 27) is elided here.
26  Athena::Chrono chrono("Platform and device initlize", m_chronoSvc.get());
28  }
29 
30  {
// Scoped timer; the timed statement (listing line 32) is elided here.
31  Athena::Chrono chrono("CL::loadProgram", m_chronoSvc.get());
33  }
34  ATH_MSG_INFO("loading "<<m_xclbin);
39 
// Tools needed by execute(), runPassThrough() and runDataPrep().
40  ATH_CHECK(m_xaodClusterMaker.retrieve());
41  ATH_CHECK(m_testVectorTool.retrieve());
42  ATH_CHECK(m_FPGADataFormatTool.retrieve());
43  return StatusCode::SUCCESS;
44  }
45 
// Per-event entry point.
//
// Fills two host-side 64-bit word buffers by running either the pass-through
// path or the full FPGA kernel chain, then unpacks those buffers into the
// strip and pixel aux-input structures and passes them to m_xaodClusterMaker.
//
// @param ctx  Event context forwarded to the helpers and the cluster maker.
// @return StatusCode::SUCCESS if every ATH_CHECK'd call succeeds.
46  StatusCode BenchmarkAlg::execute(const EventContext &ctx) const
47  {
48  ATH_MSG_DEBUG("Executing BenchmarkAlg");
// Event counter; finalize() divides the accumulated timings by it.
49  m_numEvents++;
50 
51  // Create host side output vectors
52  std::vector<uint64_t> pixelOutput(EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE, 0);
53  std::vector<uint64_t> stripOutput(EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE, 0);
54 
55  if (m_runPassThrough)
56  {
57  ATH_CHECK(runPassThrough(pixelOutput, stripOutput, ctx));
58  }
59  else // using the actual FPGA kernel chain
60  {
61  ATH_CHECK(runDataPrep(pixelOutput, stripOutput, ctx));
62  }
63 
64  // use 64-bit pointer to access output
65  uint64_t *stripClusters = stripOutput.data();
66  uint64_t *pixelClusters = pixelOutput.data();
67 
// Word 0 of each buffer is read as the cluster count (narrowed from 64 bits
// to unsigned int).
// NOTE(review): the counts come straight from the device output and are not
// range-checked before being used to index the buffers and the metadata
// arrays below - confirm an upstream guarantee (e.g. <= MAX_NUM_CLUSTERS).
68  unsigned int numStripClusters = stripClusters[0];
69  ATH_MSG_DEBUG("numStripClusters: " << numStripClusters);
70 
71  unsigned int numPixelClusters = pixelClusters[0];
72  ATH_MSG_DEBUG("numPixelClusters: " << numPixelClusters);
73 
74  std::unique_ptr<EFTrackingTransient::Metadata> metadata =
75  std::make_unique<EFTrackingTransient::Metadata>();
76 
// The RDO index sizes are set equal to the cluster counts: one rdo-count
// entry is written per cluster in the loops below.
77  metadata->numOfStripClusters = numStripClusters;
78  metadata->scRdoIndexSize = numStripClusters;
79  metadata->numOfPixelClusters = numPixelClusters;
80  metadata->pcRdoIndexSize = numPixelClusters;
81 
// NOTE(review): listing lines 82-83 are elided in this view; they presumably
// declare scAux (StripClusterAuxInput) and pcAux (PixelClusterAuxInput), which
// are used below - confirm against the real file.
84 
85  // Declare a few variables to be used in the loop
86  int row = 0;
87  uint64_t rdo;
88  int rdoCounter = 0;
89 
90  // Make strip cluster aux input
// Buffer layout as used here: the value of field `row` for cluster `i` lives
// at word index row * MAX_NUM_CLUSTERS + i + 8, i.e. the per-cluster data
// starts 8 words into the buffer and each field occupies a block of
// MAX_NUM_CLUSTERS words.
91  {
92  Athena::Chrono chrono("Make strip cluster container", m_chronoSvc.get());
93  for (unsigned int i = 0; i < numStripClusters; i++)
94  {
95  rdoCounter = 0;
96  row = 0; // idhash
97  scAux.idHash.push_back(stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
98  row = 1; // id
99  scAux.id.push_back(stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
// Rows 2-5 hold up to four RDO words per cluster; zero-valued words are
// skipped and only the non-zero ones are appended and counted.
100  row = 2; // rdo w1
101  rdo = stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
102  if (rdo)
103  {
104  scAux.rdoList.push_back(rdo);
105  rdoCounter++;
106  }
107  row = 3; // rdo w2
108  rdo = stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
109  if (rdo)
110  {
111  scAux.rdoList.push_back(rdo);
112  rdoCounter++;
113  }
114  row = 4; // rdo w3
115  rdo = stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
116  if (rdo)
117  {
118  scAux.rdoList.push_back(rdo);
119  rdoCounter++;
120  }
121  row = 5; // rdo w4
122  rdo = stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
123  if (rdo)
124  {
125  scAux.rdoList.push_back(rdo);
126  rdoCounter++;
127  }
// The casts below reinterpret the raw 64-bit word as a double.
// NOTE(review): pointer-cast type punning; std::memcpy into a double would
// avoid strict-aliasing concerns - consider for a follow-up.
// Row 7 is not read for strips (only one local coordinate is stored).
128  row = 6; // local x
129  scAux.localPosition.push_back(*(double *)&stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
130  row = 8; // local covariance xx
131  scAux.localCovariance.push_back(*(double *)&stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
// Global position is stored flattened: three consecutive entries (x, y, z)
// per cluster.
132  row = 9; // global x
133  scAux.globalPosition.push_back(*(double *)&stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
134  row = 10; // global y
135  scAux.globalPosition.push_back(*(double *)&stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
136  row = 11; // global z
137  scAux.globalPosition.push_back(*(double *)&stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
138  row = 12; // channels in phi
139  scAux.channelsInPhi.push_back(stripClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
140 
// Number of RDO words appended for this cluster.
141  metadata->scRdoIndex[i] = rdoCounter;
142  }
143  ATH_CHECK(m_xaodClusterMaker->makeStripClusterContainer(scAux, metadata.get(), ctx));
144  // print out the strip cluster aux input
// Guarded by msgLvl so the per-cluster loop is skipped when DEBUG is off.
145  if (msgLvl(MSG::DEBUG))
146  {
147  for (unsigned int i = 0; i < numStripClusters; i++)
148  {
149  ATH_MSG_DEBUG("Strip cluster " << i << " idHash: " << scAux.idHash[i]);
150  ATH_MSG_DEBUG("Strip cluster " << i << " id: " << scAux.id[i]);
151  ATH_MSG_DEBUG("Strip cluster " << i << " localPosition x: " << scAux.localPosition[i]);
152  ATH_MSG_DEBUG("Strip cluster " << i << " localCovariance: " << scAux.localCovariance[i]);
153  ATH_MSG_DEBUG("Strip cluster " << i << " globalPosition x: " << scAux.globalPosition[i * 3]);
154  ATH_MSG_DEBUG("Strip cluster " << i << " globalPosition y: " << scAux.globalPosition[i * 3 + 1]);
155  ATH_MSG_DEBUG("Strip cluster " << i << " globalPosition z: " << scAux.globalPosition[i * 3 + 2]);
156  ATH_MSG_DEBUG("Strip cluster " << i << " channelsInPhi: " << scAux.channelsInPhi[i]);
157  ATH_MSG_DEBUG("Strip cluster " << i << " rdoList size: " << metadata->scRdoIndex[i]);
158  }
159  }
160  }
161 
162  // Make pixel cluster aux input
// Same word-index scheme as for strips, with a different row assignment:
// pixels store two local coordinates and two covariance terms, so the global
// position starts at row 10. Rows 16 and 17 are not read here.
163  {
164  Athena::Chrono chrono("Make pixel cluster container", m_chronoSvc.get());
165  for (unsigned int i = 0; i < numPixelClusters; i++)
166  {
167  rdoCounter = 0;
168  row = 0; // id hash
169  pcAux.idHash.push_back(pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
170 
171  row = 1; // id
172  pcAux.id.push_back(pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
173 
// Rows 2-5: up to four RDO words, zero-valued words are skipped.
174  row = 2; // rdo w1
175  rdo = pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
176  if (rdo)
177  {
178  pcAux.rdoList.push_back(rdo);
179  rdoCounter++;
180  }
181 
182  row = 3; // rdo w2
183  rdo = pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
184  if (rdo)
185  {
186  pcAux.rdoList.push_back(rdo);
187  rdoCounter++;
188  }
189 
190  row = 4; // rdo w3
191  rdo = pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
192  if (rdo)
193  {
194  pcAux.rdoList.push_back(rdo);
195  rdoCounter++;
196  }
197 
198  row = 5; // rdo w4
199  rdo = pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8];
200  if (rdo)
201  {
202  pcAux.rdoList.push_back(rdo);
203  rdoCounter++;
204  }
205 
// Local position and covariance are stored flattened, two entries per
// cluster; the 64-bit words are reinterpreted as doubles (same type-punning
// caveat as in the strip loop).
206  row = 6; // local x
207  pcAux.localPosition.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
208 
209  row = 7; // local y
210  pcAux.localPosition.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
211 
212  row = 8; // local covariance xx
213  pcAux.localCovariance.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
214 
215  row = 9; // local covariance yy
216  pcAux.localCovariance.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
217 
218  row = 10; // global x
219  pcAux.globalPosition.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
220 
221  row = 11; // global y
222  pcAux.globalPosition.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
223 
224  row = 12; // global z
225  pcAux.globalPosition.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
226 
227  row = 13; // channels in phi
228  pcAux.channelsInPhi.push_back(pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
229 
230  row = 14; // channels in eta
231  pcAux.channelsInEta.push_back(pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
232 
233  row = 15; // width in eta
234  pcAux.widthInEta.push_back(*(double *)&pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
235 
236  row = 18; // total ToT
237  pcAux.totalToT.push_back(pixelClusters[row * EFTrackingTransient::MAX_NUM_CLUSTERS + i + 8]);
238 
// Number of RDO words appended for this cluster.
239  metadata->pcRdoIndex[i] = rdoCounter;
240  }
241 
242  ATH_CHECK(m_xaodClusterMaker->makePixelClusterContainer(pcAux, metadata.get(), ctx));
243 
244  // print out pixel cluster aux input
// Guarded by msgLvl so the per-cluster loop is skipped when DEBUG is off.
245  if (msgLvl(MSG::DEBUG))
246  {
247  for (unsigned int i = 0; i < numPixelClusters; i++)
248  {
249  ATH_MSG_DEBUG("Pixel cluster " << i << " idHash: " << pcAux.idHash[i]);
250  ATH_MSG_DEBUG("Pixel cluster " << i << " id: " << pcAux.id[i]);
251  ATH_MSG_DEBUG("Pixel cluster " << i << " localPosition x: " << pcAux.localPosition[i * 2]);
252  ATH_MSG_DEBUG("Pixel cluster " << i << " localPosition y: " << pcAux.localPosition[i * 2 + 1]);
253  ATH_MSG_DEBUG("Pixel cluster " << i << " localCovariance xx: " << pcAux.localCovariance[i * 2]);
254  ATH_MSG_DEBUG("Pixel cluster " << i << " localCovariance yy: " << pcAux.localCovariance[i * 2 + 1]);
255  ATH_MSG_DEBUG("Pixel cluster " << i << " globalPosition x: " << pcAux.globalPosition[i * 3]);
256  ATH_MSG_DEBUG("Pixel cluster " << i << " globalPosition y: " << pcAux.globalPosition[i * 3 + 1]);
257  ATH_MSG_DEBUG("Pixel cluster " << i << " globalPosition z: " << pcAux.globalPosition[i * 3 + 2]);
258  ATH_MSG_DEBUG("Pixel cluster " << i << " channelsInPhi: " << pcAux.channelsInPhi[i]);
259  ATH_MSG_DEBUG("Pixel cluster " << i << " channelsInEta: " << pcAux.channelsInEta[i]);
260  ATH_MSG_DEBUG("Pixel cluster " << i << " widthInEta: " << pcAux.widthInEta[i]);
261  ATH_MSG_DEBUG("Pixel cluster " << i << " totalToT: " << pcAux.totalToT[i]);
262  ATH_MSG_DEBUG("Pixel cluster " << i << " rdoList size: " << metadata->pcRdoIndex[i]);
263  }
264  }
265  }
266 
267  return StatusCode::SUCCESS;
268  }
269 
// Pass-through path: encodes the existing ITk strip and pixel clusters into
// 64-bit words, uploads them to the accelerator, runs only the EDM-prep
// kernel (m_edmKernelName) and reads the two output buffers back.
//
// @param pixelChainOutput  Pre-sized host buffer; receives the pixel output.
// @param stripChainOutput  Pre-sized host buffer; receives the strip output.
// @param ctx               Event context; not used in the lines visible
//                          here (the handle creation is elided).
// @return StatusCode::FAILURE if either input container handle is invalid or
//         an ATH_CHECK'd encode call fails, StatusCode::SUCCESS otherwise.
270  StatusCode BenchmarkAlg::runPassThrough(std::vector<uint64_t> &pixelChainOutput, std::vector<uint64_t> &stripChainOutput, const EventContext &ctx) const
271  {
// NOTE(review): err is overwritten by each cl::Buffer constructor below and
// is never inspected, so an allocation failure goes unnoticed.
272  cl_int err = 0;
273 
274  // Load the ITk clusters from SG
// NOTE(review): listing line 275 (declaration of scContainerHandle) is elided
// in this view; presumably a read handle made from m_inputStripClusterKey.
276  if (!scContainerHandle.isValid())
277  {
278  ATH_MSG_ERROR("Failed to retrieve: " << m_inputStripClusterKey);
279  return StatusCode::FAILURE;
280  }
281 
// NOTE(review): listing line 282 (declaration of pcContainerHandle) is elided
// in this view; presumably a read handle made from m_inputPixelClusterKey.
283  if (!pcContainerHandle.isValid())
284  {
285  ATH_MSG_ERROR("Failed to retrieve: " << m_inputPixelClusterKey);
286  return StatusCode::FAILURE;
287  }
288 
289  // Encode ITK clusters into byte stream
290  std::vector<uint64_t> encodedStripClusters;
291  std::vector<uint64_t> encodedPixelClusters;
292  ATH_CHECK(m_testVectorTool->encodeStripL2G(scContainerHandle.get(), encodedStripClusters));
293  ATH_CHECK(m_testVectorTool->encodePixelL2G(pcContainerHandle.get(), encodedPixelClusters));
294 
295  // Create local CL buffers and kernel object for pixel and strip
// Input buffers are sized in *_BLOCK_BUF_SIZE words, output buffers in
// *_CONTAINER_BUF_SIZE words (the size execute() allocates on the host).
296  cl::Buffer inputPixelBuffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::PIXEL_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err);
297  cl::Buffer inputStripBuffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::STRIP_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err);
298  cl::Buffer outputPixelBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE * sizeof(uint64_t), NULL, &err);
299  cl::Buffer outputStripBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE * sizeof(uint64_t), NULL, &err);
300 
// Kernel argument order: pixel input, strip input, pixel output, strip output.
301  cl::Kernel kernel(m_program, m_edmKernelName.value().data());
302  kernel.setArg<cl::Buffer>(0, inputPixelBuffer);
303  kernel.setArg<cl::Buffer>(1, inputStripBuffer);
304  kernel.setArg<cl::Buffer>(2, outputPixelBuffer);
305  kernel.setArg<cl::Buffer>(3, outputStripBuffer);
306 
307  // Migrate the input test vectors to the accelerator
// Both writes are enqueued non-blocking (CL_FALSE); finish() waits for them.
// NOTE(review): the number of words written is the encoded vector size, which
// is not checked against the *_BLOCK_BUF_SIZE capacity of the device buffer,
// and the enqueue return codes are ignored - confirm this cannot overflow.
308  cl::CommandQueue acc_queue(m_context, m_accelerator);
309  acc_queue.enqueueWriteBuffer(inputPixelBuffer, CL_FALSE, 0, sizeof(uint64_t) * encodedPixelClusters.size(), encodedPixelClusters.data(), NULL, NULL);
310  acc_queue.enqueueWriteBuffer(inputStripBuffer, CL_FALSE, 0, sizeof(uint64_t) * encodedStripClusters.size(), encodedStripClusters.data(), NULL, NULL);
311  acc_queue.finish();
312 
313  // enqueue the kernel
// The Chrono scope covers enqueue plus finish(), i.e. host wall time for the
// kernel run.
314  {
315  Athena::Chrono chrono("EDMPrep kernel execution", m_chronoSvc.get());
316  acc_queue.enqueueTask(kernel);
317  acc_queue.finish();
318  }
319 
320  // Read back the results
// Reads exactly as many words as the caller-provided host vectors hold.
321  {
322  Athena::Chrono chrono("Read buffers", m_chronoSvc.get());
323  acc_queue.enqueueReadBuffer(outputPixelBuffer, CL_FALSE, 0, sizeof(uint64_t) * pixelChainOutput.size(), pixelChainOutput.data(), NULL, NULL);
324  acc_queue.enqueueReadBuffer(outputStripBuffer, CL_FALSE, 0, sizeof(uint64_t) * stripChainOutput.size(), stripChainOutput.data(), NULL, NULL);
325  acc_queue.finish();
326  }
327 
328  return StatusCode::SUCCESS;
329  }
330 
331  StatusCode BenchmarkAlg::runDataPrep(std::vector<uint64_t> &pixelChainOutput, std::vector<uint64_t> &stripChainOutput, const EventContext &ctx) const
332  {
333  ATH_MSG_DEBUG("Running DataPrep on FPGA");
334  cl_int err = 0;
335 
336  // Get the RDOs from the SG
337  auto pixelRDOHandle = SG::makeHandle(m_pixelRDOKey, ctx);
338  auto stripRDOHandle = SG::makeHandle(m_stripRDOKey, ctx);
339 
340  // Encode RDO into byte stream
341  std::vector<uint64_t> encodedPixelRDO;
342  std::vector<uint64_t> encodedStripRDO;
343 
344  // Encode RDOs into byte stream
345  ATH_CHECK(m_FPGADataFormatTool->convertPixelHitsToFPGADataFormat(*pixelRDOHandle, encodedPixelRDO, ctx));
346  ATH_CHECK(m_FPGADataFormatTool->convertStripHitsToFPGADataFormat(*stripRDOHandle, encodedStripRDO, ctx));
347 
348  for (unsigned int i = 0; i < encodedPixelRDO.size(); i++)
349  {
350  ATH_MSG_DEBUG("Pixel RDO[" << i << "]: " << std::hex << encodedPixelRDO[i] << std::dec);
351  }
352  for (unsigned int i = 0; i < encodedStripRDO.size(); i++)
353  {
354  ATH_MSG_DEBUG("Strip RDO[" << i << "]: " << std::hex << encodedStripRDO[i] << std::dec);
355  }
356 
357  // Create local CL buffers
358  // Clustering
359  cl::Buffer pixelClusterInputBuffer(m_context, CL_MEM_READ_ONLY, sizeof(uint64_t) * encodedPixelRDO.size(), NULL, &err);
360  cl::Buffer stripClusterInputBuffer(m_context, CL_MEM_READ_ONLY, sizeof(uint64_t) * encodedStripRDO.size(), NULL, &err);
361  cl::Buffer pixelClusterOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err); // Don't care in DataPrep
362  cl::Buffer stripClusterOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err); // Don't care in DataPrep
363  cl::Buffer pixelClusterEDMOutputBuffer(m_context, CL_MEM_READ_WRITE,EFTrackingTransient::PIXEL_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err);
364  cl::Buffer stripClusterEDMOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err);
365  // L2G
366  cl::Buffer pixelL2GOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err); // Don't care in DataPrep
367  cl::Buffer stripL2GOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err); // Don't care in DataPrep
368  cl::Buffer pixelL2GEDMOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err);
369  cl::Buffer stripL2GEDMOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_BLOCK_BUF_SIZE * sizeof(uint64_t), NULL, &err);
370  // EDMPrep
371  cl::Buffer edmPixelOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE * sizeof(uint64_t), NULL, &err);
372  cl::Buffer edmStripOutputBuffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE * sizeof(uint64_t), NULL, &err);
373 
374  // Create local CL kernel objects
375  // Clustering
376  // Kernel names are hard-coded for the current development
377  cl::Kernel pixelClusteringKernel(m_program, m_pixelClusterKernelName.value().data());
378  pixelClusteringKernel.setArg<cl::Buffer>(0, pixelClusterInputBuffer);
379  pixelClusteringKernel.setArg<cl::Buffer>(1, pixelClusterOutputBuffer);
380  pixelClusteringKernel.setArg<cl::Buffer>(2, pixelClusterEDMOutputBuffer);
381 
382  cl::Kernel stripClusteringKernel(m_program, m_stripClusterKernelName.value().data());
383  stripClusteringKernel.setArg<cl::Buffer>(0, stripClusterInputBuffer);
384  stripClusteringKernel.setArg<cl::Buffer>(1, stripClusterOutputBuffer);
385  stripClusteringKernel.setArg<cl::Buffer>(2, stripClusterEDMOutputBuffer);
386  stripClusteringKernel.setArg<unsigned int>(3, encodedStripRDO.size());
387 
388  // L2G
389  cl::Kernel pixelL2GKernel(m_program, m_pixelL2GKernelName.value().data());
390  pixelL2GKernel.setArg<cl::Buffer>(0, pixelClusterOutputBuffer);
391  pixelL2GKernel.setArg<cl::Buffer>(1, pixelClusterEDMOutputBuffer);
392  pixelL2GKernel.setArg<cl::Buffer>(2, pixelL2GOutputBuffer);
393  pixelL2GKernel.setArg<cl::Buffer>(3, pixelL2GEDMOutputBuffer);
394 
395  cl::Kernel stripL2GKernel(m_program, m_stripL2GKernelName.value().data());
396  stripL2GKernel.setArg<cl::Buffer>(0, stripClusterOutputBuffer);
397  stripL2GKernel.setArg<cl::Buffer>(1, stripClusterEDMOutputBuffer);
398  stripL2GKernel.setArg<cl::Buffer>(2, stripL2GOutputBuffer);
399  stripL2GKernel.setArg<cl::Buffer>(3, stripL2GEDMOutputBuffer);
400 
401  // Create EDMPrep kernel object and connect to buffers
402  cl::Kernel edmPrepKernel(m_program, m_edmKernelName.value().data());
403  edmPrepKernel.setArg<cl::Buffer>(0, pixelL2GEDMOutputBuffer);
404  edmPrepKernel.setArg<cl::Buffer>(1, stripL2GEDMOutputBuffer);
405  edmPrepKernel.setArg<cl::Buffer>(2, edmPixelOutputBuffer);
406  edmPrepKernel.setArg<cl::Buffer>(3, edmStripOutputBuffer);
407 
408  cl::CommandQueue acc_queue(m_context, m_accelerator, CL_QUEUE_PROFILING_ENABLE, &err);
409 
410  cl::Event cl_evt_write_pixel_input;
411  cl::Event cl_evt_write_strip_input;
412  acc_queue.enqueueWriteBuffer(pixelClusterInputBuffer, CL_FALSE, 0, sizeof(uint64_t) * encodedPixelRDO.size(), encodedPixelRDO.data(), NULL, &cl_evt_write_pixel_input);
413  acc_queue.enqueueWriteBuffer(stripClusterInputBuffer, CL_FALSE, 0, sizeof(uint64_t) * encodedStripRDO.size(), encodedStripRDO.data(), NULL, &cl_evt_write_strip_input);
414  std::vector<cl::Event> cl_evt_vec_pixel_input{cl_evt_write_pixel_input};
415  std::vector<cl::Event> cl_evt_vec_strip_input{cl_evt_write_strip_input};
416  // Ideally, `finish` shouldn't be here because the kernels are invoked by event dependencies,
417  // but we use this to temporarily enable kernel profiling.
418  acc_queue.finish();
419 
420  cl::Event cl_evt_pixel_clustering;
421  cl::Event cl_evt_strip_clustering;
422  cl::Event cl_evt_pixel_l2g;
423  cl::Event cl_evt_strip_l2g;
424  cl::Event cl_evt_edm_prep;
425  {
426  Athena::Chrono chrono("Kernel execution", m_chronoSvc.get());
427  acc_queue.enqueueTask(pixelClusteringKernel, &cl_evt_vec_pixel_input, &cl_evt_pixel_clustering);
428  acc_queue.enqueueTask(stripClusteringKernel, &cl_evt_vec_strip_input, &cl_evt_strip_clustering);
429 
430  std::vector<cl::Event> cl_evt_vec_pixel_clustering{cl_evt_pixel_clustering};
431  std::vector<cl::Event> cl_evt_vec_strip_clustering{cl_evt_strip_clustering};
432  acc_queue.enqueueTask(pixelL2GKernel, &cl_evt_vec_pixel_clustering, &cl_evt_pixel_l2g);
433  acc_queue.enqueueTask(stripL2GKernel, &cl_evt_vec_strip_clustering, &cl_evt_strip_l2g);
434  std::vector<cl::Event> cl_evt_vec_l2g{cl_evt_pixel_l2g, cl_evt_strip_l2g};
435 
436  acc_queue.enqueueTask(edmPrepKernel, &cl_evt_vec_l2g, &cl_evt_edm_prep);
437  // Ideally, `finish` shouldn't be here because the kernels are invoked by event dependencies,
438  // but we use this to temporarily enable kernel profiling. CPU wall time
439  acc_queue.finish();
440  }
441 
442  cl::Event cl_evt_pixel_cluster_output;
443  cl::Event cl_evt_strip_cluster_output;
444  acc_queue.enqueueReadBuffer(edmPixelOutputBuffer, CL_FALSE, 0, sizeof(uint64_t) * pixelChainOutput.size(), pixelChainOutput.data(), NULL, &cl_evt_pixel_cluster_output);
445  acc_queue.enqueueReadBuffer(edmStripOutputBuffer, CL_FALSE, 0, sizeof(uint64_t) * stripChainOutput.size(), stripChainOutput.data(), NULL, &cl_evt_strip_cluster_output);
446  acc_queue.finish();
447 
448  // calculate the time for the kernel execution
449  // get the time of writing pixel input buffer
450  cl_ulong pixel_input_start = cl_evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
451  cl_ulong pixel_input_end = cl_evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>();
452  cl_ulong pixel_input_time = pixel_input_end - pixel_input_start;
453  m_pixelInputTime += pixel_input_time;
454  ATH_MSG_DEBUG("Pixel input buffer write time: " << pixel_input_time / 1e6 << " ms");
455 
456  // get the time of writing strip input buffer
457  cl_ulong strip_input_start = cl_evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
458  cl_ulong strip_input_end = cl_evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>();
459  cl_ulong strip_input_time = strip_input_end - strip_input_start;
460  m_stripInputTime += strip_input_time;
461  ATH_MSG_DEBUG("Strip input buffer write time: " << strip_input_time / 1e6 << " ms");
462 
463  // get the time of pixel clustering
464  cl_ulong pixel_clustering_start = cl_evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
465  cl_ulong pixel_clustering_end = cl_evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>();
466  cl_ulong pixel_clustering_time = pixel_clustering_end - pixel_clustering_start;
467  m_pixelClusteringTime += pixel_clustering_time;
468  ATH_MSG_DEBUG("Pixel clustering time: " << pixel_clustering_time / 1e6 << " ms");
469 
470  // get the time of strip clustering
471  cl_ulong strip_clustering_start = cl_evt_strip_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
472  cl_ulong strip_clustering_end = cl_evt_strip_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>();
473  cl_ulong strip_clustering_time = strip_clustering_end - strip_clustering_start;
474  m_stripClusteringTime += strip_clustering_time;
475  ATH_MSG_DEBUG("Strip clustering time: " << strip_clustering_time / 1e6 << " ms");
476 
477  // get the time of pixel L2G
478  cl_ulong pixel_l2g_start = cl_evt_pixel_l2g.getProfilingInfo<CL_PROFILING_COMMAND_START>();
479  cl_ulong pixel_l2g_end = cl_evt_pixel_l2g.getProfilingInfo<CL_PROFILING_COMMAND_END>();
480  cl_ulong pixel_l2g_time = pixel_l2g_end - pixel_l2g_start;
481  m_pixelL2GTime += pixel_l2g_time;
482  ATH_MSG_DEBUG("Pixel L2G time: " << pixel_l2g_time / 1e6 << " ms");
483 
484  // get the time of strip L2G
485  cl_ulong strip_l2g_start = cl_evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_START>();
486  cl_ulong strip_l2g_end = cl_evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_END>();
487  cl_ulong strip_l2g_time = strip_l2g_end - strip_l2g_start;
488  m_stripL2GTime += strip_l2g_time;
489  ATH_MSG_DEBUG("Strip L2G time: " << strip_l2g_time / 1e6 << " ms");
490 
491  // get the time of EDMPrep
492  cl_ulong edm_prep_start = cl_evt_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
493  cl_ulong edm_prep_end = cl_evt_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>();
494  cl_ulong edm_prep_time = edm_prep_end - edm_prep_start;
495  m_edmPrepTime += edm_prep_time;
496  ATH_MSG_DEBUG("EDMPrep time: " << edm_prep_time / 1e6 << " ms");
497 
498  // get the time of the whole kernel execution
499  cl_ulong kernel_start = cl_evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
500  cl_ulong kernel_end = cl_evt_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>();
501  cl_ulong kernel_time = kernel_end - kernel_start;
502  m_kernelTime += kernel_time;
503  ATH_MSG_DEBUG("Kernel execution time: " << kernel_time / 1e6 << " ms");
504 
505  // get the time of reading pixel output buffer
506  cl_ulong pixel_output_start = cl_evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
507  cl_ulong pixel_output_end = cl_evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>();
508  cl_ulong pixel_output_time = pixel_output_end - pixel_output_start;
509  m_pixelOutputTime += pixel_output_time;
510  ATH_MSG_DEBUG("Pixel output buffer read time: " << pixel_output_time / 1e6 << " ms");
511 
512  // get the time of reading strip output buffer
513  cl_ulong strip_output_start = cl_evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
514  cl_ulong strip_output_end = cl_evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>();
515  cl_ulong strip_output_time = strip_output_end - strip_output_start;
516  m_stripOutputTime += strip_output_time;
517  ATH_MSG_DEBUG("Strip output buffer read time: " << strip_output_time / 1e6 << " ms");
518 
519  return StatusCode::SUCCESS;
520  }
521 
523  {
524  if (!m_runPassThrough)
525  {
526  ATH_MSG_INFO("Finalizing BenchmarkAlg");
527  ATH_MSG_INFO("Number of events: " << m_numEvents);
528  ATH_MSG_INFO("Pixel input time: " << m_pixelInputTime / m_numEvents / 1e6 << " ms");
529  ATH_MSG_INFO("Strip input time: " << m_stripInputTime / m_numEvents / 1e6 << " ms");
530  ATH_MSG_INFO("Pixel clustering time: " << m_pixelClusteringTime / m_numEvents / 1e6 << " ms");
531  ATH_MSG_INFO("Strip clustering time: " << m_stripClusteringTime / m_numEvents / 1e6 << " ms");
532  ATH_MSG_INFO("Pixel L2G time: " << m_pixelL2GTime / m_numEvents / 1e6 << " ms");
533  ATH_MSG_INFO("Strip L2G time: " << m_stripL2GTime / m_numEvents / 1e6 << " ms");
534  ATH_MSG_INFO("EDMPrep time: " << m_edmPrepTime / m_numEvents / 1e6 << " ms");
535  ATH_MSG_INFO("Kernel execution time: " << m_kernelTime / m_numEvents / 1e6 << " ms");
536  ATH_MSG_INFO("Pixel output time: " << m_pixelOutputTime / m_numEvents / 1e6 << " ms");
537  ATH_MSG_INFO("Strip output time: " << m_stripOutputTime / m_numEvents / 1e6 << " ms");
538  }
539 
540  return StatusCode::SUCCESS;
541  }
542 } // namespace EFTrackingFPGAIntegration
query_example.row
row
Definition: query_example.py:24
EFTrackingTransient::PixelClusterAuxInput::localPosition
std::vector< float > localPosition
Definition: EFTrackingTransient.h:221
EFTrackingFPGAIntegration::BenchmarkAlg::m_edmKernelName
Gaudi::Property< std::string > m_edmKernelName
Name of the FPGA kernel.
Definition: BenchmarkAlg.h:74
IntegrationBase::m_accelerator
cl::Device m_accelerator
Device object for the accelerator card.
Definition: IntegrationBase.h:66
IntegrationBase::initialize
virtual StatusCode initialize() override
Detect the OpenCL devices and prepare OpenCL context.
Definition: IntegrationBase.cxx:16
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripClusteringTime
std::atomic< cl_ulong > m_stripClusteringTime
Time for strip clustering.
Definition: BenchmarkAlg.h:96
EFTrackingTransient::PixelClusterAuxInput::channelsInEta
std::vector< int > channelsInEta
Definition: EFTrackingTransient.h:226
EFTrackingFPGAIntegration::BenchmarkAlg::m_chronoSvc
ServiceHandle< IChronoSvc > m_chronoSvc
Service for timing the algorithm.
Definition: BenchmarkAlg.h:46
EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE
constexpr unsigned long PIXEL_CONTAINER_BUF_SIZE
Definition: EFTrackingTransient.h:34
EFTrackingTransient::PixelClusterAuxInput
The PixelClusterAuxInput struct is used to simplify the creation of the xAOD::PixelClusterContainer.
Definition: EFTrackingTransient.h:218
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelL2GKernelName
Gaudi::Property< std::string > m_pixelL2GKernelName
Name of the pixel L2G kernel.
Definition: BenchmarkAlg.h:83
EFTrackingTransient::StripClusterAuxInput::globalPosition
std::vector< float > globalPosition
Definition: EFTrackingTransient.h:208
ATH_MSG_INFO
#define ATH_MSG_INFO(x)
Definition: AthMsgStreamMacros.h:31
EFTrackingTransient::MAX_NUM_CLUSTERS
constexpr unsigned int MAX_NUM_CLUSTERS
Definition: EFTrackingTransient.h:27
SG::ReadHandle
Definition: StoreGate/StoreGate/ReadHandle.h:67
BenchmarkAlg.h
JiveXML::Event
struct Event_t Event
Definition: ONCRPCServer.h:65
IntegrationBase::m_context
cl::Context m_context
Context object for the application.
Definition: IntegrationBase.h:67
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelL2GTime
std::atomic< cl_ulong > m_pixelL2GTime
Time for pixel L2G.
Definition: BenchmarkAlg.h:97
EFTrackingTransient::StripClusterAuxInput::localCovariance
std::vector< float > localCovariance
Definition: EFTrackingTransient.h:205
AthCommonMsg< Gaudi::Algorithm >::msgLvl
bool msgLvl(const MSG::Level lvl) const
Definition: AthCommonMsg.h:30
EFTrackingFPGAIntegration::BenchmarkAlg::initialize
virtual StatusCode initialize() override final
Detect the OpenCL devices and prepare OpenCL context.
Definition: BenchmarkAlg.cxx:17
EFTrackingFPGAIntegration::BenchmarkAlg::m_inputPixelClusterKey
SG::ReadHandleKey< xAOD::PixelClusterContainer > m_inputPixelClusterKey
Key to access input pixel clusters.
Definition: BenchmarkAlg.h:61
Chrono.h
Exception-safe IChronoSvc caller.
EFTrackingFPGAIntegration::BenchmarkAlg::finalize
virtual StatusCode finalize() override final
Definition: BenchmarkAlg.cxx:522
EFTrackingTransient::StripClusterAuxInput::rdoList
std::vector< unsigned long long > rdoList
Definition: EFTrackingTransient.h:209
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelRDOKey
SG::ReadHandleKey< PixelRDO_Container > m_pixelRDOKey
Definition: BenchmarkAlg.h:67
EFTrackingFPGAIntegration::BenchmarkAlg::execute
virtual StatusCode execute(const EventContext &ctx) const override final
Should be overridden by derived classes to perform meaningful work.
Definition: BenchmarkAlg.cxx:46
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripOutputTime
std::atomic< cl_ulong > m_stripOutputTime
Time for strip output buffer read.
Definition: BenchmarkAlg.h:101
python.checkMetadata.metadata
metadata
Definition: checkMetadata.py:175
SG::makeHandle
SG::ReadCondHandle< T > makeHandle(const SG::ReadCondHandleKey< T > &key, const EventContext &ctx=Gaudi::Hive::currentContext())
Definition: ReadCondHandle.h:274
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripRDOKey
SG::ReadHandleKey< SCT_RDO_Container > m_stripRDOKey
Definition: BenchmarkAlg.h:69
EFTrackingTransient::PIXEL_BLOCK_BUF_SIZE
constexpr unsigned long PIXEL_BLOCK_BUF_SIZE
Definition: EFTrackingTransient.h:32
EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE
constexpr unsigned long STRIP_CONTAINER_BUF_SIZE
Definition: EFTrackingTransient.h:35
ATH_MSG_ERROR
#define ATH_MSG_ERROR(x)
Definition: AthMsgStreamMacros.h:33
Athena::Chrono
Exception-safe IChronoSvc caller.
Definition: Chrono.h:50
EFTrackingTransient::PixelClusterAuxInput::rdoList
std::vector< unsigned long long > rdoList
Definition: EFTrackingTransient.h:224
dqt_zlumi_pandas.err
err
Definition: dqt_zlumi_pandas.py:182
lumiFormat.i
int i
Definition: lumiFormat.py:85
EFTrackingTransient::PixelClusterAuxInput::localCovariance
std::vector< float > localCovariance
Definition: EFTrackingTransient.h:222
EL::StatusCode
::StatusCode StatusCode
StatusCode definition for legacy code.
Definition: PhysicsAnalysis/D3PDTools/EventLoop/EventLoop/StatusCode.h:22
SG::ReadHandle::get
const_pointer_type get() const
Dereference the pointer, but don't cache anything.
ATH_MSG_DEBUG
#define ATH_MSG_DEBUG(x)
Definition: AthMsgStreamMacros.h:29
EFTrackingTransient::StripClusterAuxInput::idHash
std::vector< unsigned int > idHash
Definition: EFTrackingTransient.h:206
EFTrackingTransient::PixelClusterAuxInput::totalToT
std::vector< int > totalToT
Definition: EFTrackingTransient.h:229
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelInputTime
std::atomic< cl_ulong > m_pixelInputTime
Time for pixel input buffer write.
Definition: BenchmarkAlg.h:93
xAOD::uint64_t
uint64_t
Definition: EventInfo_v1.cxx:123
ATH_CHECK
#define ATH_CHECK
Definition: AthCheckMacros.h:40
IntegrationBase::precheck
StatusCode precheck(const std::vector< Gaudi::Property< std::string >> &inputs) const
Check if the desired Gaudi properties are set.
Definition: IntegrationBase.cxx:150
SG::VarHandleKey::initialize
StatusCode initialize(bool used=true)
If this object is used as a property, then this should be called during the initialize phase.
Definition: AthToolSupport/AsgDataHandles/Root/VarHandleKey.cxx:103
EFTrackingFPGAIntegration
Definition: BenchmarkAlg.h:28
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripL2GKernelName
Gaudi::Property< std::string > m_stripL2GKernelName
Name of the strip L2G kernel.
Definition: BenchmarkAlg.h:86
EFTrackingFPGAIntegration::BenchmarkAlg::m_FPGADataFormatTool
ToolHandle< FPGADataFormatTool > m_FPGADataFormatTool
Tool for formatting FPGA data.
Definition: BenchmarkAlg.h:58
EFTrackingFPGAIntegration::BenchmarkAlg::m_xclbin
Gaudi::Property< std::string > m_xclbin
Path and name of the xclbin file.
Definition: BenchmarkAlg.h:71
SG::ReadHandle::isValid
virtual bool isValid() override final
Can the handle be successfully dereferenced?
EFTrackingTransient::PixelClusterAuxInput::idHash
std::vector< unsigned int > idHash
Definition: EFTrackingTransient.h:220
EFTrackingFPGAIntegration::BenchmarkAlg::m_testVectorTool
ToolHandle< TestVectorTool > m_testVectorTool
Tool for preparing test vectors.
Definition: BenchmarkAlg.h:55
IntegrationBase::loadProgram
StatusCode loadProgram(const std::string &xclbin)
Find the xclbin file and load it into the OpenCL program object.
Definition: IntegrationBase.cxx:111
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripClusterKernelName
Gaudi::Property< std::string > m_stripClusterKernelName
Name of the strip clustering kernel.
Definition: BenchmarkAlg.h:80
EFTrackingTransient::StripClusterAuxInput
The StripClusterAuxInput struct is used to simplify the creation of the xAOD::StripClusterContainer.
Definition: EFTrackingTransient.h:203
EFTrackingFPGAIntegration::BenchmarkAlg::m_xaodClusterMaker
ToolHandle< xAODClusterMaker > m_xaodClusterMaker
Tool for creating xAOD containers.
Definition: BenchmarkAlg.h:49
EFTrackingTransient::StripClusterAuxInput::localPosition
std::vector< float > localPosition
Definition: EFTrackingTransient.h:204
EFTrackingTransient.h
EFTrackingFPGAIntegration::BenchmarkAlg::m_numEvents
std::atomic< ulonglong > m_numEvents
Number of events processed.
Definition: BenchmarkAlg.h:92
EFTrackingTransient::PixelClusterAuxInput::channelsInPhi
std::vector< int > channelsInPhi
Definition: EFTrackingTransient.h:225
EFTrackingTransient::STRIP_BLOCK_BUF_SIZE
constexpr unsigned long STRIP_BLOCK_BUF_SIZE
Definition: EFTrackingTransient.h:33
EFTrackingFPGAIntegration::BenchmarkAlg::m_kernelTime
std::atomic< cl_ulong > m_kernelTime
Time for kernel execution.
Definition: BenchmarkAlg.h:102
EFTrackingFPGAIntegration::BenchmarkAlg::m_inputStripClusterKey
SG::ReadHandleKey< xAOD::StripClusterContainer > m_inputStripClusterKey
Key to access input strip clusters.
Definition: BenchmarkAlg.h:64
EFTrackingTransient::PixelClusterAuxInput::id
std::vector< long unsigned int > id
Definition: EFTrackingTransient.h:219
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripInputTime
std::atomic< cl_ulong > m_stripInputTime
Time for strip input buffer write.
Definition: BenchmarkAlg.h:94
EFTrackingTransient::PixelClusterAuxInput::widthInEta
std::vector< float > widthInEta
Definition: EFTrackingTransient.h:227
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelClusterKernelName
Gaudi::Property< std::string > m_pixelClusterKernelName
Name of the pixel clustering kernel.
Definition: BenchmarkAlg.h:77
EFTrackingTransient::PixelClusterAuxInput::globalPosition
std::vector< float > globalPosition
Definition: EFTrackingTransient.h:223
EFTrackingFPGAIntegration::BenchmarkAlg::runPassThrough
StatusCode runPassThrough(std::vector< uint64_t > &pixelChainOutput, std::vector< uint64_t > &stripChainOutput, const EventContext &ctx) const
Definition: BenchmarkAlg.cxx:270
DEBUG
#define DEBUG
Definition: page_access.h:11
EFTrackingTransient::StripClusterAuxInput::id
std::vector< long unsigned int > id
Definition: EFTrackingTransient.h:207
EFTrackingTransient::StripClusterAuxInput::channelsInPhi
std::vector< int > channelsInPhi
Definition: EFTrackingTransient.h:210
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelClusteringTime
std::atomic< cl_ulong > m_pixelClusteringTime
Time for pixel clustering.
Definition: BenchmarkAlg.h:95
EFTrackingFPGAIntegration::BenchmarkAlg::m_pixelOutputTime
std::atomic< cl_ulong > m_pixelOutputTime
Time for pixel output buffer read.
Definition: BenchmarkAlg.h:100
EFTrackingFPGAIntegration::BenchmarkAlg::m_stripL2GTime
std::atomic< cl_ulong > m_stripL2GTime
Time for strip L2G.
Definition: BenchmarkAlg.h:98
IntegrationBase::m_program
cl::Program m_program
Program object containing the kernel.
Definition: IntegrationBase.h:68
EFTrackingFPGAIntegration::BenchmarkAlg::m_edmPrepTime
std::atomic< cl_ulong > m_edmPrepTime
Time for EDM preparation.
Definition: BenchmarkAlg.h:99
EFTrackingFPGAIntegration::BenchmarkAlg::m_runPassThrough
Gaudi::Property< bool > m_runPassThrough
Run the pass-through kernel.
Definition: BenchmarkAlg.h:89
EFTrackingFPGAIntegration::BenchmarkAlg::runDataPrep
StatusCode runDataPrep(std::vector< uint64_t > &pixelChainOutput, std::vector< uint64_t > &stripChainOutput, const EventContext &ctx) const
Definition: BenchmarkAlg.cxx:331