ATLAS Offline Software
Loading...
Searching...
No Matches
F110StreamIntegrationAlg.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3 */
4
8#include <xrt/xrt_bo.h>
9#include <xrt/xrt_device.h>
10#include <xrt/xrt_kernel.h>
11#include <xrt/xrt_uuid.h>
12#include <fstream>
13
15{
// Body of F110StreamIntegrationAlg::initialize(). The signature line and
// listing lines 20, 26 and 31 are missing from this extract, so whatever they
// do is not described here.
//
// Visible steps:
//   1. retrieve the chrono service and initialise the StoreGate handle keys;
//   2. list the compute units in the xclbin via getListofCUs();
//   3. create one command queue plus pixel/strip input and output buffers per
//      worker (m_FPGAThreads, or SG::getNSlots() when that property is < 1);
//   4. create a cl::Kernel per compute unit and file it in the kernel list
//      whose configured name is a substring of the compute-unit name;
//   5. if LUT kernels were found, read the LUT file and push it to each of them.
//
// Returns StatusCode::SUCCESS at the end and StatusCode::FAILURE when a LUT
// file cannot be read. ATH_CHECK presumably returns early on a failed status
// -- confirm against its definition.
17 {
18 ATH_MSG_INFO("Running on the FPGA accelerator");
19
21
22 ATH_CHECK(m_chronoSvc.retrieve());
23
// Timed scope; the statement being timed (listing line 26) is not visible here.
24 {
25 Athena::Chrono chrono("Platform and device initlize", m_chronoSvc.get());
27 }
28
// Timed scope; the statement being timed (listing line 31) is not visible here.
29 {
30 Athena::Chrono chrono("CL::loadProgram", m_chronoSvc.get());
32 }
33 ATH_MSG_INFO("loading "<<m_xclbin);
34
35
// Input keys: encoded pixel/strip RDO words and their sizes.
36 ATH_CHECK(m_FPGAPixelRDO.initialize());
37 ATH_CHECK(m_FPGAStripRDO.initialize());
38
39 ATH_CHECK(m_FPGAPixelRDOSize.initialize());
40 ATH_CHECK(m_FPGAStripRDOSize.initialize());
41
// Output keys written by execute().
42 ATH_CHECK(m_FPGAPixelOutput.initialize());
43 ATH_CHECK(m_FPGAStripOutput.initialize());
44
45 std::vector<std::string> listofCUs;
46
47 getListofCUs(listofCUs);
48
// NOTE(review): err is handed to every constructor below and overwritten each
// time, but it is never inspected in the visible code.
49 cl_int err = 0;
50
51 unsigned int nthreads = m_FPGAThreads.value();
52
// A property value below 1 means "one set of resources per event slot".
53 if(m_FPGAThreads.value() < 1){
54 nthreads = SG::getNSlots();
55 }
56
57 // create the buffers
58 for(unsigned int i = 0; i < nthreads; i++)
59 {
// One queue per worker, created with profiling and out-of-order execution enabled.
60 m_acc_queues.emplace_back(m_context, m_accelerator, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
61
62 // Input
// Sized in 64-bit words.
63 m_pixelClusterInputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::PIXEL_CONTAINER_INPUT_BUF_SIZE * sizeof(uint64_t), NULL, &err));
64 m_stripClusterInputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::STRIP_CONTAINER_INPUT_BUF_SIZE * sizeof(uint64_t), NULL, &err));
65
66 // EDMPrep
// Sized in 32-bit words.
67 m_edmPixelOutputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE * sizeof(uint32_t), NULL, &err));
68 m_edmStripOutputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE * sizeof(uint32_t), NULL, &err));
69 }
70
71 // Create kernels for each one of CUs that is inside device
// The first matching branch wins, so the order of the tests matters if one
// configured kernel name is a substring of another.
72 for (const auto& cuName: listofCUs)
73 {
74 // Pixel clustering
75 if(cuName.find(m_pixelEndClusterKernelName.value()) != std::string::npos) m_pixelEndClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
76 else if(cuName.find(m_pixelStartClusterKernelName.value()) != std::string::npos) m_pixelStartClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
77
78 // Strip clustering
79 else if(cuName.find(m_stripEndClusterKernelName.value()) != std::string::npos) m_stripEndClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
80 else if(cuName.find(m_stripStartClusterKernelName.value()) != std::string::npos) m_stripStartClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
81 // LUT loading
82 else if(cuName.find(m_pixelLUTKernelName.value()) != std::string::npos) m_pixelLUTKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
83 else if(cuName.find(m_stripLUTKernelName.value()) != std::string::npos) m_stripLUTKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
84
85
86 else
87 {
88 ATH_MSG_WARNING("Do not recognize kernel name: "<<cuName);
89 }
90 }
91
// Report how many compute units were found for each configured kernel name.
92 ATH_MSG_INFO(m_pixelStartClusterKernelName.value()<<" size: "<<m_pixelStartClusteringKernels.size());
93 ATH_MSG_INFO(m_pixelEndClusterKernelName.value()<<" size: "<<m_pixelEndClusteringKernels.size());
94 ATH_MSG_INFO(m_stripStartClusterKernelName.value()<<" size: "<<m_stripStartClusteringKernels.size());
95 ATH_MSG_INFO(m_stripEndClusterKernelName.value()<<" size: "<<m_stripEndClusteringKernels.size());
96
97 ATH_MSG_INFO(m_pixelLUTKernelName.value()<<" size: "<<m_pixelLUTKernels.size());
98 ATH_MSG_INFO(m_stripLUTKernelName.value()<<" size: "<<m_stripLUTKernels.size());
99
100 // if the LUT kernels are found, transfer the data there
101 if(m_pixelLUTKernels.size())
102 {
103 // read the information from the file
104 std::vector<uint64_t> data;
105 if(!readCalibfile(m_pixelLUTFilePath.value(), data)) return StatusCode::FAILURE;
106
// NOTE(review): the condition guarding the next block (listing line 107) is
// missing from this extract; presumably it compares data.size() with
// PIXEL_LUT_SIZE -- confirm. The block only logs: there is no return, so the
// transfer below still runs with data.size() words.
108 {
109 ATH_MSG_ERROR("Pixel LUT size of "<<data.size() <<" does not match expectation of "<<EFTrackingTransient::PIXEL_LUT_SIZE);
110 }
111
112 // Send the data to each LUT kernel
113 for(size_t i = 0; i < m_pixelLUTKernels.size(); i++)
114 {
115 cl::Kernel &lutKernel = m_pixelLUTKernels[i];
// A fresh device buffer per kernel, sized for PIXEL_LUT_SIZE 64-bit words.
116 auto lutBuffer = cl::Buffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::PIXEL_LUT_SIZE * sizeof(uint64_t), NULL, &err);
117
118 // Set kernel arguments
// Argument 0 is the LUT buffer, argument 2 the number of words read from file.
119 lutKernel.setArg(0, lutBuffer);
120 lutKernel.setArg(2, static_cast<unsigned long long>(data.size()));
121
122 // Start the transfers
123 cl::Event lut_inputEvent;
124 // just use the first queue for this
125 auto queue = m_acc_queues[0];
126
// NOTE(review): the write is non-blocking (CL_FALSE) and the task is enqueued
// with NULL in the wait-list position; &lut_inputEvent is passed to both calls
// in their last position. If that position is the output event in both cases,
// the task does not wait for the write -- confirm against the
// cl::CommandQueue::enqueueTask signature.
127 queue.enqueueWriteBuffer(lutBuffer, CL_FALSE, 0, sizeof(uint64_t) * data.size(), data.data(), NULL, &lut_inputEvent);
128 queue.enqueueTask(lutKernel, NULL, &lut_inputEvent);
129
130 // wait for this queue to finish
131 queue.finish();
132 }
133 }
134
// Same procedure for the strip LUT kernels.
135 if(m_stripLUTKernels.size())
136 {
137 // read the information from the file
138 std::vector<uint64_t> data;
139 if(!readCalibfile(m_stripLUTFilePath.value(), data)) return StatusCode::FAILURE;
140
// NOTE(review): as for the pixel case, the guarding condition (listing line
// 141) is missing from this extract and the block only logs the mismatch.
142 {
143 ATH_MSG_ERROR("Strip LUT size of "<<data.size() <<" does not match expectation of "<<EFTrackingTransient::STRIP_LUT_SIZE);
144 }
145
146 // Send the data to each LUT kernel
147 for(size_t i = 0; i < m_stripLUTKernels.size(); i++)
148 {
149 cl::Kernel &lutKernel = m_stripLUTKernels[i];
150
151 auto lutBuffer = cl::Buffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::STRIP_LUT_SIZE * sizeof(uint64_t), NULL, &err);
152
153 // Set kernel arguments
154 lutKernel.setArg(0, lutBuffer);
155 lutKernel.setArg(2, static_cast<unsigned long long>(data.size()));
156
157 // Start the transfers
158 cl::Event lut_inputEvent;
159 // just use the first queue for this
160 auto queue = m_acc_queues[0];
161
// NOTE(review): same event/wait-list question as in the pixel LUT transfer.
162 queue.enqueueWriteBuffer(lutBuffer, CL_FALSE, 0, sizeof(uint64_t) * data.size(), data.data(), NULL, &lut_inputEvent);
163 queue.enqueueTask(lutKernel, NULL, &lut_inputEvent);
164
165 // wait for this queue to finish
166 queue.finish();
167 }
168 }
169
170
171 return StatusCode::SUCCESS;
172 }
173
174 StatusCode F110StreamIntegrationAlg::readCalibfile(std::string inputFileName, std::vector<uint64_t>& data)
175 {
176 ATH_MSG_INFO("Loading LUTs from " << inputFileName);
177
178
179 std::ifstream inputFile(inputFileName);
180 if (!inputFile.is_open()) {
181 std::cerr << "Error opening input file " << inputFileName << std::endl;
182 return StatusCode::FAILURE;
183 }
184
185 // Read the full file: expects hex tokens (e.g. "0x1234" or "1234")
186 uint64_t value = 0;
187 while (inputFile >> std::hex >> value) {
188 data.push_back(value);
189 }
190 return StatusCode::SUCCESS;
191 }
192
193
// Per-event processing: ship the encoded pixel and strip RDO words to the
// accelerator, run the start/end clustering kernels and record the two output
// vectors read back from the device.
//
// ctx selects the event slot; the slot number picks which queue, buffers and
// kernel instances are used. Returns StatusCode::SUCCESS at the end; ATH_CHECK
// presumably returns early on a failed retrieval or record -- confirm.
//
// Listing lines 199, 280 and 283 are missing from this extract; the last two
// presumably declare FPGAPixelOutput and FPGAStripOutput -- confirm.
194 StatusCode F110StreamIntegrationAlg::execute(const EventContext &ctx) const
195 {
196 ATH_MSG_DEBUG("Executing F110StreamIntegrationAlg");
197 m_numEvents++;
198
// Encoded input words for this event.
200 const std::vector<uint64_t>* pixelInput{nullptr}, *stripInput{nullptr};
201 ATH_CHECK(SG::get(pixelInput, m_FPGAPixelRDO, ctx));
202 ATH_CHECK(SG::get(stripInput, m_FPGAStripRDO, ctx));
203
// Sizes passed to the start kernels as argument 2.
204 const int* pixelInputSize{nullptr}, *stripInputSize{nullptr};
205 ATH_CHECK(SG::get(pixelInputSize, m_FPGAPixelRDOSize, ctx));
206 ATH_CHECK(SG::get(stripInputSize, m_FPGAStripRDOSize, ctx));
207
208 // logic
// Same worker-count rule as in initialize(), so bufferIndex stays within the
// lists created there.
209 unsigned int nthreads = m_FPGAThreads.value();
210
211 if(m_FPGAThreads.value() < 1){
212 nthreads = SG::getNSlots();
213 }
214
215 size_t bufferIndex = ctx.slot() % nthreads;
216
217 // Get index for each of the kernels
// NOTE(review): each modulo divides by a kernel-list size; an empty list (no
// matching compute unit found in initialize()) would make the divisor zero.
218 size_t pixelStartClusterIndex = ctx.slot() % m_pixelStartClusteringKernels.size();
219 size_t pixelEndClusterIndex = ctx.slot() % m_pixelEndClusteringKernels.size();
220 size_t stripStartClusterIndex = ctx.slot() % m_stripStartClusteringKernels.size();
221 size_t stripEndClusterIndex = ctx.slot() % m_stripEndClusteringKernels.size();
222
223
224 const cl::CommandQueue &acc_queue = m_acc_queues[bufferIndex];
225
// NOTE(review): this is logged at INFO level for every event, and
// pixelEndClusterIndex is not included in the message.
226 ATH_MSG_INFO("Thread number "<<ctx.slot()<<" running on buffer "<<bufferIndex<<" pixelStartClusterIndex: "<< pixelStartClusterIndex<<" stripStartClusterIndex: "<< stripStartClusterIndex<<" stripEndClusterIndex: "<< stripEndClusterIndex);
227
// NOTE(review): when a kernel list is shorter than the number of slots, two
// slots map to the same cl::Kernel object and both call setArg on it.
228 cl::Kernel &pixelStartClusteringKernel = m_pixelStartClusteringKernels[pixelStartClusterIndex];
229 cl::Kernel &pixelEndClusteringKernel = m_pixelEndClusteringKernels[pixelEndClusterIndex];
230 cl::Kernel &stripStartClusteringKernel = m_stripStartClusteringKernels[stripStartClusterIndex];
231 cl::Kernel &stripEndClusteringKernel = m_stripEndClusteringKernels[stripEndClusterIndex];
232
233 // Set kernel arguments
// Start kernels: argument 0 is the input buffer, argument 2 the input size.
234 pixelStartClusteringKernel.setArg(0, m_pixelClusterInputBufferList[bufferIndex]);
235 pixelStartClusteringKernel.setArg(2, static_cast<unsigned long long>(*pixelInputSize));
236
// End kernels: argument 2 is the output buffer.
237 pixelEndClusteringKernel.setArg(2, m_edmPixelOutputBufferList[bufferIndex]);
238
239
240 stripStartClusteringKernel.setArg(0, m_stripClusterInputBufferList[bufferIndex]);
241 stripStartClusteringKernel.setArg(2, static_cast<unsigned long long>(*stripInputSize));
242
243 stripEndClusteringKernel.setArg(2, m_edmStripOutputBufferList[bufferIndex]);
244
245 // Start the transfers
246 cl::Event evt_write_pixel_input;
247 cl::Event evt_write_strip_input;
248
// Non-blocking writes of the whole input vectors.
// NOTE(review): the byte count comes from the vector size and is not checked
// here against the buffer size chosen in initialize().
249 acc_queue.enqueueWriteBuffer(m_pixelClusterInputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint64_t) * (*pixelInput).size(), (*pixelInput).data(), NULL, &evt_write_pixel_input);
250 acc_queue.enqueueWriteBuffer(m_stripClusterInputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint64_t) * (*stripInput).size(), (*stripInput).data(), NULL, &evt_write_strip_input);
// Wait lists for the start kernels.
251 std::vector<cl::Event> evt_vec_pixel_input{evt_write_pixel_input};
252 std::vector<cl::Event> evt_vec_strip_input{evt_write_strip_input};
253
254
255 cl::Event evt_pixel_start_clustering;
256 cl::Event evt_pixel_end_clustering;
257 cl::Event evt_strip_start_clustering;
258 cl::Event evt_strip_end_clustering;
259
// The Chrono scope covers the enqueue calls only; nothing inside it waits for
// the kernels.
260 {
261 Athena::Chrono chrono("Kernel execution", m_chronoSvc.get());
262
// Start kernels wait on their input write; end kernels are enqueued with no
// wait list (presumably they are fed by the start kernels on the device --
// confirm).
263 acc_queue.enqueueTask(pixelStartClusteringKernel, &evt_vec_pixel_input, &evt_pixel_start_clustering);
264 acc_queue.enqueueTask(pixelEndClusteringKernel, NULL , &evt_pixel_end_clustering);
265
266 acc_queue.enqueueTask(stripStartClusteringKernel, &evt_vec_strip_input, &evt_strip_start_clustering);
267 acc_queue.enqueueTask(stripEndClusteringKernel, NULL, &evt_strip_end_clustering);
268
269 }
270
271 cl::Event evt_pixel_cluster_output;
272 cl::Event evt_strip_cluster_output;
273
// Wait lists for the read-backs: each read waits on its end kernel.
274 std::vector<cl::Event> evt_vec_pixel_done{evt_pixel_end_clustering};
275 std::vector<cl::Event> evt_vec_strip_done{evt_strip_end_clustering};
276
277
278 // output handles
279
// Record zero-filled vectors of the full container size; the reads below write
// straight into their storage.
281 ATH_CHECK(FPGAPixelOutput.record(std::make_unique<std::vector<uint32_t> >(EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE, 0)));
282
284 ATH_CHECK(FPGAStripOutput.record(std::make_unique<std::vector<uint32_t> >(EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE, 0)));
285
286 acc_queue.enqueueReadBuffer(m_edmPixelOutputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint32_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &evt_vec_pixel_done, &evt_pixel_cluster_output);
287 acc_queue.enqueueReadBuffer(m_edmStripOutputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint32_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &evt_vec_strip_done, &evt_strip_cluster_output);
288
// Block until both read-backs have completed.
289 std::vector<cl::Event> wait_for_reads = { evt_pixel_cluster_output, evt_strip_cluster_output};
290 cl::Event::waitForEvents(wait_for_reads);
291
292
// NOTE(review): 6 is a magic number; per the trailing comments it is the size
// reported when there is no input -- confirm against the encoder.
293 if(*pixelInputSize == 6) (*FPGAPixelOutput)[0] = 0; // if no pixel input, set the first element to 0
294 if(*stripInputSize == 6) (*FPGAStripOutput)[0] = 0; // if no strip input, set the first element to 0
295
296
297 // calculate the time for the kernel execution
298 // get the time of writing pixel input buffer
299 cl_ulong pixel_input_time = evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
300 m_pixelInputTime += pixel_input_time;
301
302 // get the time of writing strip input buffer
303 cl_ulong strip_input_time = evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
304 m_stripInputTime += strip_input_time;
305
306 // get the time of pixel clustering
// Measured from the start of the start kernel to the end of the end kernel.
307 cl_ulong pixel_clustering_time = evt_pixel_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
308 m_pixelPipelineTime += pixel_clustering_time;
309
310 // get the time of strip clustering
311 cl_ulong strip_clustering_time = evt_strip_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
312 m_stripPipelineTime += strip_clustering_time;
313
314 // get the time of reading pixel output buffer
315 cl_ulong pixel_output_time = evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
316 m_pixelOutputTime += pixel_output_time;
317
318 // get the time of reading strip output buffer
319 cl_ulong strip_output_time = evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
320 m_stripOutputTime += strip_output_time;
321
322 return StatusCode::SUCCESS;
323 }
324
// Body of the end-of-job summary (the signature on listing line 325 is missing
// from this extract). Logs the number of processed events and, when at least
// one event was seen, the per-event average of each timer accumulated in
// execute(). Always returns StatusCode::SUCCESS.
326 {
327
328 ATH_MSG_INFO("Finalizing F110StreamIntegrationAlg");
329 ATH_MSG_INFO("Number of events: " << m_numEvents);
330
// Guard against dividing by zero when no event was processed.
331 if(m_numEvents > 0){
// The accumulated time is divided by the event count first; both are integer
// atomics per the member documentation, so that quotient is presumably
// truncated before the division by 1e6 -- confirm. The "ms" label implies the
// accumulated values are in nanoseconds -- TODO confirm.
332 ATH_MSG_INFO("Pixel input ave time: " << m_pixelInputTime / m_numEvents / 1e6 << " ms");
333 ATH_MSG_INFO("Strip input ave time: " << m_stripInputTime / m_numEvents / 1e6 << " ms");
334 ATH_MSG_INFO("Pixel pipeline ave time: " << m_pixelPipelineTime / m_numEvents / 1e6 << " ms");
335 ATH_MSG_INFO("Strip pipeline ave time: " << m_stripPipelineTime / m_numEvents / 1e6 << " ms");
336 ATH_MSG_INFO("Pixel output ave time: " << m_pixelOutputTime / m_numEvents / 1e6 << " ms");
337 ATH_MSG_INFO("Strip output ave time: " << m_stripOutputTime / m_numEvents / 1e6 << " ms");
338 }
339
340 return StatusCode::SUCCESS;
341 }
342
343 void F110StreamIntegrationAlg::getListofCUs(std::vector<std::string>& cuNames)
344 {
345 xrt::xclbin xrt_xclbin(m_xclbin.value());
346
347 ATH_MSG_INFO("xsa name: "<<xrt_xclbin.get_xsa_name());
348 ATH_MSG_INFO("fpga name: "<<xrt_xclbin.get_fpga_device_name());
349 ATH_MSG_INFO("uuid: "<<xrt_xclbin.get_uuid().to_string());
350
351 for (const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
352 const std::string& kernelName = kernel.get_name();
353
354 ATH_MSG_INFO("kernelName: "<<kernelName);
355
356
357 for (const xrt::xclbin::ip &computeUnit : kernel.get_cus()) {
358 const std::string& computeUnitName = computeUnit.get_name();
359 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
360
361 const std::string computeUnitUsableName = kernelName + ":{" + computeUnitIsolatedName + "}";
362
363 ATH_MSG_INFO("CU name: "<<computeUnitUsableName);
364 cuNames.push_back(computeUnitUsableName);
365 }
366 }
367 }
368
369} // namespace EFTrackingFPGAIntegration
#define ATH_CHECK
Evaluate an expression and check for errors.
#define ATH_MSG_ERROR(x)
#define ATH_MSG_INFO(x)
#define ATH_MSG_WARNING(x)
#define ATH_MSG_DEBUG(x)
Maintain a set of objects, one per slot.
Exception-safe IChronoSvc caller.
char data[hepevt_bytes_allocation_ATLAS]
Definition HepEvt.cxx:11
Exception-safe IChronoSvc caller.
Definition Chrono.h:50
std::atomic< cl_ulong > m_stripPipelineTime
Time for strip pipeline.
Gaudi::Property< std::string > m_stripStartClusterKernelName
Name of the strip clustering kernel start.
SG::WriteHandleKey< std::vector< uint32_t > > m_FPGAPixelOutput
virtual StatusCode initialize() override final
Detect the OpenCL devices and prepare OpenCL context.
Gaudi::Property< std::string > m_pixelStartClusterKernelName
Name of the pixel clustering kernel start.
Gaudi::Property< std::string > m_stripLUTKernelName
Name of the strip LUT loading kernel.
SG::WriteHandleKey< std::vector< uint32_t > > m_FPGAStripOutput
virtual StatusCode execute(const EventContext &ctx) const override final
Should be overridden by derived classes to perform meaningful work.
Gaudi::Property< std::string > m_stripEndClusterKernelName
Name of the strip clustering kernel end.
void getListofCUs(std::vector< std::string > &cuNames)
Gaudi::Property< std::string > m_xclbin
Path and name of the xclbin file.
Gaudi::Property< std::string > m_pixelLUTKernelName
Name of the pixel lut loading kernel.
SG::ReadHandleKey< std::vector< uint64_t > > m_FPGAStripRDO
StatusCode readCalibfile(std::string inputFileName, std::vector< uint64_t > &data)
std::atomic< cl_ulong > m_stripInputTime
Time for strip input buffer write.
std::atomic< cl_ulong > m_stripOutputTime
Time for strip output buffer read.
ServiceHandle< IChronoSvc > m_chronoSvc
Service for timing the algorithm.
std::atomic< cl_ulong > m_pixelPipelineTime
Time for pixel pipeline.
SG::ReadHandleKey< std::vector< uint64_t > > m_FPGAPixelRDO
std::atomic< cl_ulong > m_pixelInputTime
Time for pixel input buffer write.
std::atomic< ulonglong > m_numEvents
Number of events processed.
Gaudi::Property< std::string > m_pixelEndClusterKernelName
Name of the pixel clustering kernel end.
std::atomic< cl_ulong > m_pixelOutputTime
Time for pixel output buffer read.
StatusCode loadProgram(const std::string &xclbin)
Find the xclbin file and load it into the OpenCL program object.
cl::Program m_program
Program object containing the kernel.
virtual StatusCode initialize() override
Detect the OpenCL devices and prepare OpenCL context.
cl::Context m_context
Context object for the application.
StatusCode precheck(const std::vector< Gaudi::Property< std::string > > &inputs) const
Check if the desired Gaudi properties are set.
cl::Device m_accelerator
Device object for the accelerator card.
StatusCode record(std::unique_ptr< T > data)
Record a const object to the store.
The class for encoding RDO to FPGA format.
constexpr unsigned long PIXEL_CONTAINER_INPUT_BUF_SIZE
constexpr uint32_t STRIP_CONTAINER_BUF_SIZE
constexpr unsigned long STRIP_CONTAINER_INPUT_BUF_SIZE
constexpr uint32_t STRIP_LUT_SIZE
constexpr uint32_t PIXEL_LUT_SIZE
constexpr uint32_t PIXEL_CONTAINER_BUF_SIZE
size_t getNSlots()
Return the number of event slots.
const T * get(const ReadCondHandleKey< T > &key, const EventContext &ctx)
Convenience function to retrieve an object given a ReadCondHandleKey.