ATLAS Offline Software
Loading...
Searching...
No Matches
F110StreamIntegrationAlg.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3 */
4
8#include <xrt/xrt_bo.h>
9#include <xrt/xrt_device.h>
10#include <xrt/xrt_kernel.h>
11#include <xrt/xrt_uuid.h>
12#include <fstream>
13
15{
// Body of the initialisation routine (its signature line is not part of this listing).
// Visible steps: retrieve the chrono service, initialise the StoreGate handle keys, create one
// command queue plus one set of device buffers per FPGA thread, and create a cl::Kernel for every
// compute-unit name returned by getListofCUs(). Ends by returning StatusCode::SUCCESS.
17 {
18 ATH_MSG_INFO("Running on the FPGA accelerator");
19
21
// The chrono service is handed to the Athena::Chrono timers used below.
22 ATH_CHECK(m_chronoSvc.retrieve());
23
24 {
// NOTE(review): no statement is visible inside this timed scope in this listing (embedded line 26
// is absent) - confirm against the original file. The label below contains the typo "initlize";
// it is a runtime string and is left untouched here.
25 Athena::Chrono chrono("Platform and device initlize", m_chronoSvc.get());
27 }
28
29 {
// NOTE(review): as above, the timed statement (embedded line 31) is not visible in this listing.
30 Athena::Chrono chrono("CL::loadProgram", m_chronoSvc.get());
32 }
33 ATH_MSG_INFO("loading "<<m_xclbin);
34
35
// Handle keys for the encoded pixel/strip RDO words read by execute().
36 ATH_CHECK(m_FPGAPixelRDO.initialize());
37 ATH_CHECK(m_FPGAStripRDO.initialize());
38
// Handle keys for the corresponding input sizes.
39 ATH_CHECK(m_FPGAPixelRDOSize.initialize());
40 ATH_CHECK(m_FPGAStripRDOSize.initialize());
41
// Handle keys for the output vectors recorded by execute().
42 ATH_CHECK(m_FPGAPixelOutput.initialize());
43 ATH_CHECK(m_FPGAStripOutput.initialize());
44
// Names of the compute units found in the xclbin, in the "kernel:{instance}" form built by
// getListofCUs().
45 std::vector<std::string> listofCUs;
46
47 getListofCUs(listofCUs);
48
// Receives the OpenCL status of the queue/buffer constructors below.
// NOTE(review): err is never inspected in the visible code.
49 cl_int err = 0;
50
// Number of queue/buffer sets to create: the configured value, or the number of event slots
// when the property is below 1.
51 unsigned int nthreads = m_FPGAThreads.value();
52
53 if(m_FPGAThreads.value() < 1){
54 nthreads = SG::getNSlots();
55 }
56
57 // create the buffers
58 for(unsigned int i = 0; i < nthreads; i++)
59 {
// One command queue per set, with profiling and out-of-order execution enabled.
60 m_acc_queues.emplace_back(m_context, m_accelerator, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
61
// Input buffers are sized as a number of uint64_t words.
62 // Input
63 m_pixelClusterInputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::PIXEL_CONTAINER_INPUT_BUF_SIZE * sizeof(uint64_t), NULL, &err));
64 m_stripClusterInputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_ONLY, EFTrackingTransient::STRIP_CONTAINER_INPUT_BUF_SIZE * sizeof(uint64_t), NULL, &err));
65
// Output buffers are sized as a number of uint32_t words.
66 // EDMPrep
67 m_edmPixelOutputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE * sizeof(uint32_t), NULL, &err));
68 m_edmStripOutputBufferList.push_back(cl::Buffer(m_context, CL_MEM_READ_WRITE, EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE * sizeof(uint32_t), NULL, &err));
69 }
70
// Each compute-unit name is matched by substring against the four configured kernel names; the
// first match in the else-if chain decides which kernel list receives the new cl::Kernel.
// Names matching none of the four are only reported with a warning.
71 // Create kernels for each one of CUs that is inside device
72 for (const auto& cuName: listofCUs)
73 {
74 // Pixel clustering
75 if(cuName.find(m_pixelEndClusterKernelName.value()) != std::string::npos) m_pixelEndClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
76 else if(cuName.find(m_pixelStartClusterKernelName.value()) != std::string::npos) m_pixelStartClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
77
78 // Strip clustering
79 else if(cuName.find(m_stripEndClusterKernelName.value()) != std::string::npos) m_stripEndClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
80 else if(cuName.find(m_stripStartClusterKernelName.value()) != std::string::npos) m_stripStartClusteringKernels.emplace_back(cl::Kernel(m_program, cuName.c_str()));
81
82 else
83 {
84 ATH_MSG_WARNING("Do not recognize kernel name: "<<cuName);
85 }
86 }
87
// Report how many kernel objects were created for each configured kernel name.
88 ATH_MSG_INFO(m_pixelStartClusterKernelName.value()<<" size: "<<m_pixelStartClusteringKernels.size());
89 ATH_MSG_INFO(m_pixelEndClusterKernelName.value()<<" size: "<<m_pixelEndClusteringKernels.size());
90 ATH_MSG_INFO(m_stripStartClusterKernelName.value()<<" size: "<<m_stripStartClusteringKernels.size());
91 ATH_MSG_INFO(m_stripEndClusterKernelName.value()<<" size: "<<m_stripEndClusteringKernels.size());
92
93
94 return StatusCode::SUCCESS;
95 }
96
97 StatusCode F110StreamIntegrationAlg::execute(const EventContext &ctx) const
98 {
99 ATH_MSG_DEBUG("Executing F110StreamIntegrationAlg");
100 m_numEvents++;
101
103 const std::vector<uint64_t>* pixelInput{nullptr}, *stripInput{nullptr};
104 ATH_CHECK(SG::get(pixelInput, m_FPGAPixelRDO, ctx));
105 ATH_CHECK(SG::get(stripInput, m_FPGAStripRDO, ctx));
106
107 const int* pixelInputSize{nullptr}, *stripInputSize{nullptr};
108 ATH_CHECK(SG::get(pixelInputSize, m_FPGAPixelRDOSize, ctx));
109 ATH_CHECK(SG::get(stripInputSize, m_FPGAStripRDOSize, ctx));
110
111 // logic
112 unsigned int nthreads = m_FPGAThreads.value();
113
114 if(m_FPGAThreads.value() < 1){
115 nthreads = SG::getNSlots();
116 }
117
118 size_t bufferIndex = ctx.slot() % nthreads;
119
120 // Get index for each of the kernels
121 size_t pixelStartClusterIndex = ctx.slot() % m_pixelStartClusteringKernels.size();
122 size_t pixelEndClusterIndex = ctx.slot() % m_pixelEndClusteringKernels.size();
123 size_t stripStartClusterIndex = ctx.slot() % m_stripStartClusteringKernels.size();
124 size_t stripEndClusterIndex = ctx.slot() % m_stripEndClusteringKernels.size();
125
126
127 const cl::CommandQueue &acc_queue = m_acc_queues[bufferIndex];
128
129 ATH_MSG_INFO("Thread number "<<ctx.slot()<<" running on buffer "<<bufferIndex<<" pixelStartClusterIndex: "<< pixelStartClusterIndex<<" stripStartClusterIndex: "<< stripStartClusterIndex<<" stripEndClusterIndex: "<< stripEndClusterIndex);
130
131 cl::Kernel &pixelStartClusteringKernel = m_pixelStartClusteringKernels[pixelStartClusterIndex];
132 cl::Kernel &pixelEndClusteringKernel = m_pixelEndClusteringKernels[pixelEndClusterIndex];
133 cl::Kernel &stripStartClusteringKernel = m_stripStartClusteringKernels[stripStartClusterIndex];
134 cl::Kernel &stripEndClusteringKernel = m_stripEndClusteringKernels[stripEndClusterIndex];
135
136 // Set kernel arguments
137 pixelStartClusteringKernel.setArg(0, m_pixelClusterInputBufferList[bufferIndex]);
138 pixelStartClusteringKernel.setArg(2, static_cast<unsigned long long>(*pixelInputSize));
139
140 pixelEndClusteringKernel.setArg(2, m_edmPixelOutputBufferList[bufferIndex]);
141
142
143 stripStartClusteringKernel.setArg(0, m_stripClusterInputBufferList[bufferIndex]);
144 stripStartClusteringKernel.setArg(2, static_cast<unsigned long long>(*stripInputSize));
145
146 stripEndClusteringKernel.setArg(2, m_edmStripOutputBufferList[bufferIndex]);
147
148 // Start the transfers
149 cl::Event evt_write_pixel_input;
150 cl::Event evt_write_strip_input;
151
152 acc_queue.enqueueWriteBuffer(m_pixelClusterInputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint64_t) * (*pixelInput).size(), (*pixelInput).data(), NULL, &evt_write_pixel_input);
153 acc_queue.enqueueWriteBuffer(m_stripClusterInputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint64_t) * (*stripInput).size(), (*stripInput).data(), NULL, &evt_write_strip_input);
154 std::vector<cl::Event> evt_vec_pixel_input{evt_write_pixel_input};
155 std::vector<cl::Event> evt_vec_strip_input{evt_write_strip_input};
156
157
158 cl::Event evt_pixel_start_clustering;
159 cl::Event evt_pixel_end_clustering;
160 cl::Event evt_strip_start_clustering;
161 cl::Event evt_strip_end_clustering;
162
163 {
164 Athena::Chrono chrono("Kernel execution", m_chronoSvc.get());
165
166 acc_queue.enqueueTask(pixelStartClusteringKernel, &evt_vec_pixel_input, &evt_pixel_start_clustering);
167 acc_queue.enqueueTask(pixelEndClusteringKernel, NULL , &evt_pixel_end_clustering);
168
169 acc_queue.enqueueTask(stripStartClusteringKernel, &evt_vec_strip_input, &evt_strip_start_clustering);
170 acc_queue.enqueueTask(stripEndClusteringKernel, NULL, &evt_strip_end_clustering);
171
172 }
173
174 cl::Event evt_pixel_cluster_output;
175 cl::Event evt_strip_cluster_output;
176
177 std::vector<cl::Event> evt_vec_pixel_done{evt_pixel_end_clustering};
178 std::vector<cl::Event> evt_vec_strip_done{evt_strip_end_clustering};
179
180
181 // output handles
182
184 ATH_CHECK(FPGAPixelOutput.record(std::make_unique<std::vector<uint32_t> >(EFTrackingTransient::PIXEL_CONTAINER_BUF_SIZE, 0)));
185
187 ATH_CHECK(FPGAStripOutput.record(std::make_unique<std::vector<uint32_t> >(EFTrackingTransient::STRIP_CONTAINER_BUF_SIZE, 0)));
188
189 acc_queue.enqueueReadBuffer(m_edmPixelOutputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint32_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &evt_vec_pixel_done, &evt_pixel_cluster_output);
190 acc_queue.enqueueReadBuffer(m_edmStripOutputBufferList[bufferIndex], CL_FALSE, 0, sizeof(uint32_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &evt_vec_strip_done, &evt_strip_cluster_output);
191
192 std::vector<cl::Event> wait_for_reads = { evt_pixel_cluster_output, evt_strip_cluster_output};
193 cl::Event::waitForEvents(wait_for_reads);
194
195
196 if(*pixelInputSize == 6) (*FPGAPixelOutput)[0] = 0; // if no pixel input, set the first element to 0
197 if(*stripInputSize == 6) (*FPGAStripOutput)[0] = 0; // if no strip input, set the first element to 0
198
199
200 // calculate the time for the kernel execution
201 // get the time of writing pixel input buffer
202 cl_ulong pixel_input_time = evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
203 m_pixelInputTime += pixel_input_time;
204
205 // get the time of writing strip input buffer
206 cl_ulong strip_input_time = evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
207 m_stripInputTime += strip_input_time;
208
209 // get the time of pixel clustering
210 cl_ulong pixel_clustering_time = evt_pixel_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
211 m_pixelPipelineTime += pixel_clustering_time;
212
213 // get the time of strip clustering
214 cl_ulong strip_clustering_time = evt_strip_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
215 m_stripPipelineTime += strip_clustering_time;
216
217 // get the time of reading pixel output buffer
218 cl_ulong pixel_output_time = evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
219 m_pixelOutputTime += pixel_output_time;
220
221 // get the time of reading strip output buffer
222 cl_ulong strip_output_time = evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
223 m_stripOutputTime += strip_output_time;
224
225 return StatusCode::SUCCESS;
226 }
227
229 {
230
231 ATH_MSG_INFO("Finalizing F110StreamIntegrationAlg");
232 ATH_MSG_INFO("Number of events: " << m_numEvents);
233
234 if(m_numEvents > 0){
235 ATH_MSG_INFO("Pixel input ave time: " << m_pixelInputTime / m_numEvents / 1e6 << " ms");
236 ATH_MSG_INFO("Strip input ave time: " << m_stripInputTime / m_numEvents / 1e6 << " ms");
237 ATH_MSG_INFO("Pixel pipeline ave time: " << m_pixelPipelineTime / m_numEvents / 1e6 << " ms");
238 ATH_MSG_INFO("Strip pipeline ave time: " << m_stripPipelineTime / m_numEvents / 1e6 << " ms");
239 ATH_MSG_INFO("Pixel output ave time: " << m_pixelOutputTime / m_numEvents / 1e6 << " ms");
240 ATH_MSG_INFO("Strip output ave time: " << m_stripOutputTime / m_numEvents / 1e6 << " ms");
241 }
242
243 return StatusCode::SUCCESS;
244 }
245
246 void F110StreamIntegrationAlg::getListofCUs(std::vector<std::string>& cuNames)
247 {
248 xrt::xclbin xrt_xclbin(m_xclbin.value());
249
250 ATH_MSG_INFO("xsa name: "<<xrt_xclbin.get_xsa_name());
251 ATH_MSG_INFO("fpga name: "<<xrt_xclbin.get_fpga_device_name());
252 ATH_MSG_INFO("uuid: "<<xrt_xclbin.get_uuid().to_string());
253
254 for (const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
255 const std::string& kernelName = kernel.get_name();
256
257 ATH_MSG_INFO("kernelName: "<<kernelName);
258
259
260 for (const xrt::xclbin::ip &computeUnit : kernel.get_cus()) {
261 const std::string& computeUnitName = computeUnit.get_name();
262 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
263
264 const std::string computeUnitUsableName = kernelName + ":{" + computeUnitIsolatedName + "}";
265
266 ATH_MSG_INFO("CU name: "<<computeUnitUsableName);
267 cuNames.push_back(computeUnitUsableName);
268 }
269 }
270 }
271
272} // namespace EFTrackingFPGAIntegration
#define ATH_CHECK
Evaluate an expression and check for errors.
#define ATH_MSG_INFO(x)
#define ATH_MSG_WARNING(x)
#define ATH_MSG_DEBUG(x)
Maintain a set of objects, one per slot.
Exception-safe IChronoSvc caller.
Exception-safe IChronoSvc caller.
Definition Chrono.h:50
std::atomic< cl_ulong > m_stripPipelineTime
Time for strip pipeline.
Gaudi::Property< std::string > m_stripStartClusterKernelName
Name of the strip clustering kernel start.
SG::WriteHandleKey< std::vector< uint32_t > > m_FPGAPixelOutput
virtual StatusCode initialize() override final
Detect the OpenCL devices and prepare OpenCL context.
Gaudi::Property< std::string > m_pixelStartClusterKernelName
Name of the pixel clustering kernel start.
SG::WriteHandleKey< std::vector< uint32_t > > m_FPGAStripOutput
virtual StatusCode execute(const EventContext &ctx) const override final
Should be overridden by derived classes to perform meaningful work.
Gaudi::Property< std::string > m_stripEndClusterKernelName
Name of the strip clustering kernel end.
void getListofCUs(std::vector< std::string > &cuNames)
Gaudi::Property< std::string > m_xclbin
Path and name of the xclbin file.
SG::ReadHandleKey< std::vector< uint64_t > > m_FPGAStripRDO
std::atomic< cl_ulong > m_stripInputTime
Time for strip input buffer write.
std::atomic< cl_ulong > m_stripOutputTime
Time for strip output buffer read.
ServiceHandle< IChronoSvc > m_chronoSvc
Service for timing the algorithm.
std::atomic< cl_ulong > m_pixelPipelineTime
Time for pixel pipeline.
SG::ReadHandleKey< std::vector< uint64_t > > m_FPGAPixelRDO
std::atomic< cl_ulong > m_pixelInputTime
Time for pixel input buffer write.
std::atomic< ulonglong > m_numEvents
Number of events processed.
Gaudi::Property< std::string > m_pixelEndClusterKernelName
Name of the pixel clustering kernel end.
std::atomic< cl_ulong > m_pixelOutputTime
Time for pixel output buffer read.
StatusCode loadProgram(const std::string &xclbin)
Find the xclbin file and load it into the OpenCL program object.
cl::Program m_program
Program object containing the kernel.
virtual StatusCode initialize() override
Detect the OpenCL devices and prepare OpenCL context.
cl::Context m_context
Context object for the application.
StatusCode precheck(const std::vector< Gaudi::Property< std::string > > &inputs) const
Check if the desired Gaudi properties are set.
cl::Device m_accelerator
Device object for the accelerator card.
StatusCode record(std::unique_ptr< T > data)
Record a const object to the store.
The class for encoding RDO to FPGA format.
constexpr unsigned long PIXEL_CONTAINER_INPUT_BUF_SIZE
constexpr uint32_t STRIP_CONTAINER_BUF_SIZE
constexpr unsigned long STRIP_CONTAINER_INPUT_BUF_SIZE
constexpr uint32_t PIXEL_CONTAINER_BUF_SIZE
size_t getNSlots()
Return the number of event slots.
const T * get(const ReadCondHandleKey< T > &key, const EventContext &ctx)
Convenience function to retrieve an object given a ReadCondHandleKey.