8 #include <xrt/xrt_bo.h>
9 #include <xrt/xrt_device.h>
10 #include <xrt/xrt_kernel.h>
11 #include <xrt/xrt_uuid.h>
// Compute-unit names. The range-for further down walks this list to populate
// the per-kernel cl::Kernel vectors; how the list is filled is not visible in
// this excerpt.
41 std::vector<std::string> listofCUs;
// Runs once for every index in [0, nthreads); the loop body lies outside this excerpt.
54 for(
unsigned int i = 0;
i < nthreads;
i++)
// Classify every compute unit by substring match against the configured
// kernel names. The branches form an else-if chain, so only the first
// matching branch receives the compute unit.
// NOTE(review): because std::string::find is a substring test, a configured
// name that is contained in another configured name would capture both —
// confirm the kernel names are mutually non-overlapping.
77 for (
const auto& cuName: listofCUs)
// Strip local-to-global kernel: build a cl::Kernel from m_program under the
// compute unit's name and store it.
// NOTE(review): emplace_back(cl::Kernel(...)) builds a temporary and then
// moves it; emplace_back(m_program, cuName.c_str()) would construct in place.
88 else if(cuName.find(
m_stripL2GKernelName.value()) != std::string::npos) m_stripL2GKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Pixel EDM preparation kernel.
91 else if(cuName.find(
m_pixelEdmKernelName.value()) != std::string::npos) m_pixelEdmPrepKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Strip EDM preparation kernel.
93 else if(cuName.find(
m_stripEdmKernelName.value()) != std::string::npos) m_stripEdmPrepKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
109 return StatusCode::SUCCESS;
// Input word vectors for the pixel and strip paths. Both start out null and
// are assigned outside this excerpt before being dereferenced.
118 const std::vector<uint64_t>* pixelInput{
nullptr}, *stripInput{
nullptr};
// Map the event slot onto one of the nthreads buffer/queue sets.
130 size_t bufferIndex = ctx.slot() % nthreads;
// Pick one kernel instance per stage, again by slot modulo the number of
// available instances.
// NOTE(review): the next five lines divide by the list size without a guard;
// an empty kernel list makes `% 0` undefined behaviour. The two EDM indices
// below are guarded — confirm the other lists are guaranteed non-empty or
// guard them the same way.
133 size_t pixelStartClusterIndex = ctx.slot() % m_pixelStartClusteringKernels.size();
134 size_t pixelEndClusterIndex = ctx.slot() % m_pixelEndClusteringKernels.size();
135 size_t stripStartClusterIndex = ctx.slot() % m_stripStartClusteringKernels.size();
136 size_t stripEndClusterIndex = ctx.slot() % m_stripEndClusteringKernels.size();
137 size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
// EDM preparation kernel lists may be empty; fall back to index 0 in that case
// to avoid the modulo by zero.
138 size_t pixelEDMIndex = m_pixelEdmPrepKernels.size() ? ctx.slot() % m_pixelEdmPrepKernels.size() : 0;
139 size_t stripEDMIndex = m_stripEdmPrepKernels.size() ? ctx.slot() % m_stripEdmPrepKernels.size() : 0;
// Command queue that belongs to the selected buffer set.
141 const cl::CommandQueue &acc_queue =
m_acc_queues[bufferIndex];
// Report the chosen indices at INFO level.
// NOTE(review): pixelEndClusterIndex is computed above but is not part of
// this message, while the other indices are.
143 ATH_MSG_INFO(
"Thread number "<<ctx.slot()<<
" running on buffer "<<bufferIndex<<
" pixelStartClusterIndex: "<< pixelStartClusterIndex<<
" stripStartClusterIndex: "<< stripStartClusterIndex<<
" stripEndClusterIndex: "<< stripEndClusterIndex<<
" stripL2GIndex: "<< stripL2GIndex<<
" pixelEDMIndex: "<< pixelEDMIndex<<
" stripEDMIndex: "<< stripEDMIndex);
// Bind references to the selected kernel instances.
145 cl::Kernel &pixelStartClusteringKernel = m_pixelStartClusteringKernels[pixelStartClusterIndex];
146 cl::Kernel &pixelEndClusteringKernel = m_pixelEndClusteringKernels[pixelEndClusterIndex];
147 cl::Kernel &stripStartClusteringKernel = m_stripStartClusteringKernels[stripStartClusterIndex];
148 cl::Kernel &stripEndClusteringKernel = m_stripEndClusteringKernels[stripEndClusterIndex];
149 cl::Kernel &stripL2GKernel = m_stripL2GKernels[stripL2GIndex];
// NOTE(review): when an EDM list is empty the index above is 0, but element 0
// of an empty container does not exist either (assuming these are
// std::vector-like) — confirm the lists cannot be empty at this point.
151 cl::Kernel &pixelEdmPrepKernel = m_pixelEdmPrepKernels[pixelEDMIndex];
152 cl::Kernel &stripEdmPrepKernel = m_stripEdmPrepKernels[stripEDMIndex];
// Kernel argument 2 of each start-clustering kernel is the number of 64-bit
// words in the corresponding input vector, passed as unsigned long long.
157 pixelStartClusteringKernel.setArg(2,
static_cast<unsigned long long>((*pixelInput).size()));
163 stripStartClusteringKernel.setArg(2,
static_cast<unsigned long long>((*stripInput).size()));
// Wait lists holding the input-buffer write events (the events themselves are
// declared outside this excerpt).
186 std::vector<cl::Event> evt_vec_pixel_input{evt_write_pixel_input};
187 std::vector<cl::Event> evt_vec_strip_input{evt_write_strip_input};
// Clustering stage: each start kernel waits for its input write. The end
// kernels are enqueued without an explicit wait list.
// NOTE(review): how the end kernels are ordered after the start kernels is not
// visible here (queue ordering or device-side coupling) — confirm.
// NOTE(review): NULL is used for the empty wait list; nullptr is the C++ form.
201 acc_queue.enqueueTask(pixelStartClusteringKernel, &evt_vec_pixel_input, &evt_pixel_start_clustering);
202 acc_queue.enqueueTask(pixelEndClusteringKernel, NULL , &evt_pixel_end_clustering);
203 acc_queue.enqueueTask(stripStartClusteringKernel, &evt_vec_strip_input, &evt_strip_start_clustering);
204 acc_queue.enqueueTask(stripEndClusteringKernel, NULL, &evt_strip_end_clustering);
// Wait lists keyed on completion of the end-clustering kernels.
206 std::vector<cl::Event> evt_vec_pixel_clustering{evt_pixel_end_clustering};
207 std::vector<cl::Event> evt_vec_strip_clustering{evt_strip_end_clustering};
// Strip local-to-global runs after strip clustering has finished.
209 acc_queue.enqueueTask(stripL2GKernel, &evt_vec_strip_clustering, &evt_strip_l2g);
// EDM preparation: the pixel path waits on pixel clustering, the strip path
// waits on the strip L2G kernel.
211 std::vector<cl::Event> evt_vec_strip_l2g{evt_strip_l2g};
212 acc_queue.enqueueTask(pixelEdmPrepKernel, &evt_vec_pixel_clustering, &evt_pixel_edm_prep);
213 acc_queue.enqueueTask(stripEdmPrepKernel, &evt_vec_strip_l2g, &evt_strip_edm_prep);
// Wait lists for the output reads, each holding the matching EDM-prep event.
220 std::vector<cl::Event> evt_vec_pixel_edm_prep;
221 std::vector<cl::Event> evt_vec_strip_edm_prep;
223 evt_vec_pixel_edm_prep.push_back(evt_pixel_edm_prep);
224 evt_vec_strip_edm_prep.push_back(evt_strip_edm_prep);
// Non-blocking (CL_FALSE) reads of the full output vectors from offset 0:
// sizeof(uint64_t) * size() bytes into the host vectors' storage, each gated
// on its EDM-prep event.
235 acc_queue.enqueueReadBuffer(
m_edmPixelOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &evt_vec_pixel_edm_prep, &evt_pixel_cluster_output);
236 acc_queue.enqueueReadBuffer(
m_edmStripOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &evt_vec_strip_edm_prep, &evt_strip_cluster_output);
// Block the host until both read-backs have completed.
238 std::vector<cl::Event> wait_for_reads = { evt_pixel_cluster_output, evt_strip_cluster_output };
239 cl::Event::waitForEvents(wait_for_reads);
// When an input holds exactly six words, force the first output word to zero.
// NOTE(review): 6 is an unexplained magic number — presumably the size of an
// input that carries no hits; confirm and give it a named constant. Writing
// element [0] also assumes the output vectors are non-empty.
242 if(pixelInput->size() == 6) (*FPGAPixelOutput)[0] = 0;
243 if(stripInput->size() == 6) (*FPGAStripOutput)[0] = 0;
// Timing report built from OpenCL event profiling counters. Each duration is
// a difference of two cl_ulong timestamps; OpenCL reports these in
// nanoseconds, so dividing by 1e6 yields the milliseconds printed below.
// NOTE(review): profiling queries need a queue created with profiling enabled;
// the queue creation is not visible here — confirm.
// NOTE(review): the getProfilingInfo calls sit outside the ATH_MSG_DEBUG
// macros, so they are executed regardless of the active message level.

// Host-to-device write of the pixel input buffer.
248 cl_ulong pixel_input_time = evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
250 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << pixel_input_time / 1e6 <<
" ms");
// Host-to-device write of the strip input buffer.
253 cl_ulong strip_input_time = evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
255 ATH_MSG_DEBUG(
"Strip input buffer write time: " << strip_input_time / 1e6 <<
" ms");
// Pixel clustering: from the START of the start kernel to the END of the end kernel.
258 cl_ulong pixel_clustering_time = evt_pixel_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
260 ATH_MSG_DEBUG(
"Pixel clustering time: " << pixel_clustering_time / 1e6 <<
" ms");
// Strip clustering: same span, measured on the strip kernels.
263 cl_ulong strip_clustering_time = evt_strip_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
265 ATH_MSG_DEBUG(
"Strip clustering time: " << strip_clustering_time / 1e6 <<
" ms");
// Strip local-to-global kernel.
268 cl_ulong strip_l2g_time = evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_START>();
270 ATH_MSG_DEBUG(
"Strip L2G time: " << strip_l2g_time / 1e6 <<
" ms");
// EDM preparation kernels (pixel and strip).
273 cl_ulong pixel_edm_prep_time = evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
274 cl_ulong strip_edm_prep_time = evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
277 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << pixel_edm_prep_time / 1e6 <<
" ms");
280 ATH_MSG_DEBUG(
"StripEDMPrep time: " << strip_edm_prep_time / 1e6 <<
" ms");
// Overall kernel span: from the moment the pixel start-clustering command was
// QUEUED to the later of the two EDM-prep END timestamps.
// NOTE(review): using QUEUED rather than START includes queueing latency, and
// only the pixel start kernel is considered for the start point — confirm
// this is the intended definition.
284 cl_ulong kernel_start = evt_pixel_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
285 cl_ulong kernel_end =
std::max(evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>(), evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>());
287 ATH_MSG_DEBUG(
"Kernel execution time: " << (kernel_end - kernel_start) / 1e6 <<
" ms");
// Device-to-host read of the pixel output buffer.
290 cl_ulong pixel_output_time = evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
292 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << pixel_output_time / 1e6 <<
" ms");
// Device-to-host read of the strip output buffer.
295 cl_ulong strip_output_time = evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
297 ATH_MSG_DEBUG(
"Strip output buffer read time: " << strip_output_time / 1e6 <<
" ms");
299 return StatusCode::SUCCESS;
321 return StatusCode::SUCCESS;
// Log the device name and UUID recorded in the loaded xclbin.
329 ATH_MSG_INFO(
"fpga name: "<<xrt_xclbin.get_fpga_device_name());
330 ATH_MSG_INFO(
"uuid: "<<xrt_xclbin.get_uuid().to_string());
// Walk every kernel described by the xclbin.
332 for (
const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
333 const std::string& kernelName = kernel.get_name();
// computeUnit comes from an inner loop whose header is outside this excerpt.
339 const std::string& computeUnitName = computeUnit.get_name();
// Drop the leading kernel name plus one separator character, keeping only the
// instance part of the compute-unit name.
// NOTE(review): this assumes the compute-unit name is the kernel name, one
// separator character, then the instance name. substr throws
// std::out_of_range if the name is shorter than kernelName.size() + 1 —
// confirm the format or check the length first.
340 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
// Re-assemble the name in the "kernel:{instance}" form.
342 const std::string computeUnitUsableName = kernelName +
":{" + computeUnitIsolatedName +
"}";
// Collect the formatted name for the caller.
345 cuNames.push_back(computeUnitUsableName);