8 #include <xrt/xrt_bo.h>
9 #include <xrt/xrt_device.h>
10 #include <xrt/xrt_kernel.h>
11 #include <xrt/xrt_uuid.h>
// NOTE(review): the statements below are non-contiguous excerpts of a kernel
// set-up routine; the lines between them are not visible here.
// Names of the compute units for which cl::Kernel objects are created below.
41 std::vector<std::string> listofCUs;
// One iteration per thread index, 0 .. nthreads-1 (loop body not visible).
54 for(
unsigned int i = 0;
i < nthreads;
i++)
// Visit every compute-unit name; a name is assigned to a kernel list when it
// contains the corresponding configured kernel name as a substring.
82 for (
const auto& cuName: listofCUs)
// Strip L2G kernels: built from m_program under the compute unit's name.
94 else if(cuName.find(
m_stripL2GKernelName.value()) != std::string::npos) m_stripL2GKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Pixel EDM-prep kernels.
97 else if(cuName.find(
m_pixelEdmKernelName.value()) != std::string::npos) m_pixelEdmPrepKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Strip EDM-prep kernels.
99 else if(cuName.find(
m_stripEdmKernelName.value()) != std::string::npos) m_stripEdmPrepKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Guard: no pixel clustering kernel was collected.
113 if(m_pixelClusteringKernels.size()==0){
// Report failure to the caller.
115 return StatusCode::FAILURE;
// Report success to the caller.
126 return StatusCode::SUCCESS;
// NOTE(review): non-contiguous excerpt of the per-event processing routine;
// intermediate lines are not visible here.
// Start the total-time monitoring timer (stopped again once the output reads
// have completed).
135 mnt_timer_Total.start();
// Input word vectors; both start out null. Where they are assigned is not
// visible in this excerpt.
140 const std::vector<uint64_t>* pixelInput{
nullptr}, *stripInput{
nullptr};
// Map the event slot onto one of nthreads queue/buffer sets.
152 size_t bufferIndex = ctx.slot() % nthreads;
// Pick one kernel instance per stage by taking the slot number modulo the
// number of instances of that stage.
// NOTE(review): the next three moduli have no empty-container guard (unlike the
// three after them); an empty vector here would be a modulo by zero. Confirm
// that set-up guarantees all three vectors are non-empty.
155 size_t pixelClusterIndex = ctx.slot() % m_pixelClusteringKernels.size();
156 size_t stripClusterIndex = ctx.slot() % m_stripClusteringKernels.size();
157 size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
// Guarded variants: fall back to index 0 when the kernel list is empty.
158 size_t pixelL2GIndex = m_pixelL2GKernels.size() ? ctx.slot() % m_pixelL2GKernels.size() : 0;
159 size_t pixelEDMIndex = m_pixelEdmPrepKernels.size() ? ctx.slot() % m_pixelEdmPrepKernels.size() : 0;
160 size_t stripEDMIndex = m_stripEdmPrepKernels.size() ? ctx.slot() % m_stripEdmPrepKernels.size() : 0;
// Command queue belonging to the selected buffer set.
162 const cl::CommandQueue &acc_queue =
m_acc_queues[bufferIndex];
// References to the kernel instances chosen above.
164 cl::Kernel &pixelClusteringKernel = m_pixelClusteringKernels[pixelClusterIndex];
165 cl::Kernel &stripClusteringKernel = m_stripClusteringKernels[stripClusterIndex];
166 cl::Kernel &stripL2GKernel = m_stripL2GKernels[stripL2GIndex];
// NOTE(review): the EDM-prep indices fall back to 0 when their vector is empty,
// so element 0 of an empty vector would be accessed here in that case — confirm
// that these vectors are never empty.
167 cl::Kernel &pixelEdmPrepKernel = m_pixelEdmPrepKernels[pixelEDMIndex];
168 cl::Kernel &stripEdmPrepKernel = m_stripEdmPrepKernels[stripEDMIndex];
// The pixel L2G kernel is only selected when m_doF110 is false; otherwise the
// pointer stays null.
169 cl::Kernel *pixelL2GKernel =
nullptr;
171 if (!
m_doF110) pixelL2GKernel = &m_pixelL2GKernels[pixelL2GIndex];
// Byte size of one 64-bit word per pixel input word.
// NOTE(review): the size_t product is narrowed to uint32_t on assignment;
// confirm that inputs can never be large enough to overflow 32 bits.
182 uint32_t cluster_vector_size_bytes =
sizeof(
uint64_t) * (*pixelInput).size();
// Eight times the size computed above (eight 64-bit words per pixel input word).
183 uint32_t edm_vector_size_bytes =
sizeof(
uint64_t) * (*pixelInput).size() * 8;
// Pixel clustering kernel arguments 3-5 carry the three byte sizes.
// hit_vector_size_bytes is defined on a line not visible in this excerpt.
186 pixelClusteringKernel.setArg(3, hit_vector_size_bytes);
187 pixelClusteringKernel.setArg(4, cluster_vector_size_bytes);
188 pixelClusteringKernel.setArg(5, edm_vector_size_bytes);
// Strip clustering kernel argument 3 is the number of input words (an element
// count, not a byte size), passed as unsigned int.
194 stripClusteringKernel.setArg(3,
static_cast<unsigned int>((*stripInput).size()));
// Wait lists holding the input-write events; the clustering kernels below do
// not start before their respective input write has completed.
222 std::vector<cl::Event> evt_vec_pixel_input{evt_write_pixel_input};
223 std::vector<cl::Event> evt_vec_strip_input{evt_write_strip_input};
// Clustering stage: each task waits on its input write and signals its own event.
235 acc_queue.enqueueTask(pixelClusteringKernel, &evt_vec_pixel_input, &evt_pixel_clustering);
236 acc_queue.enqueueTask(stripClusteringKernel, &evt_vec_strip_input, &evt_strip_clustering);
238 std::vector<cl::Event> evt_vec_strip_clustering{evt_strip_clustering};
// Pixel L2G stage, chained after pixel clustering.
// NOTE(review): pixelL2GKernel is null when m_doF110 is set; the condition
// guarding this dereference is not visible in this excerpt — confirm it exists.
241 std::vector<cl::Event> evt_vec_pixel_clustering{evt_pixel_clustering};
242 acc_queue.enqueueTask(*pixelL2GKernel, &evt_vec_pixel_clustering, &evt_pixel_l2g);
// Strip L2G stage, chained after strip clustering.
244 acc_queue.enqueueTask(stripL2GKernel, &evt_vec_strip_clustering, &evt_strip_l2g);
// Pixel EDM prep waits on pixel clustering when m_doF110 is set, otherwise on
// the pixel L2G event.
246 std::vector<cl::Event> evt_vec_pixelEDM;
247 if(
m_doF110) evt_vec_pixelEDM.push_back(evt_pixel_clustering);
248 else evt_vec_pixelEDM.push_back(evt_pixel_l2g);
// Strip EDM prep waits on strip L2G.
249 std::vector<cl::Event> evt_vec_strip_l2g{evt_strip_l2g};
250 acc_queue.enqueueTask(pixelEdmPrepKernel, &evt_vec_pixelEDM, &evt_pixel_edm_prep);
251 acc_queue.enqueueTask(stripEdmPrepKernel, &evt_vec_strip_l2g, &evt_strip_edm_prep);
// Wait lists for the output reads: one EDM-prep event each.
258 std::vector<cl::Event> evt_vec_pixel_edm_prep;
259 std::vector<cl::Event> evt_vec_strip_edm_prep;
261 evt_vec_pixel_edm_prep.push_back(evt_pixel_edm_prep);
262 evt_vec_strip_edm_prep.push_back(evt_strip_edm_prep);
// Non-blocking (CL_FALSE) reads from offset 0 of the per-slot EDM output buffers
// into the host output vectors; the byte count is the vector's current size
// times sizeof(uint64_t).
273 acc_queue.enqueueReadBuffer(
m_edmPixelOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &evt_vec_pixel_edm_prep, &evt_pixel_cluster_output);
274 acc_queue.enqueueReadBuffer(
m_edmStripOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &evt_vec_strip_edm_prep, &evt_strip_cluster_output);
// Block the host until both output reads have completed.
276 std::vector<cl::Event> wait_for_reads = { evt_pixel_cluster_output, evt_strip_cluster_output };
277 cl::Event::waitForEvents(wait_for_reads);
// Stop the total-time monitoring timer.
279 mnt_timer_Total.stop();
// When an input holds exactly 6 words, the first output word is overwritten
// with 0.
// NOTE(review): the meaning of the 6-word case is not visible here — presumably
// an input without hits; confirm and consider naming the constant.
281 if(pixelInput->size() == 6) (*FPGAPixelOutput)[0] = 0;
282 if(stripInput->size() == 6) (*FPGAStripOutput)[0] = 0;
// Profiling report. Each duration is the event's END timestamp minus its START
// timestamp; OpenCL reports these in nanoseconds, so dividing by 1e6 gives the
// milliseconds printed below.
// NOTE(review): assumes the command queue was created with profiling enabled —
// confirm against the queue creation code.
// Pixel input transfer.
287 cl_ulong pixel_input_time = evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
289 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << pixel_input_time / 1e6 <<
" ms");
// Strip input transfer.
292 cl_ulong strip_input_time = evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
294 ATH_MSG_DEBUG(
"Strip input buffer write time: " << strip_input_time / 1e6 <<
" ms");
// Pixel clustering kernel.
297 cl_ulong pixel_clustering_time = evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
299 ATH_MSG_DEBUG(
"Pixel clustering time: " << pixel_clustering_time / 1e6 <<
" ms");
// Strip clustering kernel.
302 cl_ulong strip_clustering_time = evt_strip_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
304 ATH_MSG_DEBUG(
"Strip clustering time: " << strip_clustering_time / 1e6 <<
" ms");
// Pixel L2G kernel.
308 cl_ulong pixel_l2g_time = evt_pixel_l2g.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_l2g.getProfilingInfo<CL_PROFILING_COMMAND_START>();
310 ATH_MSG_DEBUG(
"Pixel L2G time: " << pixel_l2g_time / 1e6 <<
" ms");
// Strip L2G kernel.
314 cl_ulong strip_l2g_time = evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_START>();
316 ATH_MSG_DEBUG(
"Strip L2G time: " << strip_l2g_time / 1e6 <<
" ms");
// Separate pixel and strip EDM-prep kernels.
320 cl_ulong pixel_edm_prep_time = evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
321 cl_ulong strip_edm_prep_time = evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
324 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << pixel_edm_prep_time / 1e6 <<
" ms");
327 ATH_MSG_DEBUG(
"StripEDMPrep time: " << strip_edm_prep_time / 1e6 <<
" ms");
// Single EDM-prep event (distinct from the pixel/strip events above).
// NOTE(review): the declaration of evt_edm_prep and the condition selecting
// this path are not visible in this excerpt.
330 cl_ulong edm_prep_time = evt_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
333 ATH_MSG_DEBUG(
"EDMPrep time: " << edm_prep_time / 1e6 <<
" ms");
// Overall kernel window. The start is the QUEUED timestamp of pixel clustering
// (not START), so any wait in the queue is included in the reported time.
337 cl_ulong kernel_start = evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
// The end is either the later of the two EDM-prep END timestamps or the END of
// evt_edm_prep; the selecting condition is on a line not visible here.
339 std::max(evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>(), evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>()) :
340 evt_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>();
342 ATH_MSG_DEBUG(
"Kernel execution time: " << (kernel_end - kernel_start) / 1e6 <<
" ms");
// Pixel output transfer.
345 cl_ulong pixel_output_time = evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
347 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << pixel_output_time / 1e6 <<
" ms");
// Strip output transfer.
350 cl_ulong strip_output_time = evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
352 ATH_MSG_DEBUG(
"Strip output buffer read time: " << strip_output_time / 1e6 <<
" ms");
354 return StatusCode::SUCCESS;
383 return StatusCode::SUCCESS;
// NOTE(review): non-contiguous excerpt of the routine that enumerates compute
// units; the loop over a kernel's compute units is not visible here.
// Open the xclbin named by the m_xclbin property.
388 xrt::xclbin xrt_xclbin(
m_xclbin.value());
// Log the target device name and the xclbin UUID.
391 ATH_MSG_INFO(
"fpga name: "<<xrt_xclbin.get_fpga_device_name());
392 ATH_MSG_INFO(
"uuid: "<<xrt_xclbin.get_uuid().to_string());
// Walk every kernel contained in the xclbin.
394 for (
const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
395 const std::string& kernelName = kernel.get_name();
// Full compute-unit name as reported by XRT.
401 const std::string& computeUnitName = computeUnit.get_name();
// Drop the first kernelName.size() + 1 characters to isolate the instance part.
// NOTE(review): assumes the name is "<kernelName><one separator char><instance>";
// substr throws std::out_of_range if the name is shorter than that — confirm.
402 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
// Rebuild the name as "<kernelName>:{<instance>}".
404 const std::string computeUnitUsableName = kernelName +
":{" + computeUnitIsolatedName +
"}";
// Collect the name for the caller.
407 cuNames.push_back(computeUnitUsableName);