8 #include <xrt/xrt_bo.h>
9 #include <xrt/xrt_device.h>
10 #include <xrt/xrt_kernel.h>
11 #include <xrt/xrt_uuid.h>
43 std::vector<std::string> listofCUs;
56 for(
unsigned int i = 0;
i < nthreads;
i++)
82 for (
const auto& cuName: listofCUs)
90 else if(cuName.find(
m_stripL2GKernelName.value()) != std::string::npos) m_stripL2GKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
93 else if(cuName.find(
m_pixelEdmKernelName.value()) != std::string::npos) m_pixelEdmPrepKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
94 else if(cuName.find(
m_stripEdmKernelName.value()) != std::string::npos) m_stripEdmPrepKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
99 else if(cuName.find(
m_insideOutInputName.value()) != std::string::npos) m_insideOutInputKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
100 else if(cuName.find(
m_insideOutOutputName.value()) != std::string::npos) m_insideOutOutputKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
117 if(m_pixelClusteringKernels.size()==0){
119 return StatusCode::FAILURE;
130 return StatusCode::SUCCESS;
136 auto withEvt = [&](
const std::string&
fname) {
137 const auto evt = ctx.eventID().event_number();
139 if (
dot == std::string::npos) {
147 std::ofstream
outputFile(withEvt(dataDescriptor));
150 outputFile << std::hex << std::setw(16) << std::setfill(
'0') <<
d <<
'\n';
164 mnt_timer_Total.start();
169 const std::vector<uint64_t>* pixelInput{
nullptr}, *stripInput{
nullptr};
181 size_t bufferIndex = ctx.slot() % nthreads;
184 size_t pixelClusterIndex = ctx.slot() % m_pixelClusteringKernels.size();
185 size_t stripClusterIndex = ctx.slot() % m_stripClusteringKernels.size();
186 size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
187 size_t pixelEDMIndex = ctx.slot() % m_pixelEdmPrepKernels.size();
188 size_t stripEDMIndex = ctx.slot() % m_stripEdmPrepKernels.size();
189 size_t slicingOutIndex = ctx.slot() % m_slicingEngineOutputKernels.size();
190 size_t insideOutInputIndex = ctx.slot() % m_insideOutInputKernels.size();
191 size_t insideOutOutputIndex = ctx.slot() % m_insideOutOutputKernels.size();
193 const cl::CommandQueue &acc_queue =
m_acc_queues[bufferIndex];
195 cl::Kernel &pixelClusteringKernel = m_pixelClusteringKernels[pixelClusterIndex];
196 cl::Kernel &stripClusteringKernel = m_stripClusteringKernels[stripClusterIndex];
197 cl::Kernel &stripL2GKernel = m_stripL2GKernels[stripL2GIndex];
198 cl::Kernel &pixelEdmPrepKernel = m_pixelEdmPrepKernels[pixelEDMIndex];
199 cl::Kernel &stripEdmPrepKernel = m_stripEdmPrepKernels[stripEDMIndex];
200 cl::Kernel &slicingEngineOutputKernel = m_slicingEngineOutputKernels[slicingOutIndex];
201 cl::Kernel &insideOutInputKernel = m_insideOutInputKernels[insideOutInputIndex];
202 cl::Kernel &insideOutOutputKernel = m_insideOutOutputKernels[insideOutOutputIndex];
212 stripClusteringKernel.setArg(3,
static_cast<unsigned int>((*stripInput).size()));
234 acc_queue.enqueueWriteBuffer(
m_pixelClusterInputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*pixelInput).size(), (*pixelInput).data(),
nullptr, &evt_write_pixel_input);
235 acc_queue.enqueueWriteBuffer(
m_stripClusterInputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*stripInput).size(), (*stripInput).data(),
nullptr, &evt_write_strip_input);
236 std::vector<cl::Event> evt_vec_pixel_input{evt_write_pixel_input};
237 std::vector<cl::Event> evt_vec_strip_input{evt_write_strip_input};
252 acc_queue.enqueueTask(pixelClusteringKernel, &evt_vec_pixel_input, &evt_pixel_clustering);
253 acc_queue.enqueueTask(stripClusteringKernel, &evt_vec_strip_input, &evt_strip_clustering);
255 std::vector<cl::Event> evt_vec_strip_clustering{evt_strip_clustering};
256 acc_queue.enqueueTask(stripL2GKernel, &evt_vec_strip_clustering, &evt_strip_l2g);
258 std::vector<cl::Event> evt_vec_pixelEDM{evt_pixel_clustering};
259 std::vector<cl::Event> evt_vec_strip_l2g{evt_strip_l2g};
260 acc_queue.enqueueTask(pixelEdmPrepKernel, &evt_vec_pixelEDM, &evt_pixel_edm_prep);
261 acc_queue.enqueueTask(stripEdmPrepKernel, &evt_vec_strip_l2g, &evt_strip_edm_prep);
263 acc_queue.enqueueTask(slicingEngineOutputKernel,
nullptr, &evt_slicing_done);
267 std::vector<cl::Event> evt_vec_slicing{evt_slicing_done};
268 acc_queue.enqueueTask(insideOutInputKernel, &evt_vec_slicing, &evt_insideoutInput_done);
269 acc_queue.enqueueTask(insideOutOutputKernel,
nullptr, &evt_insideoutOutput_done);
277 std::vector<cl::Event> evt_vec_pixel_edm_prep {evt_pixel_edm_prep};
278 std::vector<cl::Event> evt_vec_strip_edm_prep {evt_strip_edm_prep};
279 std::vector<cl::Event> evt_vec_insideout_output {evt_insideoutOutput_done};
293 acc_queue.enqueueReadBuffer(
m_edmPixelOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &evt_vec_pixel_edm_prep, &evt_pixel_cluster_output);
294 acc_queue.enqueueReadBuffer(
m_edmStripOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &evt_vec_strip_edm_prep, &evt_strip_cluster_output);
295 acc_queue.enqueueReadBuffer(
m_insideOutOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGATrackOutput).size(), (*FPGATrackOutput).data(), &evt_vec_insideout_output, &evt_track_output);
297 std::vector<cl::Event> wait_for_reads = { evt_pixel_cluster_output, evt_strip_cluster_output, evt_track_output };
298 cl::Event::waitForEvents(wait_for_reads);
301 dumpHexData((*FPGATrackOutput),
"HW_F150i_Stream_insideOut.txt", ctx);
307 auto bufferElemCount = [](
const cl::Buffer&
b) ->
size_t {
309 b.getInfo(CL_MEM_SIZE, &bytes);
330 std::vector<cl::Event> deps_pixel_clust_edm { evt_pixel_clustering };
331 std::vector<cl::Event> deps_strip_clust { evt_strip_clustering };
332 std::vector<cl::Event> deps_strip_l2g { evt_strip_l2g };
333 std::vector<cl::Event> deps_slicing { evt_slicing_done };
336 acc_queue.enqueueReadBuffer(
338 sizeof(
uint64_t) * pixelClusterEDMOut.size(), pixelClusterEDMOut.data(),
339 &deps_pixel_clust_edm, &evt_read_pixel_cluster_edm);
341 acc_queue.enqueueReadBuffer(
343 sizeof(
uint64_t) * stripClusterOut.size(), stripClusterOut.data(),
344 &deps_strip_clust, &evt_read_strip_cluster);
346 acc_queue.enqueueReadBuffer(
348 sizeof(
uint64_t) * stripClusterEDMOut.size(), stripClusterEDMOut.data(),
349 &deps_strip_clust, &evt_read_strip_cluster_edm);
351 acc_queue.enqueueReadBuffer(
353 sizeof(
uint64_t) * stripL2GOut.size(), stripL2GOut.data(),
354 &deps_strip_l2g, &evt_read_strip_l2g);
356 acc_queue.enqueueReadBuffer(
358 sizeof(
uint64_t) * stripL2GEDMOut.size(), stripL2GEDMOut.data(),
359 &deps_strip_l2g, &evt_read_strip_l2g_edm);
361 acc_queue.enqueueReadBuffer(
363 sizeof(
uint64_t) * slicingEngineOut.size(), slicingEngineOut.data(),
364 &deps_slicing, &evt_read_slicing_out);
367 std::vector<cl::Event> all_reads = {
369 evt_pixel_cluster_output, evt_strip_cluster_output, evt_track_output,
371 evt_read_pixel_cluster_edm,
372 evt_read_strip_cluster, evt_read_strip_cluster_edm,
373 evt_read_strip_l2g, evt_read_strip_l2g_edm,
376 cl::Event::waitForEvents(all_reads);
379 dumpHexData((*pixelInput),
"HW_F150i_Stream_pixelInput_event.txt", ctx);
380 dumpHexData((*stripInput),
"HW_F150i_Stream_stripInput_event.txt", ctx);
383 dumpHexData((*FPGAPixelOutput),
"HW_F150i_Stream_pixelEDM_event.txt", ctx);
384 dumpHexData((*FPGAStripOutput),
"HW_F150i_Stream_stripEDM_event.txt", ctx);
387 dumpHexData(pixelClusterEDMOut,
"HW_F150i_Stream_pixelClusterEDM_event.txt", ctx);
388 dumpHexData(stripClusterOut,
"HW_F150i_Stream_stripCluster_event.txt", ctx);
389 dumpHexData(stripClusterEDMOut,
"HW_F150i_Stream_stripClusterEDM_event.txt", ctx);
390 dumpHexData(stripL2GOut,
"HW_F150i_Stream_stripL2G_event.txt", ctx);
391 dumpHexData(stripL2GEDMOut,
"HW_F150i_Stream_stripL2GEDM_event.txt", ctx);
392 dumpHexData(slicingEngineOut,
"HW_F150i_Stream_slicingEngineOut_event.txt", ctx);
396 mnt_timer_Total.stop();
398 if(pixelInput->size() == 6) (*FPGAPixelOutput)[0] = 0;
399 if(stripInput->size() == 6) (*FPGAStripOutput)[0] = 0;
404 cl_ulong pixel_input_time = evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
406 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << pixel_input_time / 1e6 <<
" ms");
409 cl_ulong strip_input_time = evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
411 ATH_MSG_DEBUG(
"Strip input buffer write time: " << strip_input_time / 1e6 <<
" ms");
414 cl_ulong pixel_clustering_time = evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
416 ATH_MSG_DEBUG(
"Pixel clustering time: " << pixel_clustering_time / 1e6 <<
" ms");
419 cl_ulong strip_clustering_time = evt_strip_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
421 ATH_MSG_DEBUG(
"Strip clustering time: " << strip_clustering_time / 1e6 <<
" ms");
424 cl_ulong strip_l2g_time = evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_l2g.getProfilingInfo<CL_PROFILING_COMMAND_START>();
426 ATH_MSG_DEBUG(
"Strip L2G time: " << strip_l2g_time / 1e6 <<
" ms");
428 cl_ulong pixel_edm_prep_time = evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
429 cl_ulong strip_edm_prep_time = evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_START>();
432 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << pixel_edm_prep_time / 1e6 <<
" ms");
435 ATH_MSG_DEBUG(
"StripEDMPrep time: " << strip_edm_prep_time / 1e6 <<
" ms");
439 cl_ulong kernel_start = evt_pixel_clustering.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
440 cl_ulong kernel_end =
std::max(evt_pixel_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>(), evt_strip_edm_prep.getProfilingInfo<CL_PROFILING_COMMAND_END>());
442 ATH_MSG_DEBUG(
"Kernel execution time: " << (kernel_end - kernel_start) / 1e6 <<
" ms");
445 cl_ulong pixel_output_time = evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
447 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << pixel_output_time / 1e6 <<
" ms");
450 cl_ulong strip_output_time = evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
452 ATH_MSG_DEBUG(
"Strip output buffer read time: " << strip_output_time / 1e6 <<
" ms");
454 return StatusCode::SUCCESS;
476 return StatusCode::SUCCESS;
481 xrt::xclbin xrt_xclbin(
m_xclbin.value());
484 ATH_MSG_INFO(
"fpga name: "<<xrt_xclbin.get_fpga_device_name());
485 ATH_MSG_INFO(
"uuid: "<<xrt_xclbin.get_uuid().to_string());
487 for (
const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
488 const std::string& kernelName = kernel.get_name();
494 const std::string& computeUnitName = computeUnit.get_name();
495 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
497 const std::string computeUnitUsableName = kernelName +
":{" + computeUnitIsolatedName +
"}";
500 cuNames.push_back(computeUnitUsableName);