8 #include <xrt/xrt_bo.h>
9 #include <xrt/xrt_device.h>
10 #include <xrt/xrt_kernel.h>
11 #include <xrt/xrt_uuid.h>
// Names of the compute units to turn into kernels. The code that fills this
// list is not visible in this excerpt.
44 std::vector<std::string> listofCUs;
// Walk every compute unit name and sort it into the kernel list whose
// configured kernel name occurs as a substring of the compute unit name.
// The first branches of the if/else-if chain are not visible in this excerpt.
48 for (
const auto& cuName: listofCUs)
// Strip L2G kernel: create a cl::Kernel for this compute unit from m_program.
57 else if(cuName.find(
m_stripL2GKernelName.value()) != std::string::npos) m_stripL2GKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Pixel EDM kernel.
60 else if(cuName.find(
m_pixelEdmKernelName.value()) != std::string::npos) m_pixelEDMKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Strip EDM kernel.
// NOTE(review): a name that matches none of the branches appends nothing, so a
// kernel list can end up empty; later code divides by these list sizes.
62 else if(cuName.find(
m_stripEdmKernelName.value()) != std::string::npos) m_stripEDMKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
// Allocate one "end event" slot per kernel instance. Each slot list has the
// same length as the kernel list it belongs to, so it must be indexed with the
// index used for that kernel list.
80 m_stripClusterEndEvents.resize(m_stripClusterKernels.size());
81 m_stripL2GEndEvents.resize(m_stripL2GKernels.size());
82 m_stripEDMEndEvents.resize(m_stripEDMKernels.size());
85 m_pixelClusterEndEvents.resize(m_pixelClusterKernels.size());
86 m_pixelEDMEndEvents.resize(m_pixelEDMKernels.size());
// Loop over nthreads; the loop body is not visible in this excerpt
// (presumably per-thread buffer allocation -- TODO confirm).
95 for(
unsigned int i = 0;
i < nthreads;
i++)
// A non-zero err fails the call; otherwise report success. Where err is set
// is not visible in this excerpt.
116 if (
err != 0)
return StatusCode::FAILURE;
117 return StatusCode::SUCCESS;
// Event list being assembled; presumably this is the body of getDepVector,
// whose result callers pass to enqueue calls as the wait list -- TODO confirm.
121 std::vector<cl::Event> deps;
// Append the selected event. Any condition guarding this push is not visible
// in this excerpt.
128 deps.push_back(
event);
151 size_t bufferIndex = ctx.slot() % nthreads;
154 size_t pixelClusterIndex = ctx.slot() % m_pixelClusterKernels.size();
155 size_t stripClusterIndex = ctx.slot() % m_stripClusterKernels.size();
156 size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
157 size_t pixelEDMIndex = ctx.slot() % m_pixelEDMKernels.size();
158 size_t stripEDMIndex = ctx.slot() % m_stripEDMKernels.size();
165 ATH_MSG_DEBUG(
"F100 Thread number "<<ctx.slot()<<
" running on buffer "<<bufferIndex<<
" pixelClusterIndex: "<< pixelClusterIndex<<
" stripClusterIndex: "<< stripClusterIndex<<
" stripL2GIndex: "<< stripL2GIndex<<
" pixelEDMIndex: "<< pixelEDMIndex<<
" stripEDMIndex: "<< stripEDMIndex);
// Pick the device buffers belonging to this slot's buffer set. All buffer
// lists are indexed with the same bufferIndex.
169 cl::Buffer pixelClusterInputBuffer = m_pixelClusterInputBufferList[bufferIndex];
170 cl::Buffer stripClusterInputBuffer = m_stripClusterInputBufferList[bufferIndex];
171 cl::Buffer stripClusterOutputBuffer = m_stripClusterOutputBufferList[bufferIndex];
172 cl::Buffer pixelClusterEDMOutputBuffer = m_pixelClusterEDMOutputBufferList[bufferIndex];
173 cl::Buffer stripClusterEDMOutputBuffer = m_stripClusterEDMOutputBufferList[bufferIndex];
174 cl::Buffer stripL2GOutputBuffer = m_stripL2GOutputBufferList[bufferIndex];
175 cl::Buffer stripL2GEDMOutputBuffer = m_stripL2GEDMOutputBufferList[bufferIndex];
176 cl::Buffer edmPixelOutputBuffer = m_edmPixelOutputBufferList[bufferIndex];
177 cl::Buffer edmStripOutputBuffer = m_edmStripOutputBufferList[bufferIndex];
// Pick the kernel instances for this slot. They are bound by reference, so the
// setArg calls below modify the member kernel objects themselves.
// NOTE(review): two slots that map to the same kernel index share one kernel
// object; check that concurrent setArg/enqueue on it is serialised elsewhere.
181 cl::Kernel &pixelClusteringKernel = m_pixelClusterKernels[pixelClusterIndex];
182 cl::Kernel &pixelEdmPrepKernel = m_pixelEDMKernels[pixelEDMIndex];
184 cl::Kernel &stripClusteringKernel = m_stripClusterKernels[stripClusterIndex];
185 cl::Kernel &stripL2GKernel = m_stripL2GKernels[stripL2GIndex];
186 cl::Kernel &stripEdmPrepKernel = m_stripEDMKernels[stripEDMIndex];
// Pixel clustering: argument 0 is the pixel input buffer, argument 1 the
// buffer that the pixel EDM-prep kernel later takes as its argument 0.
190 pixelClusteringKernel.setArg<cl::Buffer>(0, pixelClusterInputBuffer);
191 pixelClusteringKernel.setArg<cl::Buffer>(1, pixelClusterEDMOutputBuffer);
// Strip clustering: input buffer, two output buffers, and the number of
// elements in the strip input container.
193 stripClusteringKernel.setArg<cl::Buffer>(0, stripClusterInputBuffer);
194 stripClusteringKernel.setArg<cl::Buffer>(1, stripClusterOutputBuffer);
195 stripClusteringKernel.setArg<cl::Buffer>(2, stripClusterEDMOutputBuffer);
196 stripClusteringKernel.setArg<
unsigned int>(3, (*stripInput).size());
// Strip L2G: takes both strip clustering buffers as arguments 0-1 and its own
// two buffers as arguments 2-3.
198 stripL2GKernel.setArg<cl::Buffer>(0, stripClusterOutputBuffer);
199 stripL2GKernel.setArg<cl::Buffer>(1, stripClusterEDMOutputBuffer);
200 stripL2GKernel.setArg<cl::Buffer>(2, stripL2GOutputBuffer);
201 stripL2GKernel.setArg<cl::Buffer>(3, stripL2GEDMOutputBuffer);
// EDM preparation: the pixel chain feeds from the pixel clustering buffer, the
// strip chain from the L2G EDM buffer; argument 1 is the buffer read back to
// the host later on.
203 pixelEdmPrepKernel.setArg<cl::Buffer>(0, pixelClusterEDMOutputBuffer);
204 pixelEdmPrepKernel.setArg<cl::Buffer>(1, edmPixelOutputBuffer);
205 stripEdmPrepKernel.setArg<cl::Buffer>(0, stripL2GEDMOutputBuffer);
206 stripEdmPrepKernel.setArg<cl::Buffer>(1, edmStripOutputBuffer);
// Build the wait lists for the input transfers from the end event recorded for
// the clustering kernel instance this slot is about to reuse.
211 std::vector<cl::Event> writePixelInputDeps =
getDepVector(m_pixelClusterEndEvents, pixelClusterIndex);
212 std::vector<cl::Event> writeStripInputDeps =
getDepVector(m_stripClusterEndEvents, stripClusterIndex);
// Non-blocking (CL_FALSE) host-to-device copies of the input containers. The
// byte count is the element count times sizeof(uint64_t).
// NOTE(review): assumes the input containers hold 64-bit words -- confirm.
216 m_acc_queue.enqueueWriteBuffer(pixelClusterInputBuffer, CL_FALSE, 0,
sizeof(
uint64_t) * (*pixelInput).size(), (*pixelInput).data(), &writePixelInputDeps, &writePixelInputEvt);
217 m_acc_queue.enqueueWriteBuffer(stripClusterInputBuffer, CL_FALSE, 0,
sizeof(
uint64_t) * (*stripInput).size(), (*stripInput).data(), &writeStripInputDeps, &writeStripInputEvt);
// Each clustering kernel waits only for its own input transfer.
219 std::vector<cl::Event> pixelClusteringDeps = { writePixelInputEvt };
220 std::vector<cl::Event> stripClusteringDeps = { writeStripInputEvt };
234 m_acc_queue.enqueueTask(pixelClusteringKernel, &pixelClusteringDeps, &pixelClusteringEvt);
235 m_acc_queue.enqueueTask(stripClusteringKernel, &stripClusteringDeps, &stripClusteringEvt);
// Record the new end events so the next user of these kernel instances can
// wait on them.
238 m_pixelClusterEndEvents[pixelClusterIndex] = pixelClusteringEvt;
239 m_stripClusterEndEvents[stripClusterIndex] = stripClusteringEvt;
// Chain the downstream kernels. Each launch waits on
//   (a) the end event recorded for the kernel instance it is about to reuse, and
//   (b) the upstream kernel of this event's processing chain.
//
// The end-event lists are sized per kernel list (one slot per L2G / pixel EDM /
// strip EDM kernel instance), so they must be indexed with the index that
// selected that kernel instance. Using the clustering indices instead would
// track the wrong instance and can run past the end of the list whenever the
// kernel counts differ.
241 std::vector<cl::Event> stripL2GDeps =
getDepVector(m_stripL2GEndEvents, stripL2GIndex);
242 stripL2GDeps.push_back(stripClusteringEvt);
244 m_acc_queue.enqueueTask(stripL2GKernel, &stripL2GDeps, &stripL2GEvt);
// Remember this launch for the next user of the same L2G kernel instance.
246 m_stripL2GEndEvents[stripL2GIndex] = stripL2GEvt;
// Pixel EDM prep follows pixel clustering.
249 std::vector<cl::Event> pixelEdmPrepDeps =
getDepVector(m_pixelEDMEndEvents, pixelEDMIndex);
250 pixelEdmPrepDeps.push_back(pixelClusteringEvt);
// Strip EDM prep follows strip L2G.
253 std::vector<cl::Event> stripEdmPrepDeps =
getDepVector(m_stripEDMEndEvents, stripEDMIndex);
254 stripEdmPrepDeps.push_back(stripL2GEvt);
// NOTE(review): no store into m_pixelEDMEndEvents / m_stripEDMEndEvents is
// visible in this excerpt; confirm the EDM-prep events are recorded after
// these launches, otherwise the wait lists above never see a previous launch.
256 m_acc_queue.enqueueTask(stripEdmPrepKernel, &stripEdmPrepDeps, &stripEdmPrepEvt);
257 m_acc_queue.enqueueTask(pixelEdmPrepKernel, &pixelEdmPrepDeps, &pixelEdmPrepEvt);
// The read-backs wait for the EDM-prep kernel of their own chain.
264 std::vector<cl::Event> readPixelOutputDeps;
265 std::vector<cl::Event> readStripOutputDeps;
267 readPixelOutputDeps.push_back(pixelEdmPrepEvt);
268 readStripOutputDeps.push_back(stripEdmPrepEvt);
// Non-blocking (CL_FALSE) device-to-host copies into the output containers.
// The amount read is the current size of each output container, so the
// containers must already be sized by the time this runs.
277 m_acc_queue.enqueueReadBuffer(edmPixelOutputBuffer, CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &readPixelOutputDeps, &readPixelOutputEvt);
278 m_acc_queue.enqueueReadBuffer(edmStripOutputBuffer, CL_FALSE, 0,
sizeof(
uint64_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &readStripOutputDeps, &readStripOutputEvt);
// Block the calling thread until both read-backs have completed; the host
// containers are only touched after this point.
286 std::vector<cl::Event> terminationDeps = { readPixelOutputEvt, readStripOutputEvt };
287 cl::Event::waitForEvents(terminationDeps);
// Force the first output word to zero when the input has exactly 6 elements.
// NOTE(review): presumably a 6-word input denotes an event with no hits and
// word 0 carries the cluster count -- confirm; also assumes the output
// containers are non-empty.
289 if(pixelInput->size() == 6) (*FPGAPixelOutput)[0] = 0;
290 if(stripInput->size() == 6) (*FPGAStripOutput)[0] = 0;
// Timing report. Each duration is END minus START of the corresponding event;
// OpenCL reports these counters in nanoseconds, so dividing by 1e6 gives the
// milliseconds printed in the messages.
// NOTE(review): the profiling queries run even when DEBUG output is disabled,
// and they presumably require the queue to have been created with profiling
// enabled -- confirm.
295 cl_ulong pixel_input_time = writePixelInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - writePixelInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
297 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << pixel_input_time / 1e6 <<
" ms");
300 cl_ulong strip_input_time = writeStripInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - writeStripInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
302 ATH_MSG_DEBUG(
"Strip input buffer write time: " << strip_input_time / 1e6 <<
" ms");
305 cl_ulong pixel_clustering_time = pixelClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - pixelClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
307 ATH_MSG_DEBUG(
"Pixel clustering time: " << pixel_clustering_time / 1e6 <<
" ms");
310 cl_ulong strip_clustering_time = stripClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - stripClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
312 ATH_MSG_DEBUG(
"Strip clustering time: " << strip_clustering_time / 1e6 <<
" ms");
316 cl_ulong strip_l2g_time = stripL2GEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - stripL2GEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
318 ATH_MSG_DEBUG(
"Strip L2G time: " << strip_l2g_time / 1e6 <<
" ms");
322 cl_ulong pixel_edm_prep_time = pixelEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - pixelEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
323 cl_ulong strip_edm_prep_time = stripEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - stripEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
326 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << pixel_edm_prep_time / 1e6 <<
" ms");
329 ATH_MSG_DEBUG(
"StripEDMPrep time: " << strip_edm_prep_time / 1e6 <<
" ms");
// Overall kernel span: from the moment pixel clustering was queued to the
// later of the two EDM-prep completions.
// NOTE(review): only the pixel chain's queue time is used as the start; the
// strip clustering queue time is not considered.
332 cl_ulong kernel_start = pixelClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
333 cl_ulong kernel_end =
std::max(pixelEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>(), stripEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>());
335 ATH_MSG_DEBUG(
"Kernel execution time: " << (kernel_end - kernel_start) / 1e6 <<
" ms");
338 cl_ulong pixel_output_time = readPixelOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - readPixelOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
340 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << pixel_output_time / 1e6 <<
" ms");
343 cl_ulong strip_output_time = readStripOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - readStripOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
345 ATH_MSG_DEBUG(
"Strip output buffer read time: " << strip_output_time / 1e6 <<
" ms");
// All transfers and kernels have completed by the time this is reached.
348 return StatusCode::SUCCESS;
370 return StatusCode::SUCCESS;
// Log which device and binary the xclbin describes.
378 ATH_MSG_INFO(
"fpga name: "<<xrt_xclbin.get_fpga_device_name());
379 ATH_MSG_INFO(
"uuid: "<<xrt_xclbin.get_uuid().to_string());
// Enumerate the kernels in the xclbin. The inner loop over each kernel's
// compute units is not visible in this excerpt.
381 for (
const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
382 const std::string& kernelName = kernel.get_name();
388 const std::string& computeUnitName = computeUnit.get_name();
// Drop the leading kernel name plus one separator character to isolate the
// compute unit's own name.
// NOTE(review): substr throws std::out_of_range if the compute unit name is
// shorter than kernelName.size() + 1; this assumes names of the form
// "<kernel><separator><instance>" -- confirm.
389 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
// Rebuild the name as "<kernel>:{<instance>}".
391 const std::string computeUnitUsableName = kernelName +
":{" + computeUnitIsolatedName +
"}";
394 cuNames.push_back(computeUnitUsableName);