9 #include <xrt/xrt_bo.h>
10 #include <xrt/xrt_device.h>
11 #include <xrt/xrt_kernel.h>
12 #include <xrt/xrt_uuid.h>
/// Elapsed time from @p a to @p b, in whole nanoseconds.
///
/// @param a  Start of the interval.
/// @param b  End of the interval.
/// @return   `b - a` in nanoseconds. Because the result is unsigned, an
///           interval whose end does not lie after its start (b <= a) is
///           reported as 0 instead of wrapping around to a value near 2^64.
static inline uint64_t ns_between(
    const std::chrono::steady_clock::time_point& a,
    const std::chrono::steady_clock::time_point& b)
{
  // Guard against a reversed interval: casting a negative tick count to
  // uint64_t would otherwise produce a huge, meaningless duration.
  if (b <= a) {
    return 0;
  }
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(b - a).count());
}
// Log that this code path uses the FPGA accelerator through the XRT native API.
30 ATH_MSG_INFO(
"Running on the FPGA accelerator (XRT native)");
// Names of the compute units (CUs) to iterate over below. How the list is
// filled is not visible in this excerpt.
56 std::vector<std::string> listofCUs;
// Visit every CU name in the list.
60 for (
const auto& cuName : listofCUs) {
// Report the exception `e` (presumably caught by a handler that is not visible
// in this excerpt) together with the offending CU name, then return FAILURE.
77 ATH_MSG_ERROR(
"Failed to create kernel for CU '" << cuName <<
"': " <<
e.what());
78 return StatusCode::FAILURE;
// choose: returns a pointer to the first kernel in `ks`, or nullptr when the
// vector is empty.
95 auto choose = [](
const std::vector<xrt::kernel>& ks) ->
const xrt::kernel* {
96 return ks.empty() ? nullptr : &ks.front();
// First kernel of each kind (pixel/strip clustering, pixel/strip L2G,
// pixel/strip EDM prep). Any of these pointers is null when the corresponding
// member vector is empty.
99 const xrt::kernel* kPC = choose(m_pixelClusteringKernels);
100 const xrt::kernel* kSC = choose(m_stripClusteringKernels);
101 const xrt::kernel* kPL2G = choose(m_pixelL2GKernels);
102 const xrt::kernel* kSL2G = choose(m_stripL2GKernels);
103 const xrt::kernel* kPEDM = choose(m_pixelEdmPrepKernels);
104 const xrt::kernel* kSEDM = choose(m_stripEdmPrepKernels);
// gid: result of k->group_id(arg_index) for kernel `k`; yields 0 when `k` is
// null instead of dereferencing it.
106 auto gid = [](
const xrt::kernel*
k,
unsigned arg_index)->
unsigned {
107 return k ?
k->group_id(arg_index) : 0;
// Runs once per index in [0, nthreads). The loop body is not visible in this
// excerpt.
110 for (
unsigned i = 0;
i < nthreads; ++
i) {
139 return StatusCode::SUCCESS;
// Pointers to the pixel/strip input word vectors and to their sizes. They
// start out null here; where they are assigned is not visible in this excerpt.
148 const std::vector<uint64_t>* pixelInput{
nullptr};
149 const std::vector<uint64_t>* stripInput{
nullptr};
153 const int* pixelInputSize{
nullptr}, *stripInputSize{
nullptr};
// Fold the event slot number onto the available buffers with a modulo.
160 const size_t bufferIndex = ctx.slot() % nthreads;
// NOTE(review): the next three modulo operations have no empty() guard, unlike
// the three that follow them. If one of these kernel vectors were empty the
// divisor would be zero (undefined behaviour) — confirm that the vectors are
// guaranteed non-empty by the time this runs.
163 const size_t pixelClusterIndex = ctx.slot() % m_pixelClusteringKernels.size();
164 const size_t stripClusterIndex = ctx.slot() % m_stripClusteringKernels.size();
165 const size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
// Guarded variants: index 0 is used when the kernel vector is empty. The
// lines after these are the tail of a log statement (its opening is not
// visible in this excerpt) that prints all chosen indices.
166 const size_t pixelL2GIndex = m_pixelL2GKernels.empty() ? 0 : (ctx.slot() % m_pixelL2GKernels.size());
167 const size_t pixelEDMIndex = m_pixelEdmPrepKernels.empty() ? 0 : (ctx.slot() % m_pixelEdmPrepKernels.size());
168 const size_t stripEDMIndex = m_stripEdmPrepKernels.empty() ? 0 : (ctx.slot() % m_stripEdmPrepKernels.size());
171 <<
" running on buffer " << bufferIndex
172 <<
" pixelClusterIndex: " << pixelClusterIndex
173 <<
" stripClusterIndex: " << stripClusterIndex
174 <<
" stripL2GIndex: " << stripL2GIndex
175 <<
" pixelL2GIndex: " << pixelL2GIndex
176 <<
" pixelEDMIndex: " << pixelEDMIndex
177 <<
" stripEDMIndex: " << stripEDMIndex);
// Pick the buffer objects (BOs) at position bufferIndex in each per-stage list.
// Input BOs for pixel and strip clustering.
180 auto& bo_pix_in = m_pixelClusterInputBOList[bufferIndex];
181 auto& bo_str_in = m_stripClusterInputBOList[bufferIndex];
// Pixel cluster output BO: only taken when m_doF110 is false, otherwise the
// pointer is null and must not be dereferenced.
183 xrt::bo* bo_pix_cl_out = (!
m_doF110) ? &m_pixelClusterOutputBOList[bufferIndex] :
nullptr;
184 auto& bo_pix_cl_edm = m_pixelClusterEDMOutputBOList[bufferIndex];
// Strip clustering output BOs.
186 auto& bo_str_cl = m_stripClusterOutputBOList[bufferIndex];
187 auto& bo_str_cl_edm = m_stripClusterEDMOutputBOList[bufferIndex];
// Pixel L2G output BOs: same rule as above, null when m_doF110 is true.
189 xrt::bo* bo_pix_l2g_out = (!
m_doF110) ? &m_pixelL2GOutputBOList[bufferIndex] :
nullptr;
190 xrt::bo* bo_pix_l2g_edm = (!
m_doF110) ? &m_pixelL2GEDMOutputBOList[bufferIndex] :
nullptr;
// Strip L2G output BOs.
192 auto& bo_str_l2g_out = m_stripL2GOutputBOList[bufferIndex];
193 auto& bo_str_l2g_edm = m_stripL2GEDMOutputBOList[bufferIndex];
// Final EDM output BOs; these are the ones read back to the host later on.
195 auto& bo_pix_edm_cont = m_edmPixelOutputBOList[bufferIndex];
196 auto& bo_str_edm_cont = m_edmStripOutputBOList[bufferIndex];
// Copy all pixel input words (size() * sizeof(uint64_t) bytes) into the pixel
// input BO at offset 0, then sync the BO from host to device.
200 bo_pix_in.write(pixelInput->data(), pixelInput->size() *
sizeof(
uint64_t), 0);
201 bo_pix_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
// Elapsed time is printed in milliseconds (nanoseconds / 1e6). The time points
// are taken on lines not visible in this excerpt.
204 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << (ns_between(t_wi0, t_wi1) / 1e6) <<
" ms");
// Same write + host-to-device sync for the strip input.
207 bo_str_in.write(stripInput->data(), stripInput->size() *
sizeof(
uint64_t), 0);
208 bo_str_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
211 ATH_MSG_DEBUG(
"Strip input buffer write time: " << (ns_between(t_wi2, t_wi3) / 1e6) <<
" ms");
// Pixel clustering: kernel selected for this slot; argument 0 is the input BO.
217 auto& k_pix_cl = m_pixelClusteringKernels[pixelClusterIndex];
219 r_pix_cl.set_arg(0, bo_pix_in);
// NOTE(review): argument 1 is set twice below, once to the EDM BO and once to
// the dereferenced cluster-output BO. Presumably these are alternative
// branches selected by m_doF110 (bo_pix_cl_out is null when m_doF110 is true);
// the branch statements are not visible in this excerpt — confirm.
221 r_pix_cl.set_arg(1, bo_pix_cl_edm);
223 r_pix_cl.set_arg(1, *bo_pix_cl_out);
224 r_pix_cl.set_arg(2, bo_pix_cl_edm);
// Round the pixel input size up to the next multiple of 256.
227 int rounded =
static_cast<int>(std::ceil(
static_cast<double>(*pixelInputSize) / 256.0)) * 256;
// hit_bytes / cluster_bytes / edm_bytes are computed on lines not visible here.
231 r_pix_cl.set_arg(3, hit_bytes);
232 r_pix_cl.set_arg(4, cluster_bytes);
233 r_pix_cl.set_arg(5, edm_bytes);
// Strip clustering: input BO, cluster output BO, EDM output BO, and the input
// size passed as an unsigned int.
238 auto& k_str_cl = m_stripClusteringKernels[stripClusterIndex];
240 r_str_cl.set_arg(0, bo_str_in);
241 r_str_cl.set_arg(1, bo_str_cl);
242 r_str_cl.set_arg(2, bo_str_cl_edm);
243 r_str_cl.set_arg(3,
static_cast<unsigned int>(*stripInputSize));
// Per-stage timings, printed in milliseconds.
250 ATH_MSG_DEBUG(
"Pixel clustering time: " << (ns_between(t_pc_start, t_pc_done) / 1e6) <<
" ms");
255 ATH_MSG_DEBUG(
"Strip clustering time: " << (ns_between(t_sc_start, t_sc_done) / 1e6) <<
" ms");
// Pixel L2G completion time starts out equal to the pixel clustering
// completion time; presumably it is overwritten when the pixel L2G stage
// actually runs (that assignment is not visible in this excerpt).
258 std::chrono::steady_clock::time_point t_pl2g_done = t_pc_done;
// Pixel L2G run arguments. Three of them dereference pointers that are null
// when m_doF110 is true, so this section is presumably only reached when
// m_doF110 is false — confirm against the surrounding condition.
260 auto& k_pix_l2g = m_pixelL2GKernels[pixelL2GIndex];
262 r_pix_l2g.set_arg(0, *bo_pix_cl_out);
263 r_pix_l2g.set_arg(1, bo_pix_cl_edm);
264 r_pix_l2g.set_arg(2, *bo_pix_l2g_out);
265 r_pix_l2g.set_arg(3, *bo_pix_l2g_edm);
271 ATH_MSG_DEBUG(
"Pixel L2G time: " << (ns_between(t_pl2g_start, t_pl2g_done) / 1e6) <<
" ms");
// Strip L2G run arguments: the two strip clustering BOs followed by the two
// strip L2G BOs.
275 auto& k_str_l2g = m_stripL2GKernels[stripL2GIndex];
277 r_str_l2g.set_arg(0, bo_str_cl);
278 r_str_l2g.set_arg(1, bo_str_cl_edm);
279 r_str_l2g.set_arg(2, bo_str_l2g_out);
280 r_str_l2g.set_arg(3, bo_str_l2g_edm);
286 ATH_MSG_DEBUG(
"Strip L2G time: " << (ns_between(t_sl2g_start, t_sl2g_done) / 1e6) <<
" ms");
// EDM preparation kernels for this slot.
// NOTE(review): pixelEDMIndex / stripEDMIndex are 0 when the respective kernel
// vector is empty, in which case operator[] here would index an empty vector —
// confirm that both vectors are non-empty at this point.
289 auto& k_pedm = m_pixelEdmPrepKernels[pixelEDMIndex];
290 auto& k_sedm = m_stripEdmPrepKernels[stripEDMIndex];
// Pixel EDM prep: argument 0 is the pixel *clustering* EDM BO, argument 1 the
// final pixel EDM BO.
293 r_pedm.set_arg(0, bo_pix_cl_edm);
294 r_pedm.set_arg(1, bo_pix_edm_cont);
// Strip EDM prep: argument 0 is the strip *L2G* EDM BO, argument 1 the final
// strip EDM BO. NOTE(review): the pixel side takes the clustering EDM BO
// rather than an L2G one — presumably intentional, verify.
297 r_sedm.set_arg(0, bo_str_l2g_edm);
298 r_sedm.set_arg(1, bo_str_edm_cont);
313 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << (ns_between(t_pedm_start, t_pedm_done) / 1e6) <<
" ms");
318 ATH_MSG_DEBUG(
"StripEDMPrep time: " << (ns_between(t_sedm_start, t_sedm_done) / 1e6) <<
" ms");
// Kernel execution is considered finished at the later of the two EDM prep
// completion times.
321 const auto t_kend =
std::max(t_pedm_done, t_sedm_done);
323 ATH_MSG_DEBUG(
"Kernel execution time: " << (ns_between(t_k0, t_kend) / 1e6) <<
" ms");
// Sync the final pixel EDM BO from device to host, then copy
// FPGAPixelOutput->size() 64-bit words out of it (from offset 0) into the
// output vector. The vector must already have its final size here.
333 bo_pix_edm_cont.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
334 bo_pix_edm_cont.read(FPGAPixelOutput->data(), FPGAPixelOutput->size() *
sizeof(
uint64_t), 0);
337 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << (ns_between(t_ro0, t_ro1) / 1e6) <<
" ms");
// Same device-to-host sync and read-back for the strip output.
340 bo_str_edm_cont.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
341 bo_str_edm_cont.read(FPGAStripOutput->data(), FPGAStripOutput->size() *
sizeof(
uint64_t), 0);
344 ATH_MSG_DEBUG(
"Strip output buffer read time: " << (ns_between(t_ro2, t_ro3) / 1e6) <<
" ms");
// When the input size is exactly 6 the first output word is forced to 0.
// NOTE(review): the meaning of 6 is not visible here (presumably an input that
// carries no hits — TODO confirm and name the constant). The [0] access also
// assumes the output vector is non-empty.
347 if(*pixelInputSize == 6) (*FPGAPixelOutput)[0] = 0;
348 if(*stripInputSize == 6) (*FPGAStripOutput)[0] = 0;
350 return StatusCode::SUCCESS;
374 return StatusCode::SUCCESS;
// Open the xclbin given by the m_xclbin property and log its identification.
379 xrt::xclbin xrt_xclbin(
m_xclbin.value());
381 ATH_MSG_INFO(
"xsa name: " << xrt_xclbin.get_xsa_name());
382 ATH_MSG_INFO(
"fpga name: " << xrt_xclbin.get_fpga_device_name());
383 ATH_MSG_INFO(
"uuid: " << xrt_xclbin.get_uuid().to_string());
// Walk over every kernel described in the xclbin.
385 for (
const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
386 const std::string& kernelName = kernel.get_name();
// `computeUnit` comes from an inner loop whose header is not visible here.
389 const std::string& computeUnitName = computeUnit.get_name();
// Drop the kernel name plus one following character from the CU name.
// Assumes the CU name has the form "<kernel><sep><instance>" — TODO confirm;
// substr throws std::out_of_range if the CU name is shorter than that.
390 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
// Re-assemble as "<kernel>:{<instance>}" and collect it in cuNames.
391 const std::string computeUnitUsableName = kernelName +
":{" + computeUnitIsolatedName +
"}";
393 cuNames.push_back(computeUnitUsableName);