9 #include <xrt/xrt_bo.h>
10 #include <xrt/xrt_device.h>
11 #include <xrt/xrt_kernel.h>
12 #include <xrt/xrt_uuid.h>
/// Elapsed time from @p a to @p b, expressed in whole nanoseconds.
///
/// @param a  Start of the interval.
/// @param b  End of the interval.
/// @return   The duration (b - a) converted to nanoseconds. If @p b is earlier
///           than @p a the result is clamped to 0 rather than being allowed to
///           wrap around when the signed count is converted to uint64_t.
static inline uint64_t ns_between(const std::chrono::steady_clock::time_point& a,
                                  const std::chrono::steady_clock::time_point& b)
{
  const auto elapsed =
      std::chrono::duration_cast<std::chrono::nanoseconds>(b - a).count();
  // count() is signed: a reversed interval would otherwise turn into a value
  // close to 2^64 and show up as a nonsensical timing in the logs.
  return elapsed > 0 ? static_cast<uint64_t>(elapsed) : 0;
}
30 ATH_MSG_INFO(
"Running on the FPGA accelerator (XRT native)");
54 std::vector<std::string> listofCUs;
58 for (
const auto& cuName : listofCUs) {
75 ATH_MSG_ERROR(
"Failed to create kernel for CU '" << cuName <<
"': " <<
e.what());
76 return StatusCode::FAILURE;
93 auto choose = [](
const std::vector<xrt::kernel>& ks) ->
const xrt::kernel* {
94 return ks.empty() ? nullptr : &ks.front();
97 const xrt::kernel* kPC = choose(m_pixelClusteringKernels);
98 const xrt::kernel* kSC = choose(m_stripClusteringKernels);
99 const xrt::kernel* kPL2G = choose(m_pixelL2GKernels);
100 const xrt::kernel* kSL2G = choose(m_stripL2GKernels);
101 const xrt::kernel* kPEDM = choose(m_pixelEdmPrepKernels);
102 const xrt::kernel* kSEDM = choose(m_stripEdmPrepKernels);
104 auto gid = [](
const xrt::kernel*
k,
unsigned arg_index)->
unsigned {
105 return k ?
k->group_id(arg_index) : 0;
108 for (
unsigned i = 0;
i < nthreads; ++
i) {
137 return StatusCode::SUCCESS;
146 const std::vector<uint64_t>* pixelInput{
nullptr};
147 const std::vector<uint64_t>* stripInput{
nullptr};
// Derive the per-event indices from the event slot: each index is the slot
// number taken modulo the size of the corresponding pool.
154 const size_t bufferIndex = ctx.slot() % nthreads;
// NOTE(review): the next three modulo operations have no empty-container
// guard, unlike the three after them; a size() of zero here would be a
// modulo by zero. Presumably initialization guarantees these kernel lists
// are non-empty — confirm.
157 const size_t pixelClusterIndex = ctx.slot() % m_pixelClusteringKernels.size();
158 const size_t stripClusterIndex = ctx.slot() % m_stripClusteringKernels.size();
159 const size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
// These kernel lists are allowed to be empty; index 0 is used as a
// placeholder in that case so no modulo by zero is evaluated.
160 const size_t pixelL2GIndex = m_pixelL2GKernels.empty() ? 0 : (ctx.slot() % m_pixelL2GKernels.size());
161 const size_t pixelEDMIndex = m_pixelEdmPrepKernels.empty() ? 0 : (ctx.slot() % m_pixelEdmPrepKernels.size());
162 const size_t stripEDMIndex = m_stripEdmPrepKernels.empty() ? 0 : (ctx.slot() % m_stripEdmPrepKernels.size());
165 <<
" running on buffer " << bufferIndex
166 <<
" pixelClusterIndex: " << pixelClusterIndex
167 <<
" stripClusterIndex: " << stripClusterIndex
168 <<
" stripL2GIndex: " << stripL2GIndex
169 <<
" pixelL2GIndex: " << pixelL2GIndex
170 <<
" pixelEDMIndex: " << pixelEDMIndex
171 <<
" stripEDMIndex: " << stripEDMIndex);
174 auto& bo_pix_in = m_pixelClusterInputBOList[bufferIndex];
175 auto& bo_str_in = m_stripClusterInputBOList[bufferIndex];
177 xrt::bo* bo_pix_cl_out = (!
m_doF110) ? &m_pixelClusterOutputBOList[bufferIndex] :
nullptr;
178 auto& bo_pix_cl_edm = m_pixelClusterEDMOutputBOList[bufferIndex];
180 auto& bo_str_cl = m_stripClusterOutputBOList[bufferIndex];
181 auto& bo_str_cl_edm = m_stripClusterEDMOutputBOList[bufferIndex];
183 xrt::bo* bo_pix_l2g_out = (!
m_doF110) ? &m_pixelL2GOutputBOList[bufferIndex] :
nullptr;
184 xrt::bo* bo_pix_l2g_edm = (!
m_doF110) ? &m_pixelL2GEDMOutputBOList[bufferIndex] :
nullptr;
186 auto& bo_str_l2g_out = m_stripL2GOutputBOList[bufferIndex];
187 auto& bo_str_l2g_edm = m_stripL2GEDMOutputBOList[bufferIndex];
189 auto& bo_pix_edm_cont = m_edmPixelOutputBOList[bufferIndex];
190 auto& bo_str_edm_cont = m_edmStripOutputBOList[bufferIndex];
194 bo_pix_in.write(pixelInput->data(), pixelInput->size() *
sizeof(
uint64_t), 0);
195 bo_pix_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
198 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << (ns_between(t_wi0, t_wi1) / 1e6) <<
" ms");
201 bo_str_in.write(stripInput->data(), stripInput->size() *
sizeof(
uint64_t), 0);
202 bo_str_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
205 ATH_MSG_DEBUG(
"Strip input buffer write time: " << (ns_between(t_wi2, t_wi3) / 1e6) <<
" ms");
211 auto& k_pix_cl = m_pixelClusteringKernels[pixelClusterIndex];
213 r_pix_cl.set_arg(0, bo_pix_in);
215 r_pix_cl.set_arg(1, bo_pix_cl_edm);
217 r_pix_cl.set_arg(1, *bo_pix_cl_out);
218 r_pix_cl.set_arg(2, bo_pix_cl_edm);
// Round the number of pixel input elements up to the next multiple of 256:
// ceil(size / 256) is computed in double precision, narrowed to int, and then
// multiplied by 256.
221 int rounded =
static_cast<int>(std::ceil(
static_cast<double>(pixelInput->size()) / 256.0)) * 256;
225 r_pix_cl.set_arg(3, hit_bytes);
226 r_pix_cl.set_arg(4, cluster_bytes);
227 r_pix_cl.set_arg(5, edm_bytes);
232 auto& k_str_cl = m_stripClusteringKernels[stripClusterIndex];
234 r_str_cl.set_arg(0, bo_str_in);
235 r_str_cl.set_arg(1, bo_str_cl);
236 r_str_cl.set_arg(2, bo_str_cl_edm);
237 r_str_cl.set_arg(3,
static_cast<unsigned int>(stripInput->size()));
244 ATH_MSG_DEBUG(
"Pixel clustering time: " << (ns_between(t_pc_start, t_pc_done) / 1e6) <<
" ms");
249 ATH_MSG_DEBUG(
"Strip clustering time: " << (ns_between(t_sc_start, t_sc_done) / 1e6) <<
" ms");
252 std::chrono::steady_clock::time_point t_pl2g_done = t_pc_done;
254 auto& k_pix_l2g = m_pixelL2GKernels[pixelL2GIndex];
256 r_pix_l2g.set_arg(0, *bo_pix_cl_out);
257 r_pix_l2g.set_arg(1, bo_pix_cl_edm);
258 r_pix_l2g.set_arg(2, *bo_pix_l2g_out);
259 r_pix_l2g.set_arg(3, *bo_pix_l2g_edm);
265 ATH_MSG_DEBUG(
"Pixel L2G time: " << (ns_between(t_pl2g_start, t_pl2g_done) / 1e6) <<
" ms");
269 auto& k_str_l2g = m_stripL2GKernels[stripL2GIndex];
271 r_str_l2g.set_arg(0, bo_str_cl);
272 r_str_l2g.set_arg(1, bo_str_cl_edm);
273 r_str_l2g.set_arg(2, bo_str_l2g_out);
274 r_str_l2g.set_arg(3, bo_str_l2g_edm);
280 ATH_MSG_DEBUG(
"Strip L2G time: " << (ns_between(t_sl2g_start, t_sl2g_done) / 1e6) <<
" ms");
283 auto& k_pedm = m_pixelEdmPrepKernels[pixelEDMIndex];
284 auto& k_sedm = m_stripEdmPrepKernels[stripEDMIndex];
287 r_pedm.set_arg(0, bo_pix_cl_edm);
288 r_pedm.set_arg(1, bo_pix_edm_cont);
291 r_sedm.set_arg(0, bo_str_l2g_edm);
292 r_sedm.set_arg(1, bo_str_edm_cont);
307 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << (ns_between(t_pedm_start, t_pedm_done) / 1e6) <<
" ms");
312 ATH_MSG_DEBUG(
"StripEDMPrep time: " << (ns_between(t_sedm_start, t_sedm_done) / 1e6) <<
" ms");
315 const auto t_kend =
std::max(t_pedm_done, t_sedm_done);
317 ATH_MSG_DEBUG(
"Kernel execution time: " << (ns_between(t_k0, t_kend) / 1e6) <<
" ms");
327 bo_pix_edm_cont.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
328 bo_pix_edm_cont.read(FPGAPixelOutput->data(), FPGAPixelOutput->size() *
sizeof(
uint64_t), 0);
331 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << (ns_between(t_ro0, t_ro1) / 1e6) <<
" ms");
334 bo_str_edm_cont.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
335 bo_str_edm_cont.read(FPGAStripOutput->data(), FPGAStripOutput->size() *
sizeof(
uint64_t), 0);
338 ATH_MSG_DEBUG(
"Strip output buffer read time: " << (ns_between(t_ro2, t_ro3) / 1e6) <<
" ms");
// When an input holds exactly 6 elements, overwrite the first element of the
// matching output with 0.
// NOTE(review): the meaning of the constant 6 is not visible here —
// presumably it identifies an input that carries no hits; confirm and
// consider replacing it with a named constant.
341 if(pixelInput->size() == 6) (*FPGAPixelOutput)[0] = 0;
342 if(stripInput->size() == 6) (*FPGAStripOutput)[0] = 0;
344 return StatusCode::SUCCESS;
368 return StatusCode::SUCCESS;
375 ATH_MSG_INFO(
"xsa name: " << xrt_xclbin.get_xsa_name());
376 ATH_MSG_INFO(
"fpga name: " << xrt_xclbin.get_fpga_device_name());
377 ATH_MSG_INFO(
"uuid: " << xrt_xclbin.get_uuid().to_string());
// Visit every kernel reported by the xclbin and record its compute units
// under a "<kernel>:{<compute unit>}" name.
379 for (
const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
380 const std::string& kernelName = kernel.get_name();
383 const std::string& computeUnitName = computeUnit.get_name();
// Drop the first kernelName.size() + 1 characters of the compute-unit name
// (presumably the "<kernel>:" prefix — confirm against the XRT naming).
// NOTE(review): substr throws std::out_of_range if that offset is larger
// than the compute-unit name's length.
384 const std::string computeUnitIsolatedName = computeUnitName.substr(kernelName.size() + 1);
// Re-assemble the name as "<kernel>:{<isolated name>}".
385 const std::string computeUnitUsableName = kernelName +
":{" + computeUnitIsolatedName +
"}";
387 cuNames.push_back(computeUnitUsableName);