// NOTE(review): this excerpt is sparse; the integer at the start of each
// statement appears to be the original source line number, and the lines in
// between are not visible here.
// Emit an INFO message saying this path runs on the FPGA accelerator through XRT native.
30 ATH_MSG_INFO(
"Running on the FPGA accelerator (XRT native)");
// Compute-unit names to iterate over; how the list is filled is not visible in this excerpt.
56 std::vector<std::string> listofCUs;
// Visit every compute-unit name in turn.
60 for (
const auto& cuName : listofCUs) {
76 }
// A std::exception caught here is reported together with the offending
// compute-unit name and converted into StatusCode::FAILURE.
catch (
const std::exception& e) {
77 ATH_MSG_ERROR(
"Failed to create kernel for CU '" << cuName <<
"': " << e.what());
78 return StatusCode::FAILURE;
// Helper: returns a pointer to the first kernel of the given vector, or
// nullptr when the vector is empty.
95 auto choose = [](
const std::vector<xrt::kernel>& ks) ->
const xrt::kernel* {
96 return ks.empty() ? nullptr : &ks.front();
// One kernel pointer per kernel vector (pixel/strip clustering, L2G, EDM prep);
// a pointer is nullptr when its vector holds no kernels.
99 const xrt::kernel* kPC = choose(m_pixelClusteringKernels);
100 const xrt::kernel* kSC = choose(m_stripClusteringKernels);
101 const xrt::kernel* kPL2G = choose(m_pixelL2GKernels);
102 const xrt::kernel* kSL2G = choose(m_stripL2GKernels);
103 const xrt::kernel* kPEDM = choose(m_pixelEdmPrepKernels);
104 const xrt::kernel* kSEDM = choose(m_stripEdmPrepKernels);
// Helper: returns k->group_id(arg_index) for a non-null kernel and 0 for a
// null one.
// NOTE(review): presumably the result selects the memory bank when buffers
// are allocated; that use is not visible here.
106 auto gid = [](
const xrt::kernel* k,
unsigned arg_index)->
unsigned {
107 return k ? k->group_id(arg_index) : 0;
// Iterate once per index in [0, nthreads); the loop body is not visible in this excerpt.
110 for (
unsigned i = 0; i < nthreads; ++i) {
// Report success to the caller.
139 return StatusCode::SUCCESS;
// Pointers to the pixel and strip input words, initialised to nullptr.
// NOTE(review): they are dereferenced later, so they must be assigned by
// code that is not visible in this excerpt.
148 const std::vector<uint64_t>* pixelInput{
nullptr};
149 const std::vector<uint64_t>* stripInput{
nullptr};
// Pointers to the pixel and strip input sizes, likewise initialised to nullptr.
153 const int* pixelInputSize{
nullptr}, *stripInputSize{
nullptr};
// Select the buffer set from the event slot, wrapped by nthreads.
160 const size_t bufferIndex = ctx.slot() % nthreads;
// Select one kernel per stage by wrapping the slot number over the number of
// kernels available for that stage.
// NOTE(review): the next three modulo operations are not guarded against an
// empty vector (a modulo by zero), unlike the three that follow them —
// confirm that the set-up code guarantees these vectors are non-empty.
163 const size_t pixelClusterIndex = ctx.slot() % m_pixelClusteringKernels.size();
164 const size_t stripClusterIndex = ctx.slot() % m_stripClusteringKernels.size();
165 const size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
// These three fall back to index 0 when the corresponding vector is empty.
166 const size_t pixelL2GIndex = m_pixelL2GKernels.empty() ? 0 : (ctx.slot() % m_pixelL2GKernels.size());
167 const size_t pixelEDMIndex = m_pixelEdmPrepKernels.empty() ? 0 : (ctx.slot() % m_pixelEdmPrepKernels.size());
168 const size_t stripEDMIndex = m_stripEdmPrepKernels.empty() ? 0 : (ctx.slot() % m_stripEdmPrepKernels.size());
// Tail of a streamed log statement (its opening is not visible here) that
// prints the buffer index and every kernel index chosen above.
171 <<
" running on buffer " << bufferIndex
172 <<
" pixelClusterIndex: " << pixelClusterIndex
173 <<
" stripClusterIndex: " << stripClusterIndex
174 <<
" stripL2GIndex: " << stripL2GIndex
175 <<
" pixelL2GIndex: " << pixelL2GIndex
176 <<
" pixelEDMIndex: " << pixelEDMIndex
177 <<
" stripEDMIndex: " << stripEDMIndex);
// Bind local handles to the entries of each buffer-object list at bufferIndex.
// Input buffers for the pixel and strip clustering stages.
180 auto& bo_pix_in = m_pixelClusterInputBOList[bufferIndex];
181 auto& bo_str_in = m_stripClusterInputBOList[bufferIndex];
// Pixel clustering output: a null pointer when m_doF110 is set, so any
// dereference must be limited to the case where m_doF110 is false.
183 xrt::bo* bo_pix_cl_out = (!
m_doF110) ? &m_pixelClusterOutputBOList[bufferIndex] :
nullptr;
184 auto& bo_pix_cl_edm = m_pixelClusterEDMOutputBOList[bufferIndex];
// Strip clustering outputs.
186 auto& bo_str_cl = m_stripClusterOutputBOList[bufferIndex];
187 auto& bo_str_cl_edm = m_stripClusterEDMOutputBOList[bufferIndex];
// Pixel L2G outputs: both are null pointers when m_doF110 is set.
189 xrt::bo* bo_pix_l2g_out = (!
m_doF110) ? &m_pixelL2GOutputBOList[bufferIndex] :
nullptr;
190 xrt::bo* bo_pix_l2g_edm = (!
m_doF110) ? &m_pixelL2GEDMOutputBOList[bufferIndex] :
nullptr;
// Strip L2G outputs.
192 auto& bo_str_l2g_out = m_stripL2GOutputBOList[bufferIndex];
193 auto& bo_str_l2g_edm = m_stripL2GEDMOutputBOList[bufferIndex];
// Output buffers of the pixel and strip EDM stage, read back at the end.
195 auto& bo_pix_edm_cont = m_edmPixelOutputBOList[bufferIndex];
196 auto& bo_str_edm_cont = m_edmStripOutputBOList[bufferIndex];
// Pixel input transfer: write all pixel input words into the buffer object
// at offset 0, then sync it to the device. t_wi0 and t_wi1 bracket the
// transfer.
199 const auto t_wi0 = std::chrono::steady_clock::now();
200 bo_pix_in.write(pixelInput->data(), pixelInput->size() *
sizeof(uint64_t), 0);
201 bo_pix_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
202 const auto t_wi1 = std::chrono::steady_clock::now();
// Strip input transfer, done the same way and bracketed by t_wi2 and t_wi3.
206 const auto t_wi2 = std::chrono::steady_clock::now();
207 bo_str_in.write(stripInput->data(), stripInput->size() *
sizeof(uint64_t), 0);
208 bo_str_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
209 const auto t_wi3 = std::chrono::steady_clock::now();
// Timestamp taken before any kernel run is configured.
214 const auto t_k0 = std::chrono::steady_clock::now();
// Pixel clustering: build a run on the kernel chosen for this slot and bind
// the input buffer as argument 0.
217 auto& k_pix_cl = m_pixelClusteringKernels[pixelClusterIndex];
218 xrt::run r_pix_cl{k_pix_cl};
219 r_pix_cl.set_arg(0, bo_pix_in);
// NOTE(review): argument 1 is set twice below. Presumably the two forms
// belong to alternate branches of a conditional on m_doF110 whose lines are
// not visible; the form that dereferences bo_pix_cl_out is only valid when
// that pointer is non-null — confirm.
221 r_pix_cl.set_arg(1, bo_pix_cl_edm);
223 r_pix_cl.set_arg(1, *bo_pix_cl_out);
224 r_pix_cl.set_arg(2, bo_pix_cl_edm);
// Round the pixel input size up to the next multiple of 256.
227 int rounded =
static_cast<int>(std::ceil(
static_cast<double>(*pixelInputSize) / 256.0)) * 256;
// Byte counts derived from the rounded size: hits and clusters use
// sizeof(uint64_t) bytes per entry, and the EDM count is eight times that.
228 uint32_t hit_bytes =
static_cast<uint32_t
>(
sizeof(uint64_t) * rounded);
229 uint32_t cluster_bytes =
static_cast<uint32_t
>(
sizeof(uint64_t) * rounded);
230 uint32_t edm_bytes =
static_cast<uint32_t
>(
sizeof(uint64_t) * rounded * 8);
231 r_pix_cl.set_arg(3, hit_bytes);
232 r_pix_cl.set_arg(4, cluster_bytes);
233 r_pix_cl.set_arg(5, edm_bytes);
// Timestamp named for the start of pixel clustering; the call that launches
// the run is not visible in this excerpt.
235 const auto t_pc_start = std::chrono::steady_clock::now();
// Strip clustering: arguments are the input buffer, the cluster output, the
// EDM output, and the strip input size cast to unsigned int.
238 auto& k_str_cl = m_stripClusteringKernels[stripClusterIndex];
239 xrt::run r_str_cl{k_str_cl};
240 r_str_cl.set_arg(0, bo_str_in);
241 r_str_cl.set_arg(1, bo_str_cl);
242 r_str_cl.set_arg(2, bo_str_cl_edm);
243 r_str_cl.set_arg(3,
static_cast<unsigned int>(*stripInputSize));
244 const auto t_sc_start = std::chrono::steady_clock::now();
// Timestamps named for the completion of pixel and strip clustering; the
// corresponding wait calls are not visible here.
248 const auto t_pc_done = std::chrono::steady_clock::now();
253 const auto t_sc_done = std::chrono::steady_clock::now();
// Start with the pixel L2G completion time equal to the pixel clustering
// completion time; it is overwritten further down with a fresh timestamp.
258 std::chrono::steady_clock::time_point t_pl2g_done = t_pc_done;
// Pixel L2G: takes the pixel clustering outputs and writes the L2G outputs.
// NOTE(review): bo_pix_cl_out, bo_pix_l2g_out and bo_pix_l2g_edm are null
// when m_doF110 is set, so this section presumably sits inside a guard that
// is not visible in this excerpt — confirm.
260 auto& k_pix_l2g = m_pixelL2GKernels[pixelL2GIndex];
261 xrt::run r_pix_l2g{k_pix_l2g};
262 r_pix_l2g.set_arg(0, *bo_pix_cl_out);
263 r_pix_l2g.set_arg(1, bo_pix_cl_edm);
264 r_pix_l2g.set_arg(2, *bo_pix_l2g_out);
265 r_pix_l2g.set_arg(3, *bo_pix_l2g_edm);
266 const auto t_pl2g_start = std::chrono::steady_clock::now();
269 t_pl2g_done = std::chrono::steady_clock::now();
// Strip L2G: takes the strip clustering outputs and writes the strip L2G
// outputs. The launch and wait calls are not visible here.
275 auto& k_str_l2g = m_stripL2GKernels[stripL2GIndex];
276 xrt::run r_str_l2g{k_str_l2g};
277 r_str_l2g.set_arg(0, bo_str_cl);
278 r_str_l2g.set_arg(1, bo_str_cl_edm);
279 r_str_l2g.set_arg(2, bo_str_l2g_out);
280 r_str_l2g.set_arg(3, bo_str_l2g_edm);
281 const auto t_sl2g_start = std::chrono::steady_clock::now();
284 const auto t_sl2g_done = std::chrono::steady_clock::now();
// EDM preparation: pick the pixel and strip kernels chosen for this slot.
289 auto& k_pedm = m_pixelEdmPrepKernels[pixelEDMIndex];
290 auto& k_sedm = m_stripEdmPrepKernels[stripEDMIndex];
// Pixel EDM run: reads the pixel clustering EDM buffer and writes the pixel
// EDM output buffer.
292 xrt::run r_pedm{k_pedm};
293 r_pedm.set_arg(0, bo_pix_cl_edm);
294 r_pedm.set_arg(1, bo_pix_edm_cont);
// Strip EDM run: reads the strip L2G EDM buffer and writes the strip EDM
// output buffer.
296 xrt::run r_sedm{k_sedm};
297 r_sedm.set_arg(0, bo_str_l2g_edm);
298 r_sedm.set_arg(1, bo_str_edm_cont);
// Start and done timestamps for both EDM runs; the launch and wait calls are
// not visible in this excerpt.
303 const auto t_pedm_start = std::chrono::steady_clock::now();
306 const auto t_sedm_start = std::chrono::steady_clock::now();
311 const auto t_pedm_done = std::chrono::steady_clock::now();
316 const auto t_sedm_done = std::chrono::steady_clock::now();
// The later of the two EDM completion timestamps.
321 const auto t_kend = std::max(t_pedm_done, t_sedm_done);
// Pixel readback: sync the pixel EDM output buffer from the device, then copy
// FPGAPixelOutput->size() 64-bit words from offset 0 into FPGAPixelOutput.
// t_ro0 and t_ro1 bracket the readback.
332 const auto t_ro0 = std::chrono::steady_clock::now();
333 bo_pix_edm_cont.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
334 bo_pix_edm_cont.read(FPGAPixelOutput->data(), FPGAPixelOutput->size() *
sizeof(uint64_t), 0);
335 const auto t_ro1 = std::chrono::steady_clock::now();
// Strip readback, done the same way and bracketed by t_ro2 and t_ro3.
339 const auto t_ro2 = std::chrono::steady_clock::now();
340 bo_str_edm_cont.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
341 bo_str_edm_cont.read(FPGAStripOutput->data(), FPGAStripOutput->size() *
sizeof(uint64_t), 0);
342 const auto t_ro3 = std::chrono::steady_clock::now();
// When an input size equals 6, the first word of the matching output is
// forced to 0.
// NOTE(review): the meaning of the literal 6 is not evident from this
// excerpt — TODO confirm and replace it with a named constant.
347 if(*pixelInputSize == 6) (*FPGAPixelOutput)[0] = 0;
348 if(*stripInputSize == 6) (*FPGAStripOutput)[0] = 0;
// Report success to the caller.
350 return StatusCode::SUCCESS;