Should be overridden by derived classes to perform meaningful work.
139 {
142
146
147 const int* pixelInputSize{nullptr}, *stripInputSize{nullptr};
150
151
153
156 }
157
158 size_t bufferIndex = ctx.slot() % nthreads;
159
160
161 size_t pixelClusterIndex = ctx.slot() % m_pixelClusterKernels.size();
162 size_t stripClusterIndex = ctx.slot() % m_stripClusterKernels.size();
163 size_t stripL2GIndex = ctx.slot() % m_stripL2GKernels.size();
164 size_t pixelEDMIndex = ctx.slot() % m_pixelEDMKernels.size();
165 size_t stripEDMIndex = ctx.slot() % m_stripEDMKernels.size();
166
167
168
170
171
172
173
174
175
176 cl::Buffer pixelClusterInputBuffer = m_pixelClusterInputBufferList[bufferIndex];
177 cl::Buffer stripClusterInputBuffer = m_stripClusterInputBufferList[bufferIndex];
178 cl::Buffer stripClusterOutputBuffer = m_stripClusterOutputBufferList[bufferIndex];
179 cl::Buffer pixelClusterEDMOutputBuffer = m_pixelClusterEDMOutputBufferList[bufferIndex];
180 cl::Buffer stripClusterEDMOutputBuffer = m_stripClusterEDMOutputBufferList[bufferIndex];
181 cl::Buffer stripL2GOutputBuffer = m_stripL2GOutputBufferList[bufferIndex];
182 cl::Buffer stripL2GEDMOutputBuffer = m_stripL2GEDMOutputBufferList[bufferIndex];
183 cl::Buffer edmPixelOutputBuffer = m_edmPixelOutputBufferList[bufferIndex];
184 cl::Buffer edmStripOutputBuffer = m_edmStripOutputBufferList[bufferIndex];
185
186
187
188 cl::Kernel &pixelClusteringKernel = m_pixelClusterKernels[pixelClusterIndex];
189 cl::Kernel &pixelEdmPrepKernel = m_pixelEDMKernels[pixelEDMIndex];
190
191 cl::Kernel &stripClusteringKernel = m_stripClusterKernels[stripClusterIndex];
192 cl::Kernel &stripL2GKernel = m_stripL2GKernels[stripL2GIndex];
193 cl::Kernel &stripEdmPrepKernel = m_stripEDMKernels[stripEDMIndex];
194
195
196
197 pixelClusteringKernel.setArg<cl::Buffer>(0, pixelClusterInputBuffer);
198 pixelClusteringKernel.setArg<cl::Buffer>(1, pixelClusterEDMOutputBuffer);
199
200 stripClusteringKernel.setArg<cl::Buffer>(0, stripClusterInputBuffer);
201 stripClusteringKernel.setArg<cl::Buffer>(1, stripClusterOutputBuffer);
202 stripClusteringKernel.setArg<cl::Buffer>(2, stripClusterEDMOutputBuffer);
203 stripClusteringKernel.setArg<unsigned int>(3, *stripInputSize);
204
205 stripL2GKernel.setArg<cl::Buffer>(0, stripClusterOutputBuffer);
206 stripL2GKernel.setArg<cl::Buffer>(1, stripClusterEDMOutputBuffer);
207 stripL2GKernel.setArg<cl::Buffer>(2, stripL2GOutputBuffer);
208 stripL2GKernel.setArg<cl::Buffer>(3, stripL2GEDMOutputBuffer);
209
210 pixelEdmPrepKernel.setArg<cl::Buffer>(0, pixelClusterEDMOutputBuffer);
211 pixelEdmPrepKernel.setArg<cl::Buffer>(1, edmPixelOutputBuffer);
212 stripEdmPrepKernel.setArg<cl::Buffer>(0, stripL2GEDMOutputBuffer);
213 stripEdmPrepKernel.setArg<cl::Buffer>(1, edmStripOutputBuffer);
214
215
216
217
218 std::vector<cl::Event> writePixelInputDeps =
getDepVector(m_pixelClusterEndEvents, pixelClusterIndex);
219 std::vector<cl::Event> writeStripInputDeps =
getDepVector(m_stripClusterEndEvents, stripClusterIndex);
220
221 cl::Event writePixelInputEvt;
222 cl::Event writeStripInputEvt;
223 m_acc_queue.enqueueWriteBuffer(pixelClusterInputBuffer, CL_FALSE, 0,
sizeof(uint64_t) * (*pixelInput).size(), (*pixelInput).data(), &writePixelInputDeps, &writePixelInputEvt);
224 m_acc_queue.enqueueWriteBuffer(stripClusterInputBuffer, CL_FALSE, 0,
sizeof(uint64_t) * (*stripInput).size(), (*stripInput).data(), &writeStripInputDeps, &writeStripInputEvt);
225
226 std::vector<cl::Event> pixelClusteringDeps = { writePixelInputEvt };
227 std::vector<cl::Event> stripClusteringDeps = { writeStripInputEvt };
228
229 cl::Event pixelClusteringEvt;
230 cl::Event stripClusteringEvt;
231 cl::Event pixelL2GEvt;
232 cl::Event stripL2GEvt;
233 cl::Event edmPrepEvt;
234 cl::Event pixelEdmPrepEvt;
235 cl::Event stripEdmPrepEvt;
236
237 {
238 Athena::Chrono chrono(
"Kernel execution",
m_chronoSvc.get());
239
240
241 m_acc_queue.enqueueTask(pixelClusteringKernel, &pixelClusteringDeps, &pixelClusteringEvt);
242 m_acc_queue.enqueueTask(stripClusteringKernel, &stripClusteringDeps, &stripClusteringEvt);
243
244
245 m_pixelClusterEndEvents[pixelClusterIndex] = pixelClusteringEvt;
246 m_stripClusterEndEvents[stripClusterIndex] = stripClusteringEvt;
247
248 std::vector<cl::Event> stripL2GDeps =
getDepVector(m_stripL2GEndEvents, stripClusterIndex);
249 stripL2GDeps.push_back(stripClusteringEvt);
250
251 m_acc_queue.enqueueTask(stripL2GKernel, &stripL2GDeps, &stripL2GEvt);
252
253 m_stripL2GEndEvents[stripClusterIndex] = stripL2GEvt;
254
255
256 std::vector<cl::Event> pixelEdmPrepDeps =
getDepVector(m_pixelEDMEndEvents, pixelClusterIndex);
257 pixelEdmPrepDeps.push_back(pixelClusteringEvt);
258
259
260 std::vector<cl::Event> stripEdmPrepDeps =
getDepVector(m_stripEDMEndEvents, stripClusterIndex);
261 stripEdmPrepDeps.push_back(stripL2GEvt);
262
263 m_acc_queue.enqueueTask(stripEdmPrepKernel, &stripEdmPrepDeps, &stripEdmPrepEvt);
264 m_acc_queue.enqueueTask(pixelEdmPrepKernel, &pixelEdmPrepDeps, &pixelEdmPrepEvt);
265
266 }
267
268
269 cl::Event readPixelOutputEvt;
270 cl::Event readStripOutputEvt;
271 std::vector<cl::Event> readPixelOutputDeps;
272 std::vector<cl::Event> readStripOutputDeps;
273
274 readPixelOutputDeps.push_back(pixelEdmPrepEvt);
275 readStripOutputDeps.push_back(stripEdmPrepEvt);
276
277
280
283
284 m_acc_queue.enqueueReadBuffer(edmPixelOutputBuffer, CL_FALSE, 0,
sizeof(uint32_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &readPixelOutputDeps, &readPixelOutputEvt);
285 m_acc_queue.enqueueReadBuffer(edmStripOutputBuffer, CL_FALSE, 0,
sizeof(uint32_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &readStripOutputDeps, &readStripOutputEvt);
286
287
288
289 lock.unlock();
290
291
292
293 std::vector<cl::Event> terminationDeps = { readPixelOutputEvt, readStripOutputEvt };
294 cl::Event::waitForEvents(terminationDeps);
295
296 if(*pixelInputSize == 6) (*FPGAPixelOutput)[0] = 0;
297 if(*stripInputSize == 6) (*FPGAStripOutput)[0] = 0;
298
299
300
301
302 cl_ulong pixel_input_time = writePixelInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - writePixelInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
304 ATH_MSG_DEBUG(
"Pixel input buffer write time: " << pixel_input_time / 1e6 <<
" ms");
305
306
307 cl_ulong strip_input_time = writeStripInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - writeStripInputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
309 ATH_MSG_DEBUG(
"Strip input buffer write time: " << strip_input_time / 1e6 <<
" ms");
310
311
312 cl_ulong pixel_clustering_time = pixelClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - pixelClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
314 ATH_MSG_DEBUG(
"Pixel clustering time: " << pixel_clustering_time / 1e6 <<
" ms");
315
316
317 cl_ulong strip_clustering_time = stripClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - stripClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
319 ATH_MSG_DEBUG(
"Strip clustering time: " << strip_clustering_time / 1e6 <<
" ms");
320
321
322
323 cl_ulong strip_l2g_time = stripL2GEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - stripL2GEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
325 ATH_MSG_DEBUG(
"Strip L2G time: " << strip_l2g_time / 1e6 <<
" ms");
326
327
328
329 cl_ulong pixel_edm_prep_time = pixelEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - pixelEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
330 cl_ulong strip_edm_prep_time = stripEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - stripEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
331
333 ATH_MSG_DEBUG(
"PixelEDMPrep time: " << pixel_edm_prep_time / 1e6 <<
" ms");
334
336 ATH_MSG_DEBUG(
"StripEDMPrep time: " << strip_edm_prep_time / 1e6 <<
" ms");
337
338
339 cl_ulong kernel_start = pixelClusteringEvt.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
340 cl_ulong kernel_end = std::max(pixelEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>(), stripEdmPrepEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>());
342 ATH_MSG_DEBUG(
"Kernel execution time: " << (kernel_end - kernel_start) / 1e6 <<
" ms");
343
344
345 cl_ulong pixel_output_time = readPixelOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - readPixelOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
347 ATH_MSG_DEBUG(
"Pixel output buffer read time: " << pixel_output_time / 1e6 <<
" ms");
348
349
350 cl_ulong strip_output_time = readStripOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_END>() - readStripOutputEvt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
352 ATH_MSG_DEBUG(
"Strip output buffer read time: " << strip_output_time / 1e6 <<
" ms");
353
354
355 return StatusCode::SUCCESS;
356 }
#define ATH_CHECK
Evaluate an expression and check for errors.
std::vector< cl::Event > getDepVector(std::vector< cl::Event > &endEvents, size_t cu) const
SG::ReadHandleKey< std::vector< uint64_t > > m_FPGAPixelRDO
SG::WriteHandleKey< std::vector< uint32_t > > m_FPGAPixelOutput
std::atomic< cl_ulong > m_pixelInputTime
Time for pixel input buffer write.
std::atomic< cl_ulong > m_stripOutputTime
Time for strip output buffer read.
ServiceHandle< IChronoSvc > m_chronoSvc
Service for timing the algorithm.
std::atomic< cl_ulong > m_pixelClusteringTime
Time for pixel clustering.
std::atomic< cl_ulong > m_kernelTime
Time for kernel execution.
Gaudi::Property< int > m_FPGAThreads
std::atomic< cl_ulong > m_stripInputTime
Time for strip input buffer write.
SG::ReadHandleKey< int > m_FPGAStripRDOSize
cl::CommandQueue m_acc_queue
std::atomic< cl_ulong > m_pixelOutputTime
Time for pixel output buffer read.
SG::WriteHandleKey< std::vector< uint32_t > > m_FPGAStripOutput
std::mutex m_fpgaHandleMtx
std::atomic< cl_ulong > m_stripClusteringTime
Time for strip clustering.
std::atomic< ulonglong > m_numEvents
Number of events processed.
SG::ReadHandleKey< int > m_FPGAPixelRDOSize
std::atomic< cl_ulong > m_pixelEdmPrepTime
Time for pixel EDM preparation.
std::atomic< cl_ulong > m_stripL2GTime
Time for strip L2G.
std::atomic< cl_ulong > m_stripEdmPrepTime
Time for strip EDM preparation.
SG::ReadHandleKey< std::vector< uint64_t > > m_FPGAStripRDO
constexpr uint32_t STRIP_CONTAINER_BUF_SIZE
constexpr uint32_t PIXEL_CONTAINER_BUF_SIZE
size_t getNSlots()
Return the number of event slots.
const T * get(const ReadCondHandleKey< T > &key, const EventContext &ctx)
Convenience function to retrieve an object given a ReadCondHandleKey.