45 std::vector<std::string> listofCUs;
58 for(
unsigned int i = 0; i < nthreads; i++)
72 for (
const auto& cuName: listofCUs)
82 else if(cuName.find(
m_pixelLUTKernelName.value()) != std::string::npos) m_pixelLUTKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
83 else if(cuName.find(
m_stripLUTKernelName.value()) != std::string::npos) m_stripLUTKernels.emplace_back(cl::Kernel(
m_program, cuName.c_str()));
101 if(m_pixelLUTKernels.size())
104 std::vector<uint64_t>
data;
113 for(
size_t i = 0; i < m_pixelLUTKernels.size(); i++)
115 cl::Kernel &lutKernel = m_pixelLUTKernels[i];
119 lutKernel.setArg(0, lutBuffer);
120 lutKernel.setArg(2,
static_cast<unsigned long long>(
data.size()));
123 cl::Event lut_inputEvent;
127 queue.enqueueWriteBuffer(lutBuffer, CL_FALSE, 0,
sizeof(uint64_t) *
data.size(),
data.data(), NULL, &lut_inputEvent);
128 queue.enqueueTask(lutKernel, NULL, &lut_inputEvent);
135 if(m_stripLUTKernels.size())
138 std::vector<uint64_t>
data;
147 for(
size_t i = 0; i < m_stripLUTKernels.size(); i++)
149 cl::Kernel &lutKernel = m_stripLUTKernels[i];
154 lutKernel.setArg(0, lutBuffer);
155 lutKernel.setArg(2,
static_cast<unsigned long long>(
data.size()));
158 cl::Event lut_inputEvent;
162 queue.enqueueWriteBuffer(lutBuffer, CL_FALSE, 0,
sizeof(uint64_t) *
data.size(),
data.data(), NULL, &lut_inputEvent);
163 queue.enqueueTask(lutKernel, NULL, &lut_inputEvent);
171 return StatusCode::SUCCESS;
200 const std::vector<uint64_t>* pixelInput{
nullptr}, *stripInput{
nullptr};
204 const int* pixelInputSize{
nullptr}, *stripInputSize{
nullptr};
215 size_t bufferIndex = ctx.slot() % nthreads;
218 size_t pixelStartClusterIndex = ctx.slot() % m_pixelStartClusteringKernels.size();
219 size_t pixelEndClusterIndex = ctx.slot() % m_pixelEndClusteringKernels.size();
220 size_t stripStartClusterIndex = ctx.slot() % m_stripStartClusteringKernels.size();
221 size_t stripEndClusterIndex = ctx.slot() % m_stripEndClusteringKernels.size();
224 const cl::CommandQueue &acc_queue =
m_acc_queues[bufferIndex];
226 ATH_MSG_INFO(
"Thread number "<<ctx.slot()<<
" running on buffer "<<bufferIndex<<
" pixelStartClusterIndex: "<< pixelStartClusterIndex<<
" stripStartClusterIndex: "<< stripStartClusterIndex<<
" stripEndClusterIndex: "<< stripEndClusterIndex);
228 cl::Kernel &pixelStartClusteringKernel = m_pixelStartClusteringKernels[pixelStartClusterIndex];
229 cl::Kernel &pixelEndClusteringKernel = m_pixelEndClusteringKernels[pixelEndClusterIndex];
230 cl::Kernel &stripStartClusteringKernel = m_stripStartClusteringKernels[stripStartClusterIndex];
231 cl::Kernel &stripEndClusteringKernel = m_stripEndClusteringKernels[stripEndClusterIndex];
235 pixelStartClusteringKernel.setArg(2,
static_cast<unsigned long long>(*pixelInputSize));
241 stripStartClusteringKernel.setArg(2,
static_cast<unsigned long long>(*stripInputSize));
246 cl::Event evt_write_pixel_input;
247 cl::Event evt_write_strip_input;
249 acc_queue.enqueueWriteBuffer(
m_pixelClusterInputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(uint64_t) * (*pixelInput).size(), (*pixelInput).data(), NULL, &evt_write_pixel_input);
250 acc_queue.enqueueWriteBuffer(
m_stripClusterInputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(uint64_t) * (*stripInput).size(), (*stripInput).data(), NULL, &evt_write_strip_input);
251 std::vector<cl::Event> evt_vec_pixel_input{evt_write_pixel_input};
252 std::vector<cl::Event> evt_vec_strip_input{evt_write_strip_input};
255 cl::Event evt_pixel_start_clustering;
256 cl::Event evt_pixel_end_clustering;
257 cl::Event evt_strip_start_clustering;
258 cl::Event evt_strip_end_clustering;
263 acc_queue.enqueueTask(pixelStartClusteringKernel, &evt_vec_pixel_input, &evt_pixel_start_clustering);
264 acc_queue.enqueueTask(pixelEndClusteringKernel, NULL , &evt_pixel_end_clustering);
266 acc_queue.enqueueTask(stripStartClusteringKernel, &evt_vec_strip_input, &evt_strip_start_clustering);
267 acc_queue.enqueueTask(stripEndClusteringKernel, NULL, &evt_strip_end_clustering);
271 cl::Event evt_pixel_cluster_output;
272 cl::Event evt_strip_cluster_output;
274 std::vector<cl::Event> evt_vec_pixel_done{evt_pixel_end_clustering};
275 std::vector<cl::Event> evt_vec_strip_done{evt_strip_end_clustering};
286 acc_queue.enqueueReadBuffer(
m_edmPixelOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(uint32_t) * (*FPGAPixelOutput).size(), (*FPGAPixelOutput).data(), &evt_vec_pixel_done, &evt_pixel_cluster_output);
287 acc_queue.enqueueReadBuffer(
m_edmStripOutputBufferList[bufferIndex], CL_FALSE, 0,
sizeof(uint32_t) * (*FPGAStripOutput).size(), (*FPGAStripOutput).data(), &evt_vec_strip_done, &evt_strip_cluster_output);
289 std::vector<cl::Event> wait_for_reads = { evt_pixel_cluster_output, evt_strip_cluster_output};
290 cl::Event::waitForEvents(wait_for_reads);
293 if(*pixelInputSize == 6) (*FPGAPixelOutput)[0] = 0;
294 if(*stripInputSize == 6) (*FPGAStripOutput)[0] = 0;
299 cl_ulong pixel_input_time = evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_pixel_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
303 cl_ulong strip_input_time = evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_write_strip_input.getProfilingInfo<CL_PROFILING_COMMAND_START>();
307 cl_ulong pixel_clustering_time = evt_pixel_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
311 cl_ulong strip_clustering_time = evt_strip_end_clustering.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_start_clustering.getProfilingInfo<CL_PROFILING_COMMAND_START>();
315 cl_ulong pixel_output_time = evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_pixel_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
319 cl_ulong strip_output_time = evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_END>() - evt_strip_cluster_output.getProfilingInfo<CL_PROFILING_COMMAND_START>();
322 return StatusCode::SUCCESS;