ATLAS Offline Software
Loading...
Searching...
No Matches
DeviceMgmtSvc.cxx
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3//
4
5// Local include(s).
6#include "DeviceMgmtSvc.h"
7
8// System include(s).
9#include <filesystem>
10#include <fstream>
11#include <CL/cl_ext_xilinx.h>
12
13namespace AthXRT {
14
20
21 std::vector<cl::Platform> platforms;
22
23 ATH_CHECK(cl::Platform::get(&platforms) == CL_SUCCESS);
24 si.device_count = 0;
25 for (const cl::Platform &platform : platforms) {
26
27 // Filter platforms by name. Currently AMD FPGA platform is
28 // still referenced as "Xilinx".
29 std::string platform_name;
30 ATH_CHECK(platform.getInfo(CL_PLATFORM_NAME, &platform_name) == CL_SUCCESS);
31 if (platform_name != "Xilinx") {
32 ATH_MSG_WARNING("Skipping unsuported platform " << platform_name);
33 continue;
34 }
35
36 // Get devices list for the platform and count total devices.
37 std::vector<cl::Device> devices;
38 ATH_CHECK(platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices) ==
39 CL_SUCCESS);
40 ATH_MSG_DEBUG("Found XRT/OpenCL platform '" << platform_name << "' with "
41 << devices.size()
42 << " devices");
43 si.device_count += devices.size();
44
45 // Group devices by type, using the name property.
46 // All similar devices should have the same name,
47 // I.E: "xilinx_u250_gen3x16_xdma_shell_4_1"
48 for (const cl::Device &device : devices) {
49 const std::string device_name = get_device_name(device);
50
51 // Check if we already have a device with the same name.
52 bool found = false;
53 for (std::vector<cl::Device> &list : si.device_types) {
54 std::string list_device_name;
55 ATH_CHECK(list[0].getInfo(CL_DEVICE_NAME, &list_device_name) ==
56 CL_SUCCESS);
57 if (device_name == list_device_name) {
58 found = true;
59 list.push_back(device);
60 break;
61 }
62 }
63 if (!found) {
64 std::vector<cl::Device> new_list = {device};
65 si.device_types.push_back(new_list);
66 }
67 }
68 }
69
70 // We expect to have at least one device to program.
71 if (si.device_count < 1) {
72 // This should not be this catastrophic and we could fallback to
73 // software implementation, but for now we will consider that an
74 // accelerator have to be present if this service is configured.
75 ATH_MSG_ERROR("No XRT device found");
76 return StatusCode::FAILURE;
77 } else {
78 ATH_MSG_INFO("Found a total of "
79 << si.device_count << " AMD FPGA device(s) ("
80 << si.device_types.size() << " device type(s))");
81 }
82
83 return StatusCode::SUCCESS;
84
85} // DeviceMgmtSvc::inspect_devices()
86
92
93 // We expect at least one XCLBIN file to be specified.
94 if (m_xclbin_path_list.empty()) {
95 ATH_MSG_ERROR("No XCLBIN list specified");
96 return StatusCode::FAILURE;
97 }
98
99 // If there is more XCLBIN files to program than device(s), this
100 // is probably an error (the opposite is ok).
101 if (m_xclbin_path_list.size() > si.device_count) {
103 "More XCLBIN file(s) specified than "
104 "devices type available ("
105 << si.device_count << "): ");
106 for (const std::string &xclbin_path : m_xclbin_path_list) {
107 ATH_MSG_ERROR(xclbin_path);
108 }
109 return StatusCode::FAILURE;
110 }
111
112 // Inspect XCLBIN files.
113 for (const std::string &xclbin_path : m_xclbin_path_list) {
114
115 if (!std::filesystem::exists(xclbin_path)) {
116 ATH_MSG_ERROR("XCLBIN file does not exist: " << xclbin_path);
117 return StatusCode::FAILURE;
118 }
119
120 // Create a temporary XRT API XCLBIN object to use introspection
121 // to gather some information. With this approach, we are loading
122 // them twice from disk which is not optimal.
123 DeviceMgmtSvc::XclbinInfo xclbin_info;
124 try {
125 xrt::xclbin xrt_xclbin(xclbin_path);
126 xclbin_info.path = xclbin_path;
127 xclbin_info.xsa_name = xrt_xclbin.get_xsa_name();
128 xclbin_info.fpga_device_name = xrt_xclbin.get_fpga_device_name();
129 xclbin_info.uuid = xrt_xclbin.get_uuid().to_string();
130
131 for (const xrt::xclbin::kernel &kernel : xrt_xclbin.get_kernels()) {
132 const std::string& kernelName = kernel.get_name();
133
134 // Ensure that the kernel has at least one compute unit.
135 // Having a kernel without compute unit is not a common use case,
136 // but it is possible if a .xo with a kernel is linked in the
137 // .xclbin, but the number of said kernel is set to 0.
138 if (!kernel.get_cus().empty()) {
139 xclbin_info.kernel_names.push_back(kernelName);
140 }
141
142 for (const xrt::xclbin::ip &computeUnit : kernel.get_cus()) {
143 const std::string& computeUnitName = computeUnit.get_name();
144 const std::string computeUnitIsolatedName =
145 computeUnitName.substr(kernelName.size() + 1);
146
147 const std::string computeUnitUsableName = kernelName + ":{" + computeUnitIsolatedName + "}";
148
149 xclbin_info.cu_names.push_back(std::move(computeUnitUsableName));
150 }
151 }
152 } catch (const std::exception &e) {
153 ATH_MSG_ERROR(e.what());
154 ATH_MSG_ERROR("Could not create xrt::xclbin from " << xclbin_path);
155 return StatusCode::FAILURE;
156 }
157 m_xclbin_infos.push_back(xclbin_info);
158 }
159
160 // Extract some more information from the XCLBIN collection:
161 // The number of different XCLBIN files and the number of different
162 // FPGA device names targeted by the XCLBIN files.
163 std::set<std::string> uuids;
164 std::set<std::string> fpga_device_names;
165 for (const XclbinInfo &info : m_xclbin_infos) {
166 uuids.insert(info.uuid);
167 fpga_device_names.insert(info.fpga_device_name);
168 }
169 si.different_xclbin_count = uuids.size();
170 si.different_xclbin_fpga_device_name = fpga_device_names.size();
171
172 return StatusCode::SUCCESS;
173
174} // DeviceMgmtSvc::inspect_xclbin()
175
179std::string DeviceMgmtSvc::get_device_name(const cl::Device &device) const {
180
181 cl_int err = CL_SUCCESS;
182 std::string device_name;
183 err = device.getInfo(CL_DEVICE_NAME, &device_name);
184 if (err != CL_SUCCESS) {
185 ATH_MSG_ERROR("Failed to get device name");
186 return std::string("error");
187 }
188 return device_name;
189}
190
194std::string DeviceMgmtSvc::get_device_bdf(const cl::Device &device) const {
195
196 cl_int err = CL_SUCCESS;
197 std::string device_bdf;
198 err = device.getInfo(CL_DEVICE_PCIE_BDF, &device_bdf);
199 if (err != CL_SUCCESS) {
200 ATH_MSG_ERROR("Failed to get device BDF");
201 return std::string("error");
202 }
203 return device_bdf;
204}
205
/// Get the prefix of a string up to, and including, the nth occurrence of
/// a token.
///
/// Occurrences are counted from the very first character, so a token located
/// at index 0 counts as the first occurrence.
///
/// @param str   The string to extract the prefix from.
/// @param token The character to look for.
/// @param n     The (1-based) occurrence of @c token that ends the prefix.
/// @return The prefix ending with the nth @c token. The whole string is
///         returned if @c token occurs fewer than @c n times, and an empty
///         string is returned if @c n is not positive.
static std::string getPrefixUpToNthOccurrence(const std::string &str,
                                              char token, int n) {

  // Asking for "no occurrence at all" yields an empty prefix.
  if (n <= 0) {
    return std::string();
  }

  std::size_t pos = std::string::npos;
  std::size_t search_from = 0;
  for (int found = 0; found < n; ++found) {
    pos = str.find(token, search_from);
    // If the token isn't found n times, return the whole string.
    if (pos == std::string::npos) {
      return str;
    }
    // Resume right after the match, so that index 0 is not skipped on the
    // first iteration and consecutive tokens are each counted once.
    search_from = pos + 1;
  }

  // Return the substring up to and including the nth occurrence.
  return str.substr(0, pos + 1);
}
232
242 const DeviceMgmtSvc::XclbinInfo &xclbin_info,
243 const cl::Device &device) const {
244
245 const std::string device_prefix =
247 const std::string xsa_prefix =
248 getPrefixUpToNthOccurrence(xclbin_info.xsa_name, '_', 2);
249 if (device_prefix == xsa_prefix) {
250 return true;
251 } else {
252 return false;
253 }
254}
255
277
278 // Do we have only one FPGA type?
279 if (si.device_types.size() == 1) {
280
281 if (m_xclbin_infos.size() == 1) {
282
283 // We have one or multiple device(s) of the same type and only have one
284 // XCLBIN: Program all device(s) with the same XCLBIN and create one
285 // context.
286 ATH_MSG_DEBUG("Case 1: One or multiple identical device(s), one xclbin");
287 DeviceMgmtSvc::AthClContext ath_cl_context;
288 for (const cl::Device &device : si.device_types[0]) {
289 ath_cl_context.devices.push_back(device);
290 }
291 ath_cl_context.xclbin_info = m_xclbin_infos[0];
292 m_ath_cl_contexts.push_back(ath_cl_context);
293
294 } else {
295
297
298 // This is an error: we only have one FPGA type but
299 // XCLBIN files targeting multiple fpga.
301 "Specified XCLBINs target multiple device types, but only one "
302 "device type is present");
303 return StatusCode::FAILURE;
304 }
305
306 if (si.different_xclbin_count == 1) {
307 // We have multiple device of the same type and multiple identical
308 // xclbin: Program the same number of devices that we have XCLBIN files,
309 // but put them in only one context as the XCLBIN will be identical for
310 // all programmed devices. Some devices might be left un-programmed.
312 "Case 2: Multiple identical devices, multiple identical xclbins");
313 DeviceMgmtSvc::AthClContext ath_cl_context;
314 for (std::size_t i = 0; i < m_xclbin_infos.size(); ++i) {
315 ath_cl_context.devices.push_back(si.device_types[0][i]);
316 }
317 ath_cl_context.xclbin_info = m_xclbin_infos[0];
318 m_ath_cl_contexts.push_back(ath_cl_context);
319
320 } else {
321
322 // We have multiple device of the same type and multiple different
323 // XCLBIN, but all targeting the same device: Program all devices with a
324 // different XCLBIN and create one context per device/XCLBIN. Some
325 // devices might be left un-programmed.
327 "Case 3: Multiple identical devices, multiple different XCLBIN "
328 "files, but targeting the same device type");
329 for (std::size_t i = 0; i < m_xclbin_infos.size(); ++i) {
330 DeviceMgmtSvc::AthClContext ath_cl_context;
331 ath_cl_context.xclbin_info = m_xclbin_infos[i];
332 ath_cl_context.devices.push_back(si.device_types[0][i]);
333 m_ath_cl_contexts.push_back(ath_cl_context);
334 }
335 }
336 }
337 } else {
338
339 // More tricky (and probably an edge case): we have different
340 // device types. We will try to pair each device with an
341 // XCLBIN file based on the device name and XCLBIN XSA name,
342 // and load them in separate contexts.
343 ATH_MSG_DEBUG("Case 4: Multiple different devices, multiple xclbins");
344 std::vector<XclbinInfo> unaffected_xclbin_infos(m_xclbin_infos);
345 for (const std::vector<cl::Device> &device_type : si.device_types) {
346 for (const cl::Device &device : device_type) {
347 DeviceMgmtSvc::AthClContext ath_cl_context;
348 ath_cl_context.devices.push_back(device);
349
350 // Try to find a matching XCLBIN for this device.
351 std::vector<XclbinInfo>::iterator iter;
352 bool found = false;
353 for (iter = unaffected_xclbin_infos.begin();
354 iter != unaffected_xclbin_infos.end();) {
355 if (is_xclbin_compatible_with_device(*iter, device)) {
356 ath_cl_context.xclbin_info = *iter;
357 iter = unaffected_xclbin_infos.erase(iter);
358 found = true;
359 break;
360 } else {
361 ++iter;
362 }
363 }
364
365 // Only keep this combination if we found a matching XCLBIN.
366 if (found) {
367 m_ath_cl_contexts.push_back(ath_cl_context);
368 } else {
369 // If we did not find a matching XCLBIN, we will not program the
370 // device. This is not an error, but we will report it.
371 ATH_MSG_WARNING("No compatible XCLBIN found for device "
372 << get_device_name(device) << " ("
373 << get_device_bdf(device) << ")");
374 }
375 }
376 }
377
378 for (const XclbinInfo &xclbin_info : unaffected_xclbin_infos) {
379 // Report XCLBIN files that were not affected to a device.
380 // (This could happen for XCLBIN not compatible with any device.)
382 "No compatible device found for XCLBIN: " << xclbin_info.path);
383 }
384 }
385
386 return StatusCode::SUCCESS;
387
388} // DeviceMgmtSvc::pair_devices_and_xclbins()
389
397
398 cl_int err = CL_SUCCESS;
399
400 for (AthClContext &ath_cl_context : m_ath_cl_contexts) {
401
402 // Create an OpenCL context for the device(s).
403 ath_cl_context.context = std::make_shared<cl::Context>(
404 ath_cl_context.devices, nullptr, nullptr, nullptr, &err);
405 if (err != CL_SUCCESS) {
406 ATH_MSG_ERROR("Failed to create cl::Context");
407 return StatusCode::FAILURE;
408 }
409
410 // Load XCLBIN file from disk.
411 std::ifstream file;
412 try {
413 file.open(ath_cl_context.xclbin_info.path.c_str(), std::ios::binary);
414 } catch (...) {
415 ATH_MSG_ERROR("Could not open " << ath_cl_context.xclbin_info.path
416 << " for reading");
417 return StatusCode::FAILURE;
418 }
419 std::vector<char> xclbin_buffer((std::istreambuf_iterator<char>(file)),
420 std::istreambuf_iterator<char>());
421
422 // Wrap XCLBIN data and size in a vector of cl::Program::Binaries.
423 // If we program multiple devices, we need to provide the same
424 // binary for each device.
425 cl::Program::Binaries binary;
426 for (std::size_t i = 0; i < ath_cl_context.devices.size(); ++i) {
427 binary.push_back({xclbin_buffer.data(), xclbin_buffer.size()});
428 }
429
430 // Create a program from the XCLBIN binary.
431 // Effectively loading the XCLBIN on the device(s).
432 ath_cl_context.program = std::make_shared<cl::Program>(
433 *ath_cl_context.context, ath_cl_context.devices, binary, nullptr, &err);
434 if (err != CL_SUCCESS) {
435 ATH_MSG_ERROR("Failed to create cl::Program");
436 return StatusCode::FAILURE;
437 }
438
439 // Report what have been done.
440 std::string bdfs = "";
441 for (const cl::Device &device : ath_cl_context.devices) {
442 bdfs += get_device_bdf(device);
443 bdfs += " ";
444 }
445 ATH_MSG_INFO("Loaded " << ath_cl_context.xclbin_info.path << " on "
446 << ath_cl_context.devices.size() << " "
447 << get_device_name(ath_cl_context.devices[0])
448 << " device(s): " << bdfs);
449 }
450
451 return StatusCode::SUCCESS;
452
453} // DeviceMgmtSvc::program_devices
454
462
463 SystemInfo sys_info;
464
465 // Inspect available device(s) and fill sys_info.
466 ATH_CHECK(inspect_devices(sys_info));
467
468 // Inspect provided XCLBINs to gather information about kernel(s)
469 // into sys_info and a list of XclbinInfo.
470 ATH_CHECK(inspect_xclbins(sys_info));
471
472 // Now we can make a decision about the pairing of device(s)
473 // and xclbin(s), and the number of required context(s) by
474 // filling m_ath_cl_contexts.
476
477 // Program the devices with the XCLBINs and create contexts,
478 // and programs based on m_ath_cl_contexts.
480
481 // Return gracefully.
482 return StatusCode::SUCCESS;
483}
484
486
487 // Finalise the base class.
488 ATH_CHECK(Service::finalize());
489
490 // Return gracefully.
491 return StatusCode::SUCCESS;
492}
493
494const std::vector<std::shared_ptr<xrt::device>>
495DeviceMgmtSvc::get_xrt_devices_by_kernel_name(const std::string &name) const {
496
497 std::vector<std::shared_ptr<xrt::device>> devices;
498
499 // Iterate over all contexts and check if the kernel name is in the list.
500 // If so, add the device(s) to the list of devices to return.
501 for (const AthClContext &ath_cl_context : m_ath_cl_contexts) {
502 if (std::find(ath_cl_context.xclbin_info.kernel_names.begin(),
503 ath_cl_context.xclbin_info.kernel_names.end(),
504 name) != ath_cl_context.xclbin_info.kernel_names.end()) {
505 for (const cl::Device &device : ath_cl_context.devices) {
506 devices.push_back(std::make_shared<xrt::device>(
507 xrt::opencl::get_xrt_device(device())));
508 }
509 }
510 else if (std::find(ath_cl_context.xclbin_info.cu_names.begin(),
511 ath_cl_context.xclbin_info.cu_names.end(),
512 name) != ath_cl_context.xclbin_info.cu_names.end()) {
513 for (const cl::Device &device : ath_cl_context.devices) {
514 devices.push_back(std::make_shared<xrt::device>(
515 xrt::opencl::get_xrt_device(device())));
516 }
517 }
518 }
519
520 return devices;
521}
522
523const std::vector<IDeviceMgmtSvc::OpenCLHandle>
525 const std::string &name) const {
526
527 std::vector<IDeviceMgmtSvc::OpenCLHandle> handles;
528
529 // Iterate over all contexts and check if the kernel name is in the list.
530 // If so, add the context and program to the list of handles to return.
531 for (const AthClContext &ath_cl_context : m_ath_cl_contexts) {
532 if (std::find(ath_cl_context.xclbin_info.kernel_names.begin(),
533 ath_cl_context.xclbin_info.kernel_names.end(),
534 name) != ath_cl_context.xclbin_info.kernel_names.end()) {
535 IDeviceMgmtSvc::OpenCLHandle handle = {ath_cl_context.context,
536 ath_cl_context.program};
537 handles.push_back(handle);
538 }
539 }
540
541 return handles;
542}
543
544} // namespace AthXRT
#define ATH_CHECK
Evaluate an expression and check for errors.
#define ATH_MSG_ERROR(x)
#define ATH_MSG_INFO(x)
#define ATH_MSG_WARNING(x)
#define ATH_MSG_DEBUG(x)
StatusCode program_devices()
Program the devices with the XCLBIN files and create contexts.
virtual const std::vector< IDeviceMgmtSvc::OpenCLHandle > get_opencl_handles_by_kernel_name(const std::string &name) const override
Get a list of OpenCL handles providing the specified kernel.
std::vector< XclbinInfo > m_xclbin_infos
List of XCLBIN files info configured for the service.
virtual const std::vector< std::shared_ptr< xrt::device > > get_xrt_devices_by_kernel_name(const std::string &name) const override
Get a list of XRT devices providing the specified kernel.
virtual StatusCode finalize() override
Finalise the service.
Gaudi::Property< std::vector< std::string > > m_xclbin_path_list
The list of xclbin files to use.
std::string get_device_name(const cl::Device &device) const
Get the name of a cl::device.
std::vector< AthClContext > m_ath_cl_contexts
List of contexts configured for the service.
StatusCode pair_devices_and_xclbins(const SystemInfo &si)
Pair devices and XCLBINs and create contexts.
StatusCode inspect_xclbins(SystemInfo &si)
Inspect the provided XCLBIN files and fill the SystemInfo structure.
StatusCode inspect_devices(SystemInfo &si)
Inspect the available devices and fill the SystemInfo structure.
bool is_xclbin_compatible_with_device(const DeviceMgmtSvc::XclbinInfo &xclbin_info, const cl::Device &device) const
Helper function to check if an XCLBIN file is compatible with a device.
virtual StatusCode initialize() override
Initialise the service.
std::string get_device_bdf(const cl::Device &device) const
Get the BDF (bus:device:function) string of a cl::device.
int count(std::string s, const std::string &regx)
count how many occurances of a regx are in a string
Definition hcg.cxx:146
static std::string getPrefixUpToNthOccurrence(const std::string &str, char token, int n)
Get substring up to the nth occurrence of a token.
Struct to hold information about a context, as well as the devices, the program and XCLBIN file assoc...
std::vector< cl::Device > devices
std::vector< std::vector< cl::Device > > device_types
Struct to hold information about an XCLBIN file, as well as the kernels it contains.
std::vector< std::string > cu_names
std::vector< std::string > kernel_names
Struct holding OpenCL handles for a kernel.
TFile * file