Skip to content

Commit 73daf3f

Browse files
committed
[Offload] Provide a kernel library useable by the offload runtime
As mentioned in #68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
1 parent d213981 commit 73daf3f

File tree

18 files changed

+370
-115
lines changed

18 files changed

+370
-115
lines changed

clang/lib/Driver/ToolChains/CommonArgs.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,8 +1202,11 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
12021202
options::OPT_fno_openmp, false)) {
12031203
// We need libomptarget (liboffload) if it's the choosen offloading runtime.
12041204
if (Args.hasFlag(options::OPT_foffload_via_llvm,
1205-
options::OPT_fno_offload_via_llvm, false))
1205+
options::OPT_fno_offload_via_llvm, false)) {
12061206
CmdArgs.push_back("-lomptarget");
1207+
if (!Args.hasArg(options::OPT_nogpulib))
1208+
CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
1209+
}
12071210
return false;
12081211
}
12091212

@@ -1240,7 +1243,7 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
12401243
CmdArgs.push_back("-lomptarget");
12411244

12421245
if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib))
1243-
CmdArgs.push_back("-lomptarget.devicertl");
1246+
CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
12441247

12451248
addArchSpecificRPath(TC, Args, CmdArgs);
12461249

offload/DeviceRTL/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
6969
"${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
7070
endif()
7171
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
72+
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE)
7273

7374
set(include_files
7475
${include_directory}/Allocator.h

offload/include/device.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ struct DeviceTy {
124124
/// Calls the corresponding print device info function in the plugin.
125125
bool printDeviceInfo();
126126

127+
/// Return the handle to the kernel with name \p Name in \p HandlePtr.
128+
int32_t getKernelHandle(llvm::StringRef Name, void **HandlePtr);
129+
127130
/// Event related interfaces.
128131
/// {
129132
/// Create an event.

offload/include/omptarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,11 @@ void __tgt_target_data_update_nowait_mapper(
400400
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
401401
int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
402402

403+
/// Launch the kernel \p KernelName with a CUDA style launch and the given grid
404+
/// sizes and arguments (\p KernelArgs).
405+
int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, const char *KernelName,
406+
KernelArgsTy *KernelArgs);
407+
403408
// Non-blocking synchronization for target nowait regions. This function
404409
// acquires the asynchronous context from task data of the current task being
405410
// executed and tries to query for the completion of its operations. If the

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 9 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2016,20 +2016,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
20162016
return Plugin::success();
20172017
}
20182018

2019-
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
2020-
DeviceImageTy &Image) override {
2021-
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2022-
if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
2023-
Image.setPendingGlobalDtors();
2024-
2025-
return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
2019+
virtual Expected<StringRef>
2020+
getGlobalConstructorName(DeviceImageTy &Image) override {
2021+
return "amdgcn.device.init";
20262022
}
2027-
2028-
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
2029-
DeviceImageTy &Image) override {
2030-
if (Image.hasPendingGlobalDtors())
2031-
return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
2032-
return Plugin::success();
2023+
virtual Expected<StringRef>
2024+
getGlobalDestructorName(DeviceImageTy &Image) override {
2025+
return "amdgcn.device.fini";
20332026
}
20342027

20352028
uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
@@ -2107,13 +2100,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
21072100
uint64_t getClockFrequency() const override { return ClockFrequency; }
21082101

21092102
/// Allocate and construct an AMDGPU kernel.
2110-
Expected<GenericKernelTy &> constructKernel(const char *Name) override {
2103+
Expected<GenericKernelTy &>
2104+
constructKernelImpl(llvm::StringRef Name) override {
21112105
// Allocate and construct the AMDGPU kernel.
21122106
AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
21132107
if (!AMDGPUKernel)
21142108
return Plugin::error("Failed to allocate memory for AMDGPU kernel");
21152109

2116-
new (AMDGPUKernel) AMDGPUKernelTy(Name);
2110+
new (AMDGPUKernel) AMDGPUKernelTy(Name.data());
21172111

21182112
return *AMDGPUKernel;
21192113
}
@@ -2791,38 +2785,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27912785
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
27922786
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
27932787

2794-
/// Common method to invoke a single threaded constructor or destructor
2795-
/// kernel by name.
2796-
Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
2797-
bool IsCtor) {
2798-
const char *KernelName =
2799-
IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
2800-
// Perform a quick check for the named kernel in the image. The kernel
2801-
// should be created by the 'amdgpu-lower-ctor-dtor' pass.
2802-
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2803-
if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
2804-
return Plugin::success();
2805-
2806-
// Allocate and construct the AMDGPU kernel.
2807-
AMDGPUKernelTy AMDGPUKernel(KernelName);
2808-
if (auto Err = AMDGPUKernel.init(*this, Image))
2809-
return Err;
2810-
2811-
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
2812-
2813-
KernelArgsTy KernelArgs = {};
2814-
if (auto Err =
2815-
AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
2816-
/*NumBlocks=*/1ul, KernelArgs,
2817-
KernelLaunchParamsTy{}, AsyncInfoWrapper))
2818-
return Err;
2819-
2820-
Error Err = Plugin::success();
2821-
AsyncInfoWrapper.finalize(Err);
2822-
2823-
return Err;
2824-
}
2825-
28262788
/// Detect if current architecture is an APU.
28272789
Error checkIfAPU() {
28282790
// TODO: replace with ROCr API once it becomes available.

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -722,18 +722,17 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
722722
Error synchronize(__tgt_async_info *AsyncInfo);
723723
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
724724

725-
/// Invokes any global constructors on the device if present and is required
726-
/// by the target.
727-
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
728-
DeviceImageTy &Image) {
729-
return Error::success();
725+
/// Call the ctor/dtor of image \p Image, if available.
726+
Error callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor);
727+
728+
/// Return the name of the global constructors on the device.
729+
virtual Expected<StringRef> getGlobalConstructorName(DeviceImageTy &Image) {
730+
return "";
730731
}
731732

732-
/// Invokes any global destructors on the device if present and is required
733-
/// by the target.
734-
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
735-
DeviceImageTy &Image) {
736-
return Error::success();
733+
/// Return the name of the global destructors on the device.
734+
virtual Expected<StringRef> getGlobalDestructorName(DeviceImageTy &Image) {
735+
return "";
737736
}
738737

739738
/// Query for the completion of the pending operations on the __tgt_async_info
@@ -928,8 +927,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
928927
bool useAutoZeroCopy();
929928
virtual bool useAutoZeroCopyImpl() { return false; }
930929

931-
/// Allocate and construct a kernel object.
932-
virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
930+
/// Retrieve the kernel with name \p Name from image \p Image (or any image if
931+
/// \p Image is null) and return it. If \p Optional is true, the function
932+
/// returns success if there is no kernel with the given name.
933+
Expected<GenericKernelTy *> getKernel(llvm::StringRef Name,
934+
DeviceImageTy *Image = nullptr,
935+
bool Optional = false);
933936

934937
/// Reference to the underlying plugin that created this device.
935938
GenericPluginTy &Plugin;
@@ -947,6 +950,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
947950
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
948951

949952
private:
953+
/// Allocate and construct a kernel object (users should use getKernel).
954+
virtual Expected<GenericKernelTy &>
955+
constructKernelImpl(llvm::StringRef Name) = 0;
956+
950957
/// Get and set the stack size and heap size for the device. If not used, the
951958
/// plugin can implement the setters as no-op and setting the output
952959
/// value to zero for the getters.
@@ -1046,6 +1053,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
10461053
private:
10471054
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
10481055
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
1056+
1057+
DenseMap<StringRef, GenericKernelTy *> KernelMap;
10491058
};
10501059

10511060
/// Class implementing common functionalities of offload plugins. Each plugin

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 98 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737
#include <cstdint>
3838
#include <limits>
39+
#include <string>
3940

4041
using namespace llvm;
4142
using namespace omp;
@@ -809,7 +810,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
809810

810811
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
811812
for (DeviceImageTy *Image : LoadedImages)
812-
if (auto Err = callGlobalDestructors(Plugin, *Image))
813+
if (auto Err = callGlobalCtorDtor(*Image, /*Ctor*/ false))
813814
return Err;
814815

815816
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
@@ -866,6 +867,37 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
866867

867868
return deinitImpl();
868869
}
870+
871+
Error GenericDeviceTy::callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor) {
872+
auto NameOrErr =
873+
IsCtor ? getGlobalConstructorName(Image) : getGlobalDestructorName(Image);
874+
if (auto Err = NameOrErr.takeError())
875+
return Err;
876+
// No error but no name, that means there is no ctor/dtor.
877+
if (NameOrErr->empty())
878+
return Plugin::success();
879+
880+
auto KernelOrErr = getKernel(*NameOrErr, &Image, /*Optional=*/true);
881+
if (auto Err = KernelOrErr.takeError())
882+
return Err;
883+
884+
if (GenericKernelTy *Kernel = *KernelOrErr) {
885+
KernelArgsTy KernelArgs;
886+
KernelArgs.NumTeams[0] = KernelArgs.ThreadLimit[0] = 1;
887+
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
888+
if (auto Err = Kernel->launch(*this, /*ArgPtrs=*/nullptr,
889+
/*ArgOffsets=*/nullptr, KernelArgs,
890+
AsyncInfoWrapper))
891+
return Err;
892+
893+
Error Err = Plugin::success();
894+
AsyncInfoWrapper.finalize(Err);
895+
return Err;
896+
}
897+
898+
return Plugin::success();
899+
}
900+
869901
Expected<DeviceImageTy *>
870902
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
871903
const __tgt_device_image *InputTgtImage) {
@@ -927,8 +959,8 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
927959
#endif
928960

929961
// Call any global constructors present on the device.
930-
if (auto Err = callGlobalConstructors(Plugin, *Image))
931-
return std::move(Err);
962+
if (auto Err = callGlobalCtorDtor(*Image, /*Ctor*/ true))
963+
return Err;
932964

933965
// Return the pointer to the table of entries.
934966
return Image;
@@ -1533,6 +1565,67 @@ Error GenericDeviceTy::printInfo() {
15331565
return Plugin::success();
15341566
}
15351567

1568+
Expected<GenericKernelTy *> GenericDeviceTy::getKernel(llvm::StringRef Name,
1569+
DeviceImageTy *ImagePtr,
1570+
bool Optional) {
1571+
bool KernelFound = false;
1572+
GenericKernelTy *&KernelPtr = KernelMap[Name];
1573+
if (!KernelPtr) {
1574+
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
1575+
1576+
auto CheckImage = [&](DeviceImageTy &Image) -> GenericKernelTy * {
1577+
if (!GHandler.isSymbolInImage(*this, Image, Name))
1578+
return nullptr;
1579+
KernelFound = true;
1580+
1581+
auto KernelOrErr = constructKernelImpl(Name);
1582+
if (Error Err = KernelOrErr.takeError()) {
1583+
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
1584+
DP("Failed to construct kernel ('%s'): %s", Name.data(),
1585+
ErrStr.c_str());
1586+
return nullptr;
1587+
}
1588+
1589+
GenericKernelTy &Kernel = *KernelOrErr;
1590+
if (auto Err = Kernel.init(*this, Image)) {
1591+
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
1592+
DP("Failed to initialize kernel ('%s'): %s", Name.data(),
1593+
ErrStr.c_str());
1594+
return nullptr;
1595+
}
1596+
1597+
return &Kernel;
1598+
};
1599+
1600+
if (ImagePtr) {
1601+
KernelPtr = CheckImage(*ImagePtr);
1602+
} else {
1603+
for (DeviceImageTy *Image : LoadedImages) {
1604+
KernelPtr = CheckImage(*Image);
1605+
if (KernelPtr)
1606+
break;
1607+
}
1608+
}
1609+
}
1610+
1611+
// If we didn't find the kernel and it was optional, we do not emit an error.
1612+
if (!KernelPtr && !KernelFound && Optional)
1613+
return nullptr;
1614+
// If we didn't find the kernel and it was not optional, we will emit an
1615+
// error.
1616+
if (!KernelPtr && !KernelFound)
1617+
return Plugin::error(
1618+
"Kernel '%s' not found%s", Name.data(),
1619+
ImagePtr
1620+
? ""
1621+
: ", searched " + std::to_string(LoadedImages.size()) + " images");
1622+
// If we found the kernel but couldn't initialize it, we will emit an error.
1623+
if (!KernelPtr)
1624+
return Plugin::error("Kernel '%s' failed to initialize");
1625+
// Found the kernel and initialized it.
1626+
return KernelPtr;
1627+
}
1628+
15361629
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
15371630
return createEventImpl(EventPtrStorage);
15381631
}
@@ -2147,20 +2240,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
21472240

21482241
GenericDeviceTy &Device = Image.getDevice();
21492242

2150-
auto KernelOrErr = Device.constructKernel(Name);
2243+
auto KernelOrErr = Device.getKernel(Name, &Image);
21512244
if (Error Err = KernelOrErr.takeError()) {
21522245
REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
21532246
return OFFLOAD_FAIL;
21542247
}
21552248

2156-
GenericKernelTy &Kernel = *KernelOrErr;
2157-
if (auto Err = Kernel.init(Device, Image)) {
2158-
REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
2159-
return OFFLOAD_FAIL;
2160-
}
2161-
21622249
// Note that this is not the kernel's device address.
2163-
*KernelPtr = &Kernel;
2250+
*KernelPtr = *KernelOrErr;
21642251
return OFFLOAD_SUCCESS;
21652252
}
21662253

0 commit comments

Comments
 (0)