Skip to content

[OpenMP][Offload][Runtime] Add map types and map behaviour tweaks for descriptor and descriptor base address #138754

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ enum class OpenMPOffloadMappingFlags : uint64_t {
// dynamic.
// This is an OpenMP extension for the sake of OpenACC support.
OMP_MAP_OMPX_HOLD = 0x2000,
// Mapping is for a descriptor (a.k.a. dope vector)
OMP_MAP_DESCRIPTOR = 0x4000,
// Mapping is for a descriptor's (a.k.a. dope vector) data base address
OMP_MAP_DESCRIPTOR_BASE_ADDR = 0x8000,
/// Signal that the runtime library should use args as an array of
/// descriptor_dim pointers and use args_size as dims. Used when we have
/// non-contiguous list items in target update directive
Expand Down
19 changes: 11 additions & 8 deletions offload/include/OpenMP/Mapping.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ struct ShadowPtrInfoTy {
void *HstPtrVal = nullptr;
void **TgtPtrAddr = nullptr;
void *TgtPtrVal = nullptr;
bool IsDescriptorBaseAddr = false;

bool operator==(const ShadowPtrInfoTy &Other) const {
return HstPtrAddr == Other.HstPtrAddr;
Expand All @@ -68,6 +69,7 @@ struct HostDataToTargetTy {
const uintptr_t HstPtrBegin;
const uintptr_t HstPtrEnd; // non-inclusive.
const map_var_info_t HstPtrName; // Optional source name of mapped variable.
const int32_t AllocKind;

const uintptr_t TgtAllocBegin; // allocated target memory
const uintptr_t TgtPtrBegin; // mapped target memory = TgtAllocBegin + padding
Expand Down Expand Up @@ -124,10 +126,11 @@ struct HostDataToTargetTy {
public:
HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E,
uintptr_t TgtAllocBegin, uintptr_t TgtPtrBegin,
bool UseHoldRefCount, map_var_info_t Name = nullptr,
bool IsINF = false)
bool UseHoldRefCount, int32_t AllocKind,
map_var_info_t Name = nullptr, bool IsINF = false)
: HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name),
TgtAllocBegin(TgtAllocBegin), TgtPtrBegin(TgtPtrBegin),
AllocKind(AllocKind), TgtAllocBegin(TgtAllocBegin),
TgtPtrBegin(TgtPtrBegin),
States(std::make_unique<StatesTy>(UseHoldRefCount ? 0
: IsINF ? INFRefCount
: 1,
Expand Down Expand Up @@ -479,11 +482,11 @@ struct MappingInfoTy {
/// - Data transfer issue fails.
TargetPointerResultTy getTargetPointer(
HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase,
int64_t TgtPadding, int64_t Size, map_var_info_t HstPtrName,
bool HasFlagTo, bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier,
AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR = nullptr,
bool ReleaseHDTTMap = true);
int64_t TgtPadding, int64_t Size, int64_t TypeFlags,
map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo,
HostDataToTargetTy *OwnedTPR = nullptr, bool ReleaseHDTTMap = true);

/// Return the target pointer for \p HstPtrBegin in \p HDTTMap. The accessor
/// ensures exclusive access to the HDTT map.
Expand Down
4 changes: 4 additions & 0 deletions offload/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ enum tgt_map_type {
// the structured region
// This is an OpenMP extension for the sake of OpenACC support.
OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000,
// mapping is for a descriptor (a.k.a. dope vector)
OMP_TGT_MAPTYPE_DESCRIPTOR = 0x4000,
// Mapping is for a descriptor's (a.k.a. dope vector) data base address
OMP_TGT_MAPTYPE_DESCRIPTOR_BASE_ADDR = 0x8000,
// descriptor for non-contiguous target-update
OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000,
// member of struct, member given by [16 MSBs] - 1
Expand Down
92 changes: 66 additions & 26 deletions offload/libomptarget/OpenMP/Mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
/*HstPtrEnd=*/(uintptr_t)HstPtrBegin + Size,
/*TgtAllocBegin=*/(uintptr_t)TgtPtrBegin,
/*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
/*UseHoldRefCount=*/false, /*Name=*/nullptr,
/*UseHoldRefCount=*/false, /*AllocKind=*/TARGET_ALLOC_DEFAULT,
/*Name=*/nullptr,
/*IsRefCountINF=*/true))
.first->HDTT;
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
Expand Down Expand Up @@ -199,10 +200,11 @@ LookupResult MappingInfoTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,

TargetPointerResultTy MappingInfoTy::getTargetPointer(
HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase,
int64_t TgtPadding, int64_t Size, map_var_info_t HstPtrName, bool HasFlagTo,
bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier,
AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR, bool ReleaseHDTTMap) {
int64_t TgtPadding, int64_t Size, int64_t TypeFlags,
map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo,
HostDataToTargetTy *OwnedTPR, bool ReleaseHDTTMap) {

LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size, OwnedTPR);
LR.TPR.Flags.IsPresent = true;
Expand Down Expand Up @@ -286,17 +288,28 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
} else if (Size) {
// If it is not contained and Size > 0, we should create a new entry for it.
LR.TPR.Flags.IsNewEntry = true;

int32_t AllocKind = TARGET_ALLOC_DEFAULT;

if (TypeFlags == OMP_TGT_MAPTYPE_DESCRIPTOR &&
Device.RTL->use_shared_mem_for_descriptor(Device.DeviceID, Size)) {
AllocKind = TARGET_ALLOC_SHARED;
INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
"Using shared memory for descriptor allocation of size=%zu\n", Size);
}

uintptr_t TgtAllocBegin =
(uintptr_t)Device.allocData(TgtPadding + Size, HstPtrBegin);
(uintptr_t)Device.allocData(TgtPadding + Size, HstPtrBegin, AllocKind);
uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding;
// Release the mapping table lock only after the entry is locked by
// attaching it to TPR.
LR.TPR.setEntry(HDTTMap
->emplace(new HostDataToTargetTy(
(uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
(uintptr_t)HstPtrBegin + Size, TgtAllocBegin,
TgtPtrBegin, HasHoldModifier, HstPtrName))
.first->HDTT);
LR.TPR.setEntry(
HDTTMap
->emplace(new HostDataToTargetTy(
(uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
(uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin,
HasHoldModifier, AllocKind, HstPtrName))
.first->HDTT);
INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
"Creating new map entry with HstPtrBase=" DPxMOD
", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD
Expand Down Expand Up @@ -326,20 +339,47 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
// data transfer.
if (LR.TPR.TargetPointer && !LR.TPR.Flags.IsHostPointer && HasFlagTo &&
(LR.TPR.Flags.IsNewEntry || HasFlagAlways) && Size != 0) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size,
DPxPTR(HstPtrBegin), DPxPTR(LR.TPR.TargetPointer));

int Ret = Device.submitData(LR.TPR.TargetPointer, HstPtrBegin, Size,
AsyncInfo, LR.TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
// We will also return nullptr if the data movement fails because that
// pointer points to a corrupted memory region so it doesn't make any
// sense to continue to use it.
LR.TPR.TargetPointer = nullptr;
} else if (LR.TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
OFFLOAD_SUCCESS)
return TargetPointerResultTy{};
if (LR.TPR.Flags.IsNewEntry ||
LR.TPR.getEntry()->AllocKind != TARGET_ALLOC_SHARED) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
Size, DPxPTR(HstPtrBegin), DPxPTR(LR.TPR.TargetPointer));

// If we are mapping a descriptor/dope vector, we map it with always as
// this information should always be up-to-date. Another issue is that
// due to an edge-case with declare target preventing this information
// being initialized on device we have force initialize it with always.
// However, in these cases to prevent overwriting of the data pointer
// breaking any pointer <-> data attachment that previous mappings may
// have established, we skip over the data pointer stored in the dope
// vector/descriptor, a subsequent seperate mapping of the pointer and
// data by the compiler should correctly establish any required data
// mappings, the descriptor mapping primarily just populates the relevant
// descriptor data fields that the fortran runtime depends on for bounds
// calculation and other relating things. The pointer is always in the
// same place, the first field of the descriptor structure, so we skip
// it by offsetting by 8-bytes. On architectures with more varied pointer
// sizes this may need further thought, but so would a lot of the data
// mapping I imagine if host/device pointers are mismatched sizes.
if ((TypeFlags & OMP_TGT_MAPTYPE_DESCRIPTOR) && HasFlagAlways) {
uintptr_t DescDataPtrOffset = 8;
HstPtrBegin = (void *)((uintptr_t)HstPtrBegin + DescDataPtrOffset);
LR.TPR.TargetPointer =
(void *)((uintptr_t)LR.TPR.TargetPointer + DescDataPtrOffset);
Size = Size - DescDataPtrOffset;
}

int Ret = Device.submitData(LR.TPR.TargetPointer, HstPtrBegin, Size,
AsyncInfo, LR.TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
// We will also return nullptr if the data movement fails because that
// pointer points to a corrupted memory region so it doesn't make any
// sense to continue to use it.
LR.TPR.TargetPointer = nullptr;
} else if (LR.TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
OFFLOAD_SUCCESS)
return TargetPointerResultTy{};
}
} else {
// If not a host pointer and no present modifier, we need to wait for the
// event if it exists.
Expand Down
4 changes: 2 additions & 2 deletions offload/libomptarget/PluginManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,8 @@ static int loadImagesOntoDevice(DeviceTy &Device) {
CurrHostEntry->Size /*HstPtrEnd*/,
(uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/,
(uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/,
false /*UseHoldRefCount*/, CurrHostEntry->SymbolName,
true /*IsRefCountINF*/));
false /*UseHoldRefCount*/, TARGET_ALLOC_DEFAULT /*AllocKind*/,
CurrHostEntry->SymbolName, true /*IsRefCountINF*/));

// Notify about the new mapping.
if (Device.notifyDataMapped(CurrHostEntry->Address,
Expand Down
26 changes: 21 additions & 5 deletions offload/libomptarget/omptarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
// when HasPresentModifier.
PointerTpr = Device.getMappingInfo().getTargetPointer(
HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
/*HstPtrName=*/nullptr,
ArgTypes[I], /*HstPtrName=*/nullptr,
/*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
/*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
Expand Down Expand Up @@ -451,9 +451,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
// Note that HDTTMap will be released in getTargetPointer.
auto TPR = Device.getMappingInfo().getTargetPointer(
HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, ArgTypes[I],
HstPtrName, HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef,
HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
PointerTpr.getEntry());
void *TgtPtrBegin = TPR.TargetPointer;
IsHostPtr = TPR.Flags.IsHostPointer;
// If data_size==0, then the argument could be a zero-length pointer to
Expand Down Expand Up @@ -482,7 +483,9 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,

if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
(void **)PointerHstPtrBegin, HstPtrBase,
(void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
(void **)PointerTgtPtrBegin, ExpectedTgtPtrBase,
static_cast<bool>(ArgTypes[I] &
OMP_TGT_MAPTYPE_DESCRIPTOR_BASE_ADDR)})) {
DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

Expand Down Expand Up @@ -591,6 +594,12 @@ postProcessingTargetDataEnd(DeviceTy *Device,
const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
if (HasFrom) {
Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
// For Fortran descriptors/dope vectors, it is possible, we have
// deallocated the data on host and the descriptor persists as it is
// a separate entity, and we do not want to map back the data to host
// in these cases when releasing the dope vector.
if (*ShadowPtr.HstPtrAddr == nullptr && ShadowPtr.IsDescriptorBaseAddr)
return OFFLOAD_SUCCESS;
*ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
DP("Restoring original host pointer value " DPxMOD " for host "
"pointer " DPxMOD "\n",
Expand Down Expand Up @@ -833,6 +842,13 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
AsyncInfo.addPostProcessingFunction([=]() -> int {
int Ret = Entry->foreachShadowPointerInfo(
[&](const ShadowPtrInfoTy &ShadowPtr) {
// For Fortran descriptors/dope vectors, it is possible, we have
// deallocated the data on host and the descriptor persists as it
// is a separate entity, and we do not want to map back the data
// to host in these cases when releasing the dope vector.
if (*ShadowPtr.HstPtrAddr == nullptr &&
ShadowPtr.IsDescriptorBaseAddr)
return OFFLOAD_SUCCESS;
*ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
DP("Restoring original host pointer value " DPxMOD
" for host pointer " DPxMOD "\n",
Expand Down
16 changes: 13 additions & 3 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1906,9 +1906,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
OMPX_UseMultipleSdmaEngines(
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
HostDevice(HostDevice) {}
OMPX_ApuMaps("OMPX_APU_MAPS", false),
OMPX_SharedDescriptorMaxSize("LIBOMPTARGET_SHARED_DESCRIPTOR_MAX_SIZE",
96),
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}

~AMDGPUDeviceTy() {}

Expand Down Expand Up @@ -2813,6 +2815,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }

bool useSharedMemForDescriptor(int64_t Size) override {
return Size <= OMPX_SharedDescriptorMaxSize;
}

private:
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
Expand Down Expand Up @@ -2911,6 +2917,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// automatic zero-copy behavior on non-APU GPUs.
BoolEnvar OMPX_ApuMaps;

/// Descriptors of size <= to this value will be allocated using shared
/// memory. Default value is 48.
UInt32Envar OMPX_SharedDescriptorMaxSize;

/// Stream manager for AMDGPU streams.
AMDGPUStreamManagerTy AMDGPUStreamManager;

Expand Down
8 changes: 8 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -942,6 +942,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Allocate and construct a kernel object.
virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;

/// Return true if a descriptor of size 'Size' should be allocated using
/// shared memory. Default implementation returns 'false',
virtual bool useSharedMemForDescriptor(int64_t Size);

/// Reference to the underlying plugin that created this device.
GenericPluginTy &Plugin;

Expand Down Expand Up @@ -1344,6 +1348,10 @@ struct GenericPluginTy {
int32_t get_function(__tgt_device_binary Binary, const char *Name,
void **KernelPtr);

/// Return true if a descriptor of size 'Size' should be allocated using
/// shared memory.
bool use_shared_mem_for_descriptor(int32_t DeviceId, int64_t Size);

private:
/// Indicates if the platform runtime has been fully initialized.
bool Initialized = false;
Expand Down
7 changes: 7 additions & 0 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1597,6 +1597,8 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {

bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }

bool GenericDeviceTy::useSharedMemForDescriptor(int64_t Size) { return false; }

Error GenericPluginTy::init() {
if (Initialized)
return Plugin::success();
Expand Down Expand Up @@ -2199,3 +2201,8 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
*KernelPtr = &Kernel;
return OFFLOAD_SUCCESS;
}

bool GenericPluginTy::use_shared_mem_for_descriptor(int32_t DeviceId,
int64_t Size) {
return getDevice(DeviceId).useSharedMemForDescriptor(Size);
}