From c7263e78dc3cdefefef52369aeed96864f04c0bf Mon Sep 17 00:00:00 2001
From: agozillon
Date: Tue, 10 Dec 2024 11:46:32 -0600
Subject: [PATCH] [Flang][OpenMP] Add global address space to globals for
 target device

Currently we do not add the appropriate global address space to globals
in the target device pass. This PR adds a preliminary set of these
address space markings and then inserts the appropriate casts where
necessary (primarily from the global to the program address space).
This yields more correct IR that the backends (in particular AMDGPU)
can handle better for optimisations and code correctness.

Co-authored-by: Raghu Maddhipatla
---
 .../flang/Optimizer/Builder/FIRBuilder.h      |  4 +
 .../flang/Optimizer/CodeGen/FIROpPatterns.h   |  3 +
 flang/lib/Optimizer/Builder/FIRBuilder.cpp    | 14 +++
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 99 ++++++++++++++++---
 flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 25 ++++-
 flang/test/Fir/convert-to-llvm.fir            | 56 ++++++++---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 12 ++-
 7 files changed, 182 insertions(+), 31 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index c5d86e713f253..eb32aeee9fe48 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -804,6 +804,10 @@ elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams);
 /// Get the address space which should be used for allocas
 uint64_t getAllocaAddressSpace(mlir::DataLayout *dataLayout);
 
+uint64_t getGlobalAddressSpace(mlir::DataLayout *dataLayout);
+
+uint64_t getProgramAddressSpace(mlir::DataLayout *dataLayout);
+
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
index 35749dae5d7e9..1659ede937f01 100644
--- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
+++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
@@ -185,6 +185,9 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
   unsigned
   getProgramAddressSpace(mlir::ConversionPatternRewriter &rewriter) const;
 
+  unsigned
+  getGlobalAddressSpace(mlir::ConversionPatternRewriter &rewriter) const;
+
   const fir::FIRToLLVMPassOptions &options;
 
   using ConvertToLLVMPattern::match;
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index d01becfe80093..558f62616ae7e 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -1740,3 +1740,17 @@ uint64_t fir::factory::getAllocaAddressSpace(mlir::DataLayout *dataLayout) {
       return mlir::cast<mlir::IntegerAttr>(addrSpace).getUInt();
   return 0;
 }
+
+uint64_t fir::factory::getGlobalAddressSpace(mlir::DataLayout *dataLayout) {
+  if (dataLayout)
+    if (mlir::Attribute addrSpace = dataLayout->getGlobalMemorySpace())
+      return mlir::cast<mlir::IntegerAttr>(addrSpace).getUInt();
+  return 0;
+}
+
+uint64_t fir::factory::getProgramAddressSpace(mlir::DataLayout *dataLayout) {
+  if (dataLayout)
+    if (mlir::Attribute addrSpace = dataLayout->getProgramMemorySpace())
+      return mlir::cast<mlir::IntegerAttr>(addrSpace).getUInt();
+  return 0;
+}
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 5ba93fefab3f9..29de325817ab2 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -134,6 +134,49 @@ addLLVMOpBundleAttrs(mlir::ConversionPatternRewriter &rewriter,
 }
 
 namespace {
+
+// Creates an AddressOfOp for the given symbol, inserting an AddrSpaceCastOp
+// after it when the global and program address spaces of the type differ.
+mlir::Value createAddrOfOrASCast(mlir::ConversionPatternRewriter &rewriter,
+                                 mlir::Location loc, std::uint64_t globalAS,
+                                 std::uint64_t programAS,
+                                 llvm::StringRef symName, mlir::Type type) {
+  if (mlir::isa<mlir::LLVM::LLVMPointerType>(type)) {
+    if (globalAS != programAS) {
+      auto llvmAddrOp = rewriter.create<mlir::LLVM::AddressOfOp>(
+          loc, getLlvmPtrType(rewriter.getContext(), globalAS), symName);
+      return rewriter.create<mlir::LLVM::AddrSpaceCastOp>(
+          loc, getLlvmPtrType(rewriter.getContext(), programAS), llvmAddrOp);
+    }
+    return rewriter.create<mlir::LLVM::AddressOfOp>(
+        loc, getLlvmPtrType(rewriter.getContext(), globalAS), symName);
+  }
+  return rewriter.create<mlir::LLVM::AddressOfOp>(loc, type, symName);
+}
+
+// Replaces an existing operation with an AddressOfOp, or with an AddressOfOp
+// plus an AddrSpaceCastOp when the global and program address spaces differ.
+mlir::Value replaceWithAddrOfOrASCast(mlir::ConversionPatternRewriter &rewriter,
+                                      mlir::Location loc,
+                                      std::uint64_t globalAS,
+                                      std::uint64_t programAS,
+                                      llvm::StringRef symName, mlir::Type type,
+                                      mlir::Operation *replaceOp) {
+  if (mlir::isa<mlir::LLVM::LLVMPointerType>(type)) {
+    if (globalAS != programAS) {
+      auto llvmAddrOp = rewriter.create<mlir::LLVM::AddressOfOp>(
+          loc, getLlvmPtrType(rewriter.getContext(), globalAS), symName);
+      return rewriter.replaceOpWithNewOp<mlir::LLVM::AddrSpaceCastOp>(
+          replaceOp, ::getLlvmPtrType(rewriter.getContext(), programAS),
+          llvmAddrOp);
+    }
+    return rewriter.replaceOpWithNewOp<mlir::LLVM::AddressOfOp>(
+        replaceOp, getLlvmPtrType(rewriter.getContext(), globalAS), symName);
+  }
+  return rewriter.replaceOpWithNewOp<mlir::LLVM::AddressOfOp>(replaceOp, type,
+                                                              symName);
+}
+
 /// Lower `fir.address_of` operation to `llvm.address_of` operation.
 struct AddrOfOpConversion : public fir::FIROpConversion<fir::AddrOfOp> {
   using FIROpConversion::FIROpConversion;
@@ -141,9 +184,15 @@ struct AddrOfOpConversion : public fir::FIROpConversion<fir::AddrOfOp> {
   llvm::LogicalResult
   matchAndRewrite(fir::AddrOfOp addr, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
-    auto ty = convertType(addr.getType());
-    rewriter.replaceOpWithNewOp<mlir::LLVM::AddressOfOp>(
-        addr, ty, addr.getSymbol().getRootReference().getValue());
+    auto global = addr->getParentOfType<mlir::ModuleOp>()
+                      .lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol());
+    replaceWithAddrOfOrASCast(
+        rewriter, addr->getLoc(),
+        global ? global.getAddrSpace() : getGlobalAddressSpace(rewriter),
+        getProgramAddressSpace(rewriter),
+        global ? global.getSymName()
+               : addr.getSymbol().getRootReference().getValue(),
+        convertType(addr.getType()), addr);
     return mlir::success();
   }
 };
@@ -1350,14 +1399,34 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
             ? fir::NameUniquer::getTypeDescriptorAssemblyName(recType.getName())
             : fir::NameUniquer::getTypeDescriptorName(recType.getName());
     mlir::Type llvmPtrTy = ::getLlvmPtrType(mod.getContext());
+
+    // We allow the module to be set to a default layout if it is a regular
+    // module, but we prevent this for GPU modules, as the data layout in that
+    // case is currently the union of the GPU module's layout and that of the
+    // parent builtin module, with the GPU module overriding the parent where
+    // there are duplicates. Forcing the default layout onto a GPU module with
+    // no data layout would cause problems, as the infrastructure does not
+    // currently support the union of two layouts that both contain builtin
+    // data layout entries (and it does not look like it was intended to).
+    std::optional<mlir::DataLayout> dataLayout =
+        fir::support::getOrSetMLIRDataLayout(
+            mod, /*allowDefaultLayout*/ mlir::isa<mlir::gpu::GPUModuleOp>(mod)
false + : true); + assert(dataLayout.has_value() && "Module missing DataLayout information"); + if (auto global = mod.template lookupSymbol(name)) { - return rewriter.create(loc, llvmPtrTy, - global.getSymName()); + return createAddrOfOrASCast( + rewriter, loc, fir::factory::getGlobalAddressSpace(&*dataLayout), + fir::factory::getProgramAddressSpace(&*dataLayout), + global.getSymName(), llvmPtrTy); } if (auto global = mod.template lookupSymbol(name)) { // The global may have already been translated to LLVM. - return rewriter.create(loc, llvmPtrTy, - global.getSymName()); + return createAddrOfOrASCast( + rewriter, loc, global.getAddrSpace(), + fir::factory::getProgramAddressSpace(&*dataLayout), + global.getSymName(), llvmPtrTy); } // Type info derived types do not have type descriptors since they are the // types defining type descriptors. @@ -2896,12 +2965,16 @@ struct TypeDescOpConversion : public fir::FIROpConversion { : fir::NameUniquer::getTypeDescriptorName(recordType.getName()); auto llvmPtrTy = ::getLlvmPtrType(typeDescOp.getContext()); if (auto global = module.lookupSymbol(typeDescName)) { - rewriter.replaceOpWithNewOp( - typeDescOp, llvmPtrTy, global.getSymName()); + replaceWithAddrOfOrASCast(rewriter, typeDescOp->getLoc(), + global.getAddrSpace(), + getProgramAddressSpace(rewriter), + global.getSymName(), llvmPtrTy, typeDescOp); return mlir::success(); } else if (auto global = module.lookupSymbol(typeDescName)) { - rewriter.replaceOpWithNewOp( - typeDescOp, llvmPtrTy, global.getSymName()); + replaceWithAddrOfOrASCast(rewriter, typeDescOp->getLoc(), + getGlobalAddressSpace(rewriter), + getProgramAddressSpace(rewriter), + global.getSymName(), llvmPtrTy, typeDescOp); return mlir::success(); } return mlir::failure(); @@ -2992,8 +3065,8 @@ struct GlobalOpConversion : public fir::FIROpConversion { mlir::SymbolRefAttr comdat; llvm::ArrayRef attrs; auto g = rewriter.create( - loc, tyAttr, isConst, linkage, global.getSymName(), initAttr, 0, 0, - false, false, comdat, attrs, dbgExprs); + loc, tyAttr, isConst, linkage, global.getSymName(), initAttr, 0, + getGlobalAddressSpace(rewriter), false, false, comdat, attrs, dbgExprs); if (global.getAlignment() && *global.getAlignment() > 0) g.setAlignment(*global.getAlignment()); diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 12021deb4bd97..a352e00649c23 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -346,7 +346,10 @@ unsigned ConvertFIRToLLVMPattern::getAllocaAddressSpace( mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); assert(parentOp != nullptr && "expected insertion block to have parent operation"); - if (auto module = parentOp->getParentOfType()) + auto module = mlir::isa(parentOp) + ? mlir::cast(parentOp) + : parentOp->getParentOfType(); + if (module) if (mlir::Attribute addrSpace = mlir::DataLayout(module).getAllocaMemorySpace()) return llvm::cast(addrSpace).getUInt(); @@ -358,11 +361,29 @@ unsigned ConvertFIRToLLVMPattern::getProgramAddressSpace( mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); assert(parentOp != nullptr && "expected insertion block to have parent operation"); - if (auto module = parentOp->getParentOfType()) + auto module = mlir::isa(parentOp) + ? 
+                    ? mlir::cast<mlir::ModuleOp>(parentOp)
+                    : parentOp->getParentOfType<mlir::ModuleOp>();
+  if (module)
     if (mlir::Attribute addrSpace =
             mlir::DataLayout(module).getProgramMemorySpace())
       return llvm::cast<mlir::IntegerAttr>(addrSpace).getUInt();
   return defaultAddressSpace;
 }
 
+unsigned ConvertFIRToLLVMPattern::getGlobalAddressSpace(
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
+  assert(parentOp != nullptr &&
+         "expected insertion block to have parent operation");
+  auto module = mlir::isa<mlir::ModuleOp>(parentOp)
+                    ? mlir::cast<mlir::ModuleOp>(parentOp)
+                    : parentOp->getParentOfType<mlir::ModuleOp>();
+  if (module)
+    if (mlir::Attribute addrSpace =
+            mlir::DataLayout(module).getGlobalMemorySpace())
+      return llvm::cast<mlir::IntegerAttr>(addrSpace).getUInt();
+  return defaultAddressSpace;
+}
+
 } // namespace fir
diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index 4c9f965e1241a..a5d741d7f566a 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -3,8 +3,8 @@
 // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=i386-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC
 // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=powerpc64le-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC
 // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=x86_64-pc-win32" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC
-// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=aarch64-apple-darwin" %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-COMDAT,GENERIC
-// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=amdgcn-amd-amdhsa, datalayout=e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-P0" %s | FileCheck -check-prefixes=CHECK,AMDGPU %s
+// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=aarch64-apple-darwin" %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-COMDAT,GENERIC
+// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=amdgcn-amd-amdhsa, datalayout=e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" %s | FileCheck -check-prefixes=CHECK,AMDGPU %s
 
 //===================================================
 // SUMMARY: Tests for FIR --> LLVM MLIR conversion
@@ -17,7 +17,10 @@ fir.global @g_i0 : i32 {
   fir.has_value %1 : i32
 }
 
-// CHECK: llvm.mlir.global external @g_i0() {addr_space = 0 : i32} : i32 {
+// CHECK: llvm.mlir.global external @g_i0()
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: i32 {
 // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK: llvm.return %[[C0]] : i32
 // CHECK: }
@@ -29,7 +32,10 @@ fir.global @g_ci5 constant : i32 {
   fir.has_value %c : i32
 }
 
-// CHECK: llvm.mlir.global external constant @g_ci5() {addr_space = 0 : i32} : i32 {
+// CHECK: llvm.mlir.global external constant @g_ci5()
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: i32 {
 // CHECK: %[[C5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK: llvm.return %[[C5]] : i32
 // CHECK: }
@@ -37,17 +43,26 @@ fir.global @g_ci5 constant : i32 {
 // -----
 
 fir.global internal @i_i515 (515:i32) : i32
-// CHECK: llvm.mlir.global internal @i_i515(515 : i32) {addr_space = 0 : i32} : i32
+// CHECK: llvm.mlir.global internal @i_i515(515 : i32)
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: : i32
 
 // -----
 
 fir.global common @C_i511 (0:i32) : i32
-// CHECK: llvm.mlir.global common @C_i511(0 : i32) {addr_space = 0 : i32} : i32
+// CHECK: llvm.mlir.global common @C_i511(0 : i32)
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: : i32
 
 // -----
 
 fir.global weak @w_i86 (86:i32) : i32
-// CHECK: llvm.mlir.global weak @w_i86(86 : i32) {addr_space = 0 : i32} : i32
+// CHECK: llvm.mlir.global weak @w_i86(86 : i32)
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: : i32
 
 // -----
 
@@ -69,9 +84,13 @@ fir.global @symbol : i64 {
   %0 = llvm.mlir.constant(1 : i64) : i64
   fir.has_value %0 : i64
 }
-// CHECK: %{{.*}} = llvm.mlir.addressof @[[SYMBOL:.*]] : !llvm.ptr
+// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @[[SYMBOL:.*]] : !llvm.ptr
+// AMDGPU: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<1> to !llvm.ptr
 
-// CHECK: llvm.mlir.global external @[[SYMBOL]]() {addr_space = 0 : i32} : i64 {
+// CHECK: llvm.mlir.global external @[[SYMBOL]]()
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: i64 {
 // CHECK: %{{.*}} = llvm.mlir.constant(1 : i64) : i64
 // CHECK: llvm.return %{{.*}} : i64
 // CHECK: }
@@ -88,7 +107,10 @@ fir.global internal @_QEmultiarray : !fir.array<32x32xi32> {
   fir.has_value %2 : !fir.array<32x32xi32>
 }
 
-// CHECK: llvm.mlir.global internal @_QEmultiarray() {addr_space = 0 : i32} : !llvm.array<32 x array<32 x i32>> {
+// CHECK: llvm.mlir.global internal @_QEmultiarray()
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: : !llvm.array<32 x array<32 x i32>> {
 // CHECK: %[[CST:.*]] = llvm.mlir.constant(dense<1> : vector<32x32xi32>) : !llvm.array<32 x array<32 x i32>>
 // CHECK: llvm.return %[[CST]] : !llvm.array<32 x array<32 x i32>>
 // CHECK: }
@@ -105,7 +127,10 @@ fir.global internal @_QEmultiarray : !fir.array<32xi32> {
   fir.has_value %2 : !fir.array<32xi32>
 }
 
-// CHECK: llvm.mlir.global internal @_QEmultiarray() {addr_space = 0 : i32} : !llvm.array<32 x i32> {
+// CHECK: llvm.mlir.global internal @_QEmultiarray()
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: : !llvm.array<32 x i32> {
 // CHECK: %[[CST:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK: %{{.*}} = llvm.mlir.undef : !llvm.array<32 x i32>
 // CHECK: %{{.*}} = llvm.insertvalue %[[CST]], %{{.*}}[5] : !llvm.array<32 x i32>
@@ -1787,7 +1812,9 @@ func.func @embox1(%arg0: !fir.ref<!fir.type<_QMtest_dinitTtseq{i:i32}>>) {
 // CHECK: %{{.*}} = llvm.insertvalue %[[VERSION]], %{{.*}}[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)>
 // CHECK: %[[TYPE_CODE_I8:.*]] = llvm.trunc %[[TYPE_CODE]] : i32 to i8
 // CHECK: %{{.*}} = llvm.insertvalue %[[TYPE_CODE_I8]], %{{.*}}[4] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
-// CHECK: %[[TDESC:.*]] = llvm.mlir.addressof @_QMtest_dinitE.dt.tseq : !llvm.ptr
+// GENERIC: %[[TDESC:.*]] = llvm.mlir.addressof @_QMtest_dinitE.dt.tseq : !llvm.ptr
+// AMDGPU: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMtest_dinitE.dt.tseq : !llvm.ptr<1>
+// AMDGPU: %[[TDESC:.*]] = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<1> to !llvm.ptr
 // CHECK: %{{.*}} = llvm.insertvalue %[[TDESC]], %{{.*}}[7] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
 
 // -----
 
@@ -2775,7 +2802,10 @@ func.func @coordinate_array_unknown_size_1d(%arg0: !fir.ptr<!fir.array<?xi32>>
 
 fir.global common @c_(dense<0> : vector<4294967296xi8>) : !fir.array<4294967296xi8>
 
-// CHECK: llvm.mlir.global common @c_(dense<0> : vector<4294967296xi8>) {addr_space = 0 : i32} : !llvm.array<4294967296 x i8>
+// CHECK: llvm.mlir.global common @c_(dense<0> : vector<4294967296xi8>)
+// GENERIC-SAME: {addr_space = 0 : i32}
+// AMDGPU-SAME: {addr_space = 1 : i32}
+// CHECK-SAME: !llvm.array<4294967296 x i8>
 
 // -----
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3d461f0ad4228..ae31598ea2c7b 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6743,6 +6743,12 @@ FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
   return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
 }
 
+static Value *removeASCastIfPresent(Value *V) {
+  if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
+    return cast<Operator>(V)->getOperand(0);
+  return V;
+}
+
 static Expected<Function *> createOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
@@ -6926,9 +6932,9 @@ static Expected<Function *> createOutlinedFunction(
     // preceding mapped arguments that refer to the same global that may be
    // seperate segments. To prevent this, we defer global processing until all
    // other processing has been performed.
-    if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
-        llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
-        llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
+    if (llvm::isa<llvm::GlobalValue>(removeASCastIfPresent(Input)) ||
+        llvm::isa<llvm::GlobalObject>(removeASCastIfPresent(Input)) ||
+        llvm::isa<llvm::GlobalVariable>(removeASCastIfPresent(Input))) {
      DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
      continue;
    }
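
Note (not part of the patch): a minimal sketch of the lowering this change aims at, based on the test expectations above. The global @g and function @use are made-up names for illustration. On an amdgcn target, a FIR global and its fir.address_of use

    fir.global internal @g (0:i32) : i32
    func.func @use() -> i32 {
      %0 = fir.address_of(@g) : !fir.ref<i32>
      %1 = fir.load %0 : !fir.ref<i32>
      return %1 : i32
    }

would now convert roughly to an LLVM-dialect global placed in the global address space, with an addrspacecast back into the program address space at the point of use:

    llvm.mlir.global internal @g(0 : i32) {addr_space = 1 : i32} : i32
    llvm.func @use() -> i32 {
      %0 = llvm.mlir.addressof @g : !llvm.ptr<1>
      %1 = llvm.addrspacecast %0 : !llvm.ptr<1> to !llvm.ptr
      %2 = llvm.load %1 : !llvm.ptr -> i32
      llvm.return %2 : i32
    }

On targets where the global and program address spaces coincide (the GENERIC check prefix), no addrspacecast is emitted and globals keep addr_space = 0.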