diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 28909cef4748d..ac814defbf071 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2481,7 +2481,7 @@ class OpenMPIRBuilder { TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector &Dependencies, - bool HasNoWait); + TargetDataRTArgs &RTArgs, bool HasNoWait); /// Emit the arguments to be passed to the runtime library based on the /// arrays of base pointers, pointers, sizes, map types, and mappers. If diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 2e5ce5308eea5..9bb9dd65594bf 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6603,7 +6603,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( /*TargetTaskAllocaIP=*/{})); else cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP, - /*Dependencies=*/{}, Info.HasNoWait)); + /*Dependencies=*/{}, RTArgs, Info.HasNoWait)); } else { Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr( omp::OMPRTL___tgt_target_data_begin_mapper); @@ -7051,9 +7051,25 @@ static Expected createOutlinedFunction( /// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) /// This function is called from emitTargetTask once the /// code to launch the target kernel has been outlined already. 
-static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, - IRBuilderBase &Builder, - CallInst *StaleCI) { +static Function *emitTargetTaskProxyFunction( + OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, + StructType *PrivatesTy, StructType *TaskWithPrivatesTy, + const size_t NumOffloadingArrays, const int SharedArgsOperandNo) { + + // NumOffloadingArrays is the number of offloading arrays that we need to copy + // into the task structure so that the deferred target task can access this + // data even after the stack frame of the generating task has been rolled + // back. Offloading arrays contain base pointers, pointers, sizes etc + // of the data that the target kernel will access. In other words, the + // arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs + // The number of arrays and the size of each array depends on the specifics of + // the target call. These arrays are copied into a struct whose type is + // PrivatesTy. So, if NumOffloadingArrays is non-zero, PrivatesTy better + // not be nullptr + assert((!NumOffloadingArrays || PrivatesTy) && + "PrivatesTy cannot be nullptr when there are offloadingArrays " + "to privatize"); + Module &M = OMPBuilder.M; // KernelLaunchFunction is the target launch function, i.e.
// the function that sets up kernel arguments and calls @@ -7080,10 +7096,13 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, // call void @_QQmain..omp_par.1(i32 %global.tid.val6) OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(), StaleCI->getIterator()); + LLVMContext &Ctx = StaleCI->getParent()->getContext(); + Type *ThreadIDTy = Type::getInt32Ty(Ctx); Type *TaskPtrTy = OMPBuilder.TaskPtr; Type *TaskTy = OMPBuilder.Task; + auto ProxyFnTy = FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy}, /* isVarArg */ false); @@ -7093,21 +7112,33 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, ProxyFn->getArg(0)->setName("thread.id"); ProxyFn->getArg(1)->setName("task"); + bool HasShareds = SharedArgsOperandNo > 0; + bool HasOffloadingArrays = NumOffloadingArrays > 0; BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", ProxyFn); Builder.SetInsertPoint(EntryBB); - bool HasShareds = StaleCI->arg_size() > 1; - // TODO: This is a temporary assert to prove to ourselves that - // the outlined target launch function is always going to have - // atmost two arguments if there is any data shared between - // host and device. 
- assert((!HasShareds || (StaleCI->arg_size() == 2)) && - "StaleCI with shareds should have exactly two arguments."); - Value *ThreadId = ProxyFn->getArg(0); + Value *TaskWithPrivates = ProxyFn->getArg(1); + + SmallVector KernelLaunchArgs; + KernelLaunchArgs.reserve(StaleCI->arg_size()); + KernelLaunchArgs.push_back(ThreadId); + + if (HasOffloadingArrays) { + assert(TaskTy != TaskWithPrivatesTy && + "If there are offloading arrays to pass to the target, " + "TaskTy cannot be the same as TaskWithPrivatesTy"); + Value *Privates = + Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1); + for (unsigned int i = 0; i < NumOffloadingArrays; ++i) + KernelLaunchArgs.push_back( + Builder.CreateStructGEP(PrivatesTy, Privates, i)); + } + if (HasShareds) { - auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); + auto *ArgStructAlloca = + dyn_cast(StaleCI->getArgOperand(SharedArgsOperandNo)); assert(ArgStructAlloca && "Unable to find the alloca instruction corresponding to arguments " "for extracted function"); @@ -7115,27 +7146,76 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, AllocaInst *NewArgStructAlloca = Builder.CreateAlloca(ArgStructType, nullptr, "structArg"); - Value *TaskT = ProxyFn->getArg(1); + Value *SharedsSize = Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); - Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0); + Value *TaskT = + Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0); + Value *Shareds = TaskT; + // TaskWithPrivatesTy can be + // %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t, + // %struct.privates } + // OR + // %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy + // In the former case, that is when TaskWithPrivatesTy is not the same as + // TaskTy, then its first member has to be the task descriptor. TaskTy is + // the type of the task descriptor. TaskT is the pointer to the task + // descriptor.
Loading the first member of TaskT, gives us the pointer to + // shared data. + if (TaskWithPrivatesTy != TaskTy) + Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0); LoadInst *LoadShared = Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds); Builder.CreateMemCpy( NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared, LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize); - - Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca}); - } else { - Builder.CreateCall(KernelLaunchFunction, {ThreadId}); + KernelLaunchArgs.push_back(NewArgStructAlloca); } - + Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs); Builder.CreateRetVoid(); return ProxyFn; } +// This function returns a struct that has at most two members. +// The first member is always %struct.kmp_task_ompbuilder_t, that is the task +// descriptor. The second member, if needed, is a struct containing arrays +// that need to be passed to the offloaded target kernel. For example, +// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to +// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64] +// respectively, then the types created by this function are +// +// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] } +// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t, +// %struct.privates } +// %struct.task_with_privates is returned by this function. +// If there aren't any offloading arrays to pass to the target kernel, +// %struct.kmp_task_ompbuilder_t is returned. +static StructType * +createTaskWithPrivatesTy(Type *Task, + ArrayRef OffloadingArraysToPrivatize) { + + if (OffloadingArraysToPrivatize.empty()) + return static_cast(Task); + + SmallVector StructFieldTypes; + for (auto &V : OffloadingArraysToPrivatize) { + assert(V->getType()->isPointerTy() && + "Expected pointer to array to privatize. 
Got a non-pointer value " + "instead"); + if (auto *GEP = dyn_cast(V)) + StructFieldTypes.push_back(GEP->getSourceElementType()); + else if (auto *Alloca = dyn_cast(V)) + StructFieldTypes.push_back(Alloca->getAllocatedType()); + else + llvm_unreachable("Unhandled Instruction type"); + } + StructType *PrivatesStructTy = + StructType::create(StructFieldTypes, "struct.privates"); + return StructType::create({Task, PrivatesStructTy}, + "struct.task_with_privates"); +} static Error emitTargetOutlinedFunction( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, @@ -7161,7 +7241,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector &Dependencies, - bool HasNoWait) { + TargetDataRTArgs &RTArgs, bool HasNoWait) { // The following explains the code-gen scenario for the `target` directive. A // similar scneario is followed for other device-related directives (e.g. @@ -7171,27 +7251,30 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // When we arrive at this function, the target region itself has been // outlined into the function OutlinedFn. // So at ths point, for - // -------------------------------------------------- + // -------------------------------------------------------------- // void user_code_that_offloads(...) { - // omp target depend(..) map(from:a) map(to:b, c) - // a = b + c + // omp target depend(..) map(from:a) map(to:b) private(i) + // do i = 1, 10 + // a(i) = b(i) + n // } // - // -------------------------------------------------- + // -------------------------------------------------------------- // // we have // - // -------------------------------------------------- + // -------------------------------------------------------------- // // void user_code_that_offloads(...) 
{ - // %.offload_baseptrs = alloca [3 x ptr], align 8 - // %.offload_ptrs = alloca [3 x ptr], align 8 - // %.offload_mappers = alloca [3 x ptr], align 8 + // %.offload_baseptrs = alloca [2 x ptr], align 8 + // %.offload_ptrs = alloca [2 x ptr], align 8 + // %.offload_mappers = alloca [2 x ptr], align 8 // ;; target region has been outlined and now we need to // ;; offload to it via a target task. // } - // void outlined_device_function(ptr a, ptr b, ptr c) { - // *a = *b + *c + // void outlined_device_function(ptr a, ptr b, ptr n) { + // n = *n_ptr; + // do i = 1, 10 + // a(i) = b(i) + n // } // // We have to now do the following @@ -7204,33 +7287,58 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // (iii) Create a task with the task entry point created in (ii) // // That is we create the following - // + // struct task_with_privates { + // struct kmp_task_ompbuilder_t; + // struct privates { + // [2 x ptr], ; baseptrs + // [2 x ptr], ; ptrs + // [2 x i64] ; sizes + // } + // } // void user_code_that_offloads(...) { - // %.offload_baseptrs = alloca [3 x ptr], align 8 - // %.offload_ptrs = alloca [3 x ptr], align 8 - // %.offload_mappers = alloca [3 x ptr], align 8 + // %.offload_baseptrs = alloca [2 x ptr], align 8 + // %.offload_ptrs = alloca [2 x ptr], align 8 + // %.offload_sizes = alloca [2 x i64], align 8 // // %structArg = alloca { ptr, ptr, ptr }, align 8 - // %strucArg[0] = %.offload_baseptrs - // %strucArg[1] = %.offload_ptrs - // %strucArg[2] = %.offload_mappers - // proxy_target_task = @__kmpc_omp_task_alloc(..., - // @.omp_target_task_proxy_func) - // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg)) + // %structArg[0] = a + // %structArg[1] = b + // %structArg[2] = &n + // + // target_task_with_privates = @__kmpc_omp_target_task_alloc(..., + // sizeof(task_with_privates), + // sizeof(structArg), + // @.omp_target_task_proxy_func, + // ...)
+ // memcpy(target_task->shareds, %structArg, sizeof(structArg)) + // memcpy(target_task->privates->baseptrs, + // offload_baseptrs, sizeof(offload_baseptrs)) + // memcpy(target_task->privates->ptrs, + // offload_ptrs, sizeof(offload_ptrs)) + // memcpy(target_task->privates->sizes, + // offload_sizes, sizeof(offload_sizes)) // dependencies_array = ... // ;; if nowait not present // call @__kmpc_omp_wait_deps(..., dependencies_array) // call @__kmpc_omp_task_begin_if0(...) // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr - // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...) + // %target_task_with_privates) + // call @__kmpc_omp_task_complete_if0(...) // } // // define internal void @.omp_target_task_proxy_func(i32 %thread.id, // ptr %task) { // %structArg = alloca {ptr, ptr, ptr} - // %shared_data = load (getelementptr %task, 0, 0) - // mempcy(%structArg, %shared_data, sizeof(structArg)) - // kernel_launch_function(%thread.id, %structArg) + // %task_ptr = getelementptr(%task, 0, 0) + // %shared_data = load (getelementptr %task_ptr, 0, 0) + // memcpy(%structArg, %shared_data, sizeof(%structArg)) + // + // %offloading_arrays = getelementptr(%task, 0, 1) + // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0) + // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1) + // %offload_sizes = getelementptr(%offloading_arrays, 0, 2) + // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs, + // %offload_sizes, %structArg) // } // // We need the proxy function because the signature of the task entry point @@ -7238,21 +7346,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // that of the kernel_launch function. // // kernel_launch_function is generated by emitKernelLaunch and has the - // always_inline attribute.
For this example, it'll look like so + // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs, + // %offload_sizes, %structArg) alwaysinline { // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 - // offload_baseptrs = load(getelementptr structArg, 0, 0) - // offload_ptrs = load(getelementptr structArg, 0, 1) - // offload_mappers = load(getelementptr structArg, 0, 2) + // ; load aggregated data from %structArg // ; setup kernel_args using offload_baseptrs, offload_ptrs and - // ; offload_mappers + // ; offload_sizes // call i32 @__tgt_target_kernel(..., // outlined_device_function, // ptr %kernel_args) // } - // void outlined_device_function(ptr a, ptr b, ptr c) { - // *a = *b + *c + // void outlined_device_function(ptr a, ptr b, ptr n) { + // n = *n_ptr; + // do i = 1, 10 + // a(i) = b(i) + n // } // BasicBlock *TargetTaskBodyBB = @@ -7273,6 +7381,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); + // Generate the task body which will subsequently be outlined. 
Builder.restoreIP(TargetTaskBodyIP); if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP)) return Err; @@ -7291,15 +7400,56 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), /*IsFinished=*/true); - OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait, - DeviceID](Function &OutlinedFn) mutable { + SmallVector OffloadingArraysToPrivatize; + if (DeviceID && HasNoWait) { + for (auto *V : + {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray, + RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd, + RTArgs.SizesArray}) { + if (V && !isa(V) && !isa(V)) { + OffloadingArraysToPrivatize.push_back(V); + OI.ExcludeArgsFromAggregate.push_back(V); + } + } + } + OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait, DeviceID, + OffloadingArraysToPrivatize]( + Function &OutlinedFn) mutable { assert(OutlinedFn.getNumUses() == 1 && "there must be a single user for the outlined function"); CallInst *StaleCI = cast(OutlinedFn.user_back()); - bool HasShareds = StaleCI->arg_size() > 1; - Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI); + // The first argument of StaleCI is always the thread id. + // The next few arguments are the pointers to offloading arrays + // if any. (See OffloadingArraysToPrivatize) + // Finally, all other local values that are live-in into the outlined region + // end up in a structure whose pointer is passed as the last argument. This + // piece of data is passed in the "shared" field of the task structure. So, + // we know we have to pass shareds to the task if the number of arguments is + // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the + // thread id. 
Further, for safety, we assert that the number of arguments of + // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2 + const unsigned int NumStaleCIArgs = StaleCI->arg_size(); + bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1; + assert( + (!HasShareds || + NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) && + "Wrong number of arguments for StaleCI when shareds are present"); + int SharedArgOperandNo = + HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0; + + StructType *TaskWithPrivatesTy = + createTaskWithPrivatesTy(Task, OffloadingArraysToPrivatize); + StructType *PrivatesTy = nullptr; + + if (OffloadingArraysToPrivatize.size()) + PrivatesTy = + static_cast(TaskWithPrivatesTy->getElementType(1)); + + Function *ProxyFn = emitTargetTaskProxyFunction( + *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy, + OffloadingArraysToPrivatize.size(), SharedArgOperandNo); LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn << "\n"); @@ -7330,17 +7480,19 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // Argument - `sizeof_kmp_task_t` (TaskSize) // Tasksize refers to the size in bytes of kmp_task_t data structure - // including private vars accessed in task. - // TODO: add kmp_task_t_with_privates (privates) - Value *TaskSize = - Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task)); + // plus any other data to be passed to the target task, if any, which + // is packed into a struct. kmp_task_t and the struct so created are + // packed into a wrapper struct whose type is TaskWithPrivatesTy + Value *TaskSize = Builder.getInt64( + M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy)); // Argument - `sizeof_shareds` (SharedsSize) // SharedsSize refers to the shareds array size in the kmp_task_t data // structure.
Value *SharedsSize = Builder.getInt64(0); if (HasShareds) { - auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); + auto *ArgStructAlloca = + dyn_cast(StaleCI->getArgOperand(SharedArgOperandNo)); assert(ArgStructAlloca && "Unable to find the alloca instruction corresponding to arguments " "for extracted function"); @@ -7378,13 +7530,43 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs); + Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); if (HasShareds) { - Value *Shareds = StaleCI->getArgOperand(1); - Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); - Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); + Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo); + Value *TaskT = Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 0); + Value *TaskSharedsPtr = TaskT; + if (TaskWithPrivatesTy != Task) { + TaskSharedsPtr = Builder.CreateStructGEP(Task, TaskT, 0); + } + Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskSharedsPtr); + Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, SharedsSize); } + if (OffloadingArraysToPrivatize.size()) { + Value *Privates = + Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1); + for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) { + Value *PtrToPrivatize = OffloadingArraysToPrivatize[i]; + Type *ArrayType = nullptr; + if (auto *GEP = dyn_cast(PtrToPrivatize)) + ArrayType = GEP->getSourceElementType(); + else if (auto *Alloca = dyn_cast(PtrToPrivatize)) + ArrayType = Alloca->getAllocatedType(); + else + llvm_unreachable("Unhandled Instruction type"); + assert(ArrayType && "ArrayType cannot be nullptr"); + + Type *ElementType = PrivatesTy->getElementType(i); + assert(ElementType == ArrayType && + "ElementType should match ArrayType"); + + Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i); + Builder.CreateMemCpy( + Dst, Alignment, 
PtrToPrivatize, Alignment, + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType))); + } + } Value *DepArray = emitTaskDependencies(*this, Dependencies); @@ -7530,9 +7712,11 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, // Arguments that are intended to be directly forwarded to an // emitKernelLaunch call are pased as nullptr, since // OutlinedFnID=nullptr results in that call not being done. + // OpenMPIRBuilder::TargetDataInfo Info; + OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs; return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr, /*RTLoc=*/nullptr, AllocaIP, - Dependencies, HasNoWait); + Dependencies, EmptyRTArgs, HasNoWait); } return EmitTargetCallFallbackCB(Builder.saveIP()); }()); @@ -7544,6 +7728,7 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, auto &&EmitTargetCallThen = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + Info.HasNoWait = HasNoWait; OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); OpenMPIRBuilder::TargetDataRTArgs RTArgs; if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs( @@ -7621,7 +7806,8 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, // explicit generation of the target task. 
if (RequiresOuterTargetTask) return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, - Dependencies, HasNoWait); + Dependencies, KArgs.RTArgs, + Info.HasNoWait); return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir index f2948c6510138..0f2437639319a 100644 --- a/mlir/test/Target/LLVMIR/omptarget-depend.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir @@ -126,7 +126,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // CHECK-DAG: %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 // CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func) -// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8 +// CHECK: %[[SHARED_PTR:.+]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASKDATA]], i32 0, i32 0 +// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[SHARED_PTR]], align 8 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false) // CHECK: %[[DEP_INFO:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0 diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir index 8124d02ef2174..dba8c553aaca5 100644 --- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir @@ -14,25 +14,20 @@ llvm.func @_QPopenmp_target_data_enter() { // CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc // CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK-SAME: @[[TASK_PROXY_FUNC_ENTER:.*]], i64 {{.*}}) // CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) { -// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0 -// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8 -// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1 -// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8 - +// CHECK: define internal void @[[TASK_BODY_FUNC_ENTER:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) { // CHECK: call void @__tgt_target_data_begin_nowait_mapper( // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1, -// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]], +// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]], // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null) // CHECK: } -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) +// CHECK: define internal void @[[TASK_PROXY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}) { +// CHECK: call void @[[TASK_BODY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) // CHECK: } // ----- @@ -51,25 +46,20 @@ llvm.func @_QPopenmp_target_data_update() { // CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc // CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK-SAME: @[[TASK_PROXY_FUNC_UPDATE:.*]], i64 {{.*}}) // CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } -// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) { -// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0 -// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8 -// CHECK: 
%[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1 -// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8 - +// CHECK: define internal void @[[TASK_BODY_FUNC_UPDATE:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) { // CHECK: call void @__tgt_target_data_update_nowait_mapper( // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1, -// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]], +// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]], // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null) // CHECK: } -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) +// CHECK: define internal void @[[TASK_PROXY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) { +// CHECK: call void @[[TASK_BODY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) // CHECK: } // ----- @@ -88,23 +78,18 @@ llvm.func @_QPopenmp_target_data_exit() { // CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc // CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK-SAME: @[[TASK_PROXY_FUNC_EXIT:.*]], i64 {{.*}}) // CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } -// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) { -// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0 -// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8 -// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1 -// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8 - +// CHECK: define internal void @[[TASK_BODY_FUNC_EXIT:.*]](i32 %{{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) { // CHECK: call 
void @__tgt_target_data_end_nowait_mapper( // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1, -// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]], +// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]], // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null) // CHECK: } -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) +// CHECK: define internal void @[[TASK_PROXY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) { +// CHECK: call void @[[TASK_BODY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) // CHECK: }