-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[Clang][OpenCL][AMDGPU] OpenCL Kernel stubs should be assigned alwaysinline attribute #137769
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
41ea0f3
to
ad9e9dc
Compare
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-amdgpu Author: Aniket Lal (lalaniket8) ChangesOpenCL Kernels body is emitted as stubs and the kernel is emitted as call to respective stub. (#115821). Patch is 239.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137769.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e917f3c42da06..384c4f3627212 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6174,6 +6174,14 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
setNonAliasAttributes(GD, Fn);
+
+ if (D->hasAttr<OpenCLKernelAttr>())
+ if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
+ !Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
+ !Fn->hasFnAttribute(llvm::Attribute::InlineHint) &&
+ !Fn->hasFnAttribute(llvm::Attribute::OptimizeNone))
+ Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+
SetLLVMFunctionAttributesForDefinition(D, Fn);
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 49604c6c5e61b..58c358672dd0e 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -128,18 +128,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -152,7 +163,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
-// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
// X86-NEXT: ret void
//
@@ -168,18 +179,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker_large(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker_large(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -192,7 +214,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
-// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
// X86-NEXT: ret void
//
@@ -227,16 +249,18 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]]
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_test_indirect_arg_local(
-// X86-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -244,61 +268,74 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-SAME: ) #[[ATTR0]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 8, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMemberSpir(
-// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 800, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -332,32 +369,40 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @KernelTwoMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 16, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false)
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
-// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL...
[truncated]
|
@llvm/pr-subscribers-clang-codegen Author: Aniket Lal (lalaniket8) ChangesOpenCL Kernels body is emitted as stubs and the kernel is emitted as call to respective stub. (#115821). Patch is 239.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137769.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e917f3c42da06..384c4f3627212 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6174,6 +6174,14 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
setNonAliasAttributes(GD, Fn);
+
+ if (D->hasAttr<OpenCLKernelAttr>())
+ if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
+ !Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
+ !Fn->hasFnAttribute(llvm::Attribute::InlineHint) &&
+ !Fn->hasFnAttribute(llvm::Attribute::OptimizeNone))
+ Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+
SetLLVMFunctionAttributesForDefinition(D, Fn);
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 49604c6c5e61b..58c358672dd0e 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -128,18 +128,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -152,7 +163,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
-// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
// X86-NEXT: ret void
//
@@ -168,18 +179,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker_large(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker_large(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -192,7 +214,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
-// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
// X86-NEXT: ret void
//
@@ -227,16 +249,18 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]]
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_test_indirect_arg_local(
-// X86-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -244,61 +268,74 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-SAME: ) #[[ATTR0]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 8, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMemberSpir(
-// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 800, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -332,32 +369,40 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @KernelTwoMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 16, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false)
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
-// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL...
[truncated]
|
ad9e9dc
to
faed4e3
Compare
In practice this should be a single use of an internal function and should not require this hint. Is this papering over a different issue? |
Why do you think it is an internal function? I thought it's an externally callable stub. |
faed4e3
to
8fd3b0c
Compare
Agree. OpenCL allows calling an external kernel function https://godbolt.org/z/ThPKfxKqo so the stub function should have external linkage. |
Yes at construction, but in the real optimization pipeline it will be internalized |
OpenCL Kernels body is emitted as stubs and the kernel is emitted as call to respective stub. (#115821).
The stub function should be alwaysinlined, since call to stub can cause performance drop.