[Cuda Codegen] Emit launch bounds

Theodoros Theodoridis · Theodoros Theodoridis · commit f6a78dcc1d2e · 2018-06-20T09:11:05.000+02:00
Cuda functions can be annotated with launch bounds, that is the maximum
number of threads per block (the minimum blocks per multiprocessor can
also be specified). This information is used by nvrtc/nvcc during
register allocation (and probably other phases as well).
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
@@ -153,9 +153,17 @@ void emitArgs(stringstream& ss, const Scop& scop) {
 void emitKernelSignature(
     stringstream& ss,
     const std::string& specializedName,
-    const Scop& scop) {
+    const Scop& scop,
+    const Block& block) {
   TC_CHECK_NE(specializedName, "") << "name not provided";
-  ss << "__global__ void " << specializedName << "(";
+  auto b0 = block.view[0];
+  b0 = b0 == 0 ? 1 : b0;
+  auto b1 = block.view[1];
+  b1 = b1 == 0 ? 1 : b1;
+  auto b2 = block.view[2];
+  b1 = b2 == 0 ? 1 : b2;
+  ss << "__global__ __launch_bounds__(" << b0 * b1 * b2 << ") void "
+     << specializedName << "(";
   emitArgs(ss, scop);
   ss << ") {" << endl;
 }
@@ -753,7 +761,7 @@ string emitCudaKernel(
   }
 
   stringstream ss;
-  emitKernelSignature(ss, specializedName, scop);
+  emitKernelSignature(ss, specializedName, scop, mscop.numThreads);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
   emitTensorViews(ss, scop.halide.inputs, paramValues);
diff --git a/test/test_cuda_mapper.cc b/test/test_cuda_mapper.cc
@@ -451,7 +451,7 @@ def fun(float(N, N) A) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected(
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA) {
+      R"RES(__global__ __launch_bounds__(1) void kernel_anon(int32 N, float32* pO, const float32* pA) {
   int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
   int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
   float32 (*O)[N] = reinterpret_cast<float32 (*)[N]>(pO);
@@ -480,7 +480,7 @@ def fun(float(N, N) A, float(N, N) B, float(N) C) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected =
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) {
+      R"RES(__global__ __launch_bounds__(1) void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) {
   int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
   int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
   float32 (*O)[512] = reinterpret_cast<float32 (*)[512]>(pO);