cuda::MappedScop: introduce maxPrivateElements mapping option

ftynse · ftynse · commit 22b36e1e6c59 · 2018-07-26T12:13:27.000+02:00
This mapping option controls the maximum number of elements per thread
that are promoted into the private memory (hopefully, registers, but we
cannot guarantee this at the CUDA level).  The value is optional in the
protocol buffers.  When not provided, query the maximum number of
registers per block from CUDA device properties and divide it by the
number of threads in the block to obtain the per-thread limitation.
Note that using all registers in a single block will likely limit the
occupancy of SMs, potentially degrading performance.  Introducing the
limiting factor is primarily motivated by this effect, and it lets the
caller require the mapper to use fewer registers, potentially
increasing the occupancy.  Since register allocation is performed by the
downstream compiler, this option is a mere recommendation and is
expressed in terms of (untyped) elements rather than actual registers.
At the CUDA level, it is impossible to account for the registers
required by the main computation (that is, those necessary to hold the
data loaded from memory during operations), which also contribute to
the register pressure of the kernel.

Although limiting the number of promoted elements to the number of
registers available per thread may seem too constraining for occupancy,
it is strictly better than the current approach where we may promote
even more elements, which then get spilled into the slow local memory.
diff --git a/tc/core/cuda/cuda_mapping_options.cc b/tc/core/cuda/cuda_mapping_options.cc
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
   return *this;
 }
 
+CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
+  ownedProto_.set_max_private_elements(nElements);
+  return *this;
+}
+
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::string& commaSeparatedSizes) {
   auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
diff --git a/tc/core/cuda/cuda_mapping_options.h b/tc/core/cuda/cuda_mapping_options.h
@@ -197,6 +197,7 @@ class CudaMappingOptions {
   CudaMappingOptions& useReadOnlyCache(bool b);
   CudaMappingOptions& privateDepth(uint32_t depth);
   CudaMappingOptions& sharedDepth(uint32_t depth);
+  CudaMappingOptions& maxPrivateElements(uint64_t nElements);
   ///@}
 
   /// Static constructors for predefined strategies.
diff --git a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
   }
   prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
   prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
+  if (cudaOptions.proto().has_max_private_elements()) {
+    prn.printValueOption(
+        "maxPrivateElements", cudaOptions.proto().max_private_elements());
+  }
   prn.endStmt();
   return prn;
 }
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
 
   // 9. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
+    auto blockSizes = cudaOptions.block.extractVector();
+    auto nThreadsPerBlock = std::accumulate(
+        blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
+    auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
+        ? cudaOptions.proto().max_private_elements()
+        : queryRegistersPerBlock() / nThreadsPerBlock;
+    promoteToRegistersAtDepth(
+        *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
   }
 
   LOG_IF(INFO, FLAGS_debug_tc_mapper)
diff --git a/tc/proto/mapping_options.proto b/tc/proto/mapping_options.proto
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
   optional uint32 private_depth = 9;
   // Depth of promotion to shared memory, ignored if use_shared_memory is false.
   optional uint32 shared_depth = 10;
+  // Maximum number of elements to promote to registers per thread.  If not
+  // provided, the number of 32-bit registers per thread will be used.
+  optional uint64 max_private_elements = 11;
 }
 
 message CpuMappingOptionsProto {
diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
           "usePrivateMemory",
           &tc::CudaMappingOptions::usePrivateMemory,
           "Create thread-local copies of data in private memory")
+      .def(
+          "maxPrivateElements",
+          &tc::CudaMappingOptions::maxPrivateElements,
+          "The maximum number of elements per thread for which thread-local "
+          "copies are created")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(`
`40`	`40`	`}`
`41`	`41`	`prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());`
`42`	`42`	`prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());`
	`43`	`+ if (cudaOptions.proto().has_max_private_elements()) {`
	`44`	`+ prn.printValueOption(`
	`45`	`+ "maxPrivateElements", cudaOptions.proto().max_private_elements());`
	`46`	`+ }`
`43`	`47`	`prn.endStmt();`
`44`	`48`	`return prn;`
`45`	`49`	`}`
Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {`
`74`	`74`	`optional uint32 private_depth = 9;`
`75`	`75`	`// Depth of promotion to shared memory, ignored if use_shared_memory is false.`
`76`	`76`	`optional uint32 shared_depth = 10;`
	`77`	`+ // Maximum number of elements to promote to registers per thread. If not`
	`78`	`+ // provided, the number of 32-bit registers per thread will be used.`
	`79`	`+ optional uint64 max_private_elements = 11;`
`77`	`80`	`}`
`78`	`81`
`79`	`82`	`message CpuMappingOptionsProto {`