Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 22b36e1

Browse files
committed
cuda::MappedScop: introduce maxPrivateElements mapping option
This mapping option controls the maximum number of elements per thread that are promoted into private memory (hopefully registers, but we cannot guarantee this at the CUDA level). The value is optional in the protocol buffers. When it is not provided, query the maximum number of registers per block from the CUDA device properties and divide it by the number of threads in the block to obtain the per-thread limit. Note that using all registers in a single block will likely limit the occupancy of SMs, potentially degrading performance. Introducing the limiting factor is primarily motivated by this effect: it lets the caller require the mapper to use fewer registers, potentially increasing the occupancy. Since register allocation is performed by the downstream compiler, this option is a mere recommendation and is expressed in terms of (untyped) elements rather than actual registers. It would be impossible to account, at the CUDA level, for all registers required by the main computation (that is, those necessary to store the data loaded from memory during operations), which also contribute to the register pressure of the kernel. Although limiting the number of promoted elements to the number of registers available per thread may seem too constraining for occupancy, it is strictly better than the current approach, where we may promote even more elements, which then get spilled into the slow local memory.
1 parent c200a4e commit 22b36e1

File tree

6 files changed

+26
-1
lines changed

6 files changed

+26
-1
lines changed

tc/core/cuda/cuda_mapping_options.cc

+5
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
299299
return *this;
300300
}
301301

302+
// Records nElements as the max_private_elements value in the owned protobuf,
// i.e. the per-thread cap on the number of elements promoted to private
// memory. Returns *this so that option setters can be chained.
CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
303+
ownedProto_.set_max_private_elements(nElements);
304+
return *this;
305+
}
306+
302307
CudaMappingOptions& CudaMappingOptions::mapToThreads(
303308
const std::string& commaSeparatedSizes) {
304309
auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);

tc/core/cuda/cuda_mapping_options.h

+1
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ class CudaMappingOptions {
197197
CudaMappingOptions& useReadOnlyCache(bool b);
198198
CudaMappingOptions& privateDepth(uint32_t depth);
199199
CudaMappingOptions& sharedDepth(uint32_t depth);
200+
CudaMappingOptions& maxPrivateElements(uint64_t nElements);
200201
///@}
201202

202203
/// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

+4
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
4040
}
4141
prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
4242
prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
43+
if (cudaOptions.proto().has_max_private_elements()) {
44+
prn.printValueOption(
45+
"maxPrivateElements", cudaOptions.proto().max_private_elements());
46+
}
4347
prn.endStmt();
4448
return prn;
4549
}

tc/core/polyhedral/cuda/mapped_scop.cc

+8-1
Original file line numberDiff line numberDiff line change
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
10861086

10871087
// 9. Promote to registers below the loops mapped to threads.
10881088
if (cudaOptions.proto().use_private_memory()) {
1089-
promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
1089+
auto blockSizes = cudaOptions.block.extractVector();
1090+
auto nThreadsPerBlock = std::accumulate(
1091+
blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
1092+
auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
1093+
? cudaOptions.proto().max_private_elements()
1094+
: queryRegistersPerBlock() / nThreadsPerBlock;
1095+
promoteToRegistersAtDepth(
1096+
*mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
10901097
}
10911098

10921099
LOG_IF(INFO, FLAGS_debug_tc_mapper)

tc/proto/mapping_options.proto

+3
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
7474
optional uint32 private_depth = 9;
7575
// Depth of promotion to shared memory, ignored if use_shared_memory is false.
7676
optional uint32 shared_depth = 10;
77+
// Maximum number of elements to promote to registers per thread. If not
78+
// provided, the number of 32-bit registers per thread will be used.
79+
optional uint64 max_private_elements = 11;
7780
}
7881

7982
message CpuMappingOptionsProto {

tensor_comprehensions/pybinds/tclib.cc

+5
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
672672
"usePrivateMemory",
673673
&tc::CudaMappingOptions::usePrivateMemory,
674674
"Create thread-local copies of data in private memory")
675+
.def(
676+
"maxPrivateElements",
677+
&tc::CudaMappingOptions::maxPrivateElements,
678+
"The maximum number of elements per thread for which thread-local "
679+
"copies are created")
675680
.def(
676681
"unrollCopyShared",
677682
&tc::CudaMappingOptions::unrollCopyShared,

0 commit comments

Comments
 (0)