diff --git a/tc/c2/tc_op.h b/tc/c2/tc_op.h
index c362237b3..051659dce 100644
--- a/tc/c2/tc_op.h
+++ b/tc/c2/tc_op.h
@@ -119,11 +119,14 @@ class TcOp : public Operator<Context> {
       compiled_ = true;
     }
 
+    // Get CUDA stream id from C2
+    tc::CudaBackend::RuntimeInformation info = {context_.cuda_stream()};
+
     // run
     if (!check_sizes_) {
-      executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_);
+      executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_, info);
     } else {
-      executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_);
+      executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_, info);
     }
     return true;
   }
diff --git a/tc/core/cpu/cpu_tc_executor.cc b/tc/core/cpu/cpu_tc_executor.cc
index 0f9fe4a1b..a76efc714 100644
--- a/tc/core/cpu/cpu_tc_executor.cc
+++ b/tc/core/cpu/cpu_tc_executor.cc
@@ -52,7 +52,8 @@ CpuCompilationResult CpuBackend::compileWithTcMapper(
 
 void CpuTcExecutor::uncheckedRun(
     const std::vector<const void*>& inputs,
-    const std::vector<void*>& outputs) const {
+    const std::vector<void*>& outputs,
+    typename CpuBackend::RuntimeInformation info) const {
   LOG(ERROR) << "NYI: CpuTcExecutor::uncheckedRun";
 }
 
diff --git a/tc/core/cpu/cpu_tc_executor.h b/tc/core/cpu/cpu_tc_executor.h
index 736fde83b..079ae1589 100644
--- a/tc/core/cpu/cpu_tc_executor.h
+++ b/tc/core/cpu/cpu_tc_executor.h
@@ -39,7 +39,8 @@ class CpuTcExecutor : public TcExecutor<CpuBackend> {
   /// doesn't then segfault will likely occur.
   void uncheckedRun(
       const std::vector<const void*>& inputs,
-      const std::vector<void*>& outputs) const;
+      const std::vector<void*>& outputs,
+      typename CpuBackend::RuntimeInformation info) const;
 
   /// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
   /// (microseconds).
diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
index f1abf5917..57781156d 100644
--- a/tc/core/cuda/cuda_tc_executor.cc
+++ b/tc/core/cuda/cuda_tc_executor.cc
@@ -111,16 +111,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
 
 void CudaTcExecutor::uncheckedRun(
     const std::vector<const void*>& inputs,
-    const std::vector<void*>& outputs) const {
+    const std::vector<void*>& outputs,
+    typename CudaBackend::RuntimeInformation info) const {
   CHECK(rtcFun_) << "No rtcFun_ attached, cannot launch";
-  cudaStream_t stream = 0;
   CHECK_NE(grid_.view[0], 0u) << "Grid dims are not set up";
   CHECK_NE(block_.view[0], 0u) << "Block dims are not set up";
   rtcFun_->Launch(
       grid_.view.extractDefaultedArray(),
       block_.view.extractDefaultedArray(),
       0,
-      stream,
+      info.stream,
       parameters_,
       outputs,
       inputs);
diff --git a/tc/core/cuda/cuda_tc_executor.h b/tc/core/cuda/cuda_tc_executor.h
index 7fc00797d..c604521d2 100644
--- a/tc/core/cuda/cuda_tc_executor.h
+++ b/tc/core/cuda/cuda_tc_executor.h
@@ -39,7 +39,8 @@ class CudaTcExecutor : public TcExecutor<CudaBackend> {
   /// doesn't then segfault will likely occur.
   void uncheckedRun(
       const std::vector<const void*>& inputs,
-      const std::vector<void*>& outputs) const;
+      const std::vector<void*>& outputs,
+      typename CudaBackend::RuntimeInformation info = {}) const;
 
   /// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
   /// (microseconds).
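Note: the diff uses but never shows the new per-backend `RuntimeInformation` type. A minimal sketch of what the CUDA variant presumably looks like, given that `info.stream` is forwarded to `rtcFun_->Launch` and that the defaulted `= {}` must reproduce the old null-stream behavior; the actual definition lives in the backend headers and is an assumption here, not part of this diff:

    // Hypothetical sketch, not part of this diff: the CUDA backend's
    // RuntimeInformation only needs to carry the stream to launch on.
    struct RuntimeInformation {
      cudaStream_t stream = 0;  // default-constructed {} == legacy null stream
    };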
diff --git a/tc/core/tc_executor-inl.h b/tc/core/tc_executor-inl.h
index 5eb83b1d5..3ccf646c4 100644
--- a/tc/core/tc_executor-inl.h
+++ b/tc/core/tc_executor-inl.h
@@ -109,7 +109,8 @@ inline std::pair<std::vector<const void*>, std::vector<void*>> prepareRun(
 template <typename Backend>
 void TcExecutor<Backend>::run(
     const std::vector<const DLConstTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs) const {
+    const std::vector<const DLTensor*>& outputs,
+    typename Backend::RuntimeInformation info) const {
   std::vector<const void*> rawInputs;
   std::vector<void*> rawOutputs;
   std::tie(rawInputs, rawOutputs) = detail::prepareRun(
@@ -117,7 +118,7 @@ void TcExecutor<Backend>::run(
 
   // Static dispatch instead of virtual functions requires this cast.
   static_cast<const typename Backend::ExecutorType&>(*this).uncheckedRun(
-      rawInputs, rawOutputs);
+      rawInputs, rawOutputs, info);
 }
 
 template <typename Backend>
diff --git a/tc/core/tc_executor.h b/tc/core/tc_executor.h
index 7ecd61cd7..66362a197 100644
--- a/tc/core/tc_executor.h
+++ b/tc/core/tc_executor.h
@@ -88,7 +88,8 @@ class TcExecutor {
   /// advanced aliasing) properties of the input and output tensors.
   void run(
       const std::vector<const DLConstTensor*>& inputs,
-      const std::vector<const DLTensor*>& outputs) const;
+      const std::vector<const DLTensor*>& outputs,
+      typename Backend::RuntimeInformation info = {}) const;
 
   /// Calls run and profiles the cpu overhead and kernel runtime (microseconds).
   /// \returns profiling information
diff --git a/tc/examples/CMakeLists.txt b/tc/examples/CMakeLists.txt
index 77430dfc2..c52354bf8 100644
--- a/tc/examples/CMakeLists.txt
+++ b/tc/examples/CMakeLists.txt
@@ -16,6 +16,7 @@ set(EXAMPLES_FILES
   blockdiagperm
   group_normalization
   tensordot
+  upsample
   wavenet
 )
 foreach(i ${EXAMPLES_FILES})
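Note: because `info` defaults to `{}`, every existing `run`/`uncheckedRun` call site keeps compiling and still launches on the default stream; only callers that care, such as the Caffe2 `TcOp` above, pass a stream explicitly. A minimal caller-side sketch (the `executor`, `rawInputs`, and `rawOutputs` setup is assumed, not shown in this diff):

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Enqueue the generated kernel on a caller-owned stream.
    executor.uncheckedRun(rawInputs, rawOutputs, {stream});

    // Omitting the argument is equivalent to the pre-diff behavior:
    // the kernel is launched on the null stream.
    executor.uncheckedRun(rawInputs, rawOutputs);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);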
diff --git a/tc/examples/upsample.cc b/tc/examples/upsample.cc
new file mode 100644
index 000000000..ab0e609c8
--- /dev/null
+++ b/tc/examples/upsample.cc
@@ -0,0 +1,165 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "common.h"
+
+#include "tc/aten/aten.h"
+#include "tc/aten/aten_autotuner.h"
+#include "tc/aten/aten_compiler.h"
+#include "tc/autotuner/genetic_search.h"
+#include "tc/core/cpu/cpu_mapping_options.h"
+#include "tc/core/cpu/cpu_tc_executor.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
+#include "tc/core/cuda/cuda_tc_executor.h"
+#include "tc/core/flags.h"
+
+DEFINE_string(proto_path, "", "Filename to load and store proto cache ");
+DEFINE_bool(
+    use_best_options,
+    false,
+    "Start from hardcoded best options; if false start from naive options ");
+
+// These options were copied from GroupNormalization
+auto previouslyTunedBestOptions =
+    tc::CudaMappingOptions::makeNaiveMappingOptions()
+        .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
+        .outerScheduleAllowSkewing(false)
+        .outerSchedulePositiveOrthant(true)
+        .intraTileScheduleFusionStrategy(
+            tc::FusionStrategy::Preserve3Coincident)
+        .intraTileScheduleAllowSkewing(false)
+        .intraTileSchedulePositiveOrthant(true)
+        .tile(6, 1, 24)
+        .unroll(16)
+        .tileImperfectlyNested(false)
+        .matchLibraryCalls(false)
+        .mapToThreads(48, 6)
+        .mapToBlocks(256, 32)
+        .useSharedMemory(true)
+        .usePrivateMemory(true)
+        .unrollCopyShared(false);
+
+template <typename Backend>
+void testOnBackend() {
+  // 1. Define and setup the TC compilation unit with CUDA memory
+  // management backed by ATen tensors.
+  std::string tc = R"TC(
+def upsample(
+    float(N, C, H, W) X, float(1) rheight, float(1) rwidth, float(1) height, float(1) width)
+    -> (output, h1, w1, h1r, w1r, h1p, w1p, h1lambda, h0lambda, w1lambda, w0lambda)
+{
+    h1r(i) = rheight(0) * i where i in 0:H
+    h1(i) = int32(h1r(i)) where i in 0:H
+    h1p(i) = (h1(i) < (height(0) - 1)) ? 1 : 0 where i in 0:H
+    h1lambda(i) = h1r(i) - h1(i) where i in 0:H
+    h0lambda(i) = 1.0 - h1lambda(i) where i in 0:H
+
+    w1r(j) = rwidth(0) * j where j in 0:W
+    w1(j) = int32(w1r(j)) where j in 0:W
+    w1p(j) = (w1(j) < (width(0) - 1)) ? 1 : 0 where j in 0:W
+    w1lambda(j) = w1r(j) - w1(j) where j in 0:W
+    w0lambda(j) = 1.0 - w1lambda(j) where j in 0:W
+
+    # Maybe: split kernels here if fusion does not occur
+
+    output(n, c, i, j) +=! h0lambda(i) * (w0lambda(j) * X(n, c, h1(i), w1(j))
+        + w1lambda(j) * X(n, c, h1(i), w1(j) + w1p(j)))
+        + h1lambda(i) * (w0lambda(j) * X(n, c, h1(i) + h1p(i), w1(j))
+        + w1lambda(j) * X(n, c, h1(i) + h1p(i), w1(j) + w1p(j)))
+        where i in 0:H, j in 0:W
+}
+  )TC";
+
+  // 2. Allocate tensors with random data.
+  auto N = 8, C = 4, H = 4, W = 8;
+  auto widthScale = 2.0, heightScale = 2.0;
+
+  auto outH = H * heightScale;
+  auto outW = W * widthScale;
+  auto rh = (outH > 1) ? (float)(H - 1) / (outH - 1) : 0.f;
+  auto rw = (outW > 1) ? (float)(W - 1) / (outW - 1) : 0.f;
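+  // With the sizes above (H = 4, W = 8, scale 2.0): outH = 8 and outW = 16,
+  // so rh = (4 - 1) / (8 - 1) = 3/7 ~= 0.43 and rw = (8 - 1) / (16 - 1)
+  // = 7/15 ~= 0.47. This is the align_corners-style mapping: the source
+  // coordinate is ratio * destination coordinate; in the TC, h1 is its
+  // integer part, h1lambda its fractional part, and h1p guards the +1
+  // neighbor at the bottom/right border (likewise w1/w1lambda/w1p).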
+
+  at::Tensor X = makeATenTensor<Backend>({N, C, H, W});
+  at::Tensor inputHeight = makeATenTensor<Backend>({1});
+  at::Tensor inputWidth = makeATenTensor<Backend>({1});
+  at::Tensor rheight = makeATenTensor<Backend>({1});
+  at::Tensor rwidth = makeATenTensor<Backend>({1});
+  at::Tensor h1 = makeATenTensor<Backend>({1});
+  at::Tensor w1 = makeATenTensor<Backend>({1});
+  at::Tensor h1r = makeATenTensor<Backend>({1});
+  at::Tensor w1r = makeATenTensor<Backend>({1});
+  at::Tensor h1p = makeATenTensor<Backend>({1});
+  at::Tensor w1p = makeATenTensor<Backend>({1});
+  at::Tensor h1lambda = makeATenTensor<Backend>({1});
+  at::Tensor h0lambda = makeATenTensor<Backend>({1});
+  at::Tensor w1lambda = makeATenTensor<Backend>({1});
+  at::Tensor w0lambda = makeATenTensor<Backend>({1});
+
+  inputHeight.fill_(H);
+  inputWidth.fill_(W);
+  rheight.fill_(rh);
+  rwidth.fill_(rw);
+
+  // 3. Run autotuning with evolutionary search starting from a naive option.
+  auto baseOptions = FLAGS_use_best_options
+      ? previouslyTunedBestOptions
+      : Backend::MappingOptionsType::makeNaiveMappingOptions();
+  tc::aten::ATenAutotuner<Backend, tc::autotune::GeneticSearch>
+      geneticAutotuneATen(tc);
+  auto bestOption = geneticAutotuneATen.tune(
+      "upsample",
+      {X, rheight, rwidth, inputHeight, inputWidth},
+      baseOptions,
+      FLAGS_proto_path);
+  CHECK_GT(bestOption.size(), 0u);
+
+  // 4. Compile and run the TC with the best option.
+  auto pExecutor = tc::aten::compile<Backend>(
+      tc,
+      "upsample",
+      {X, rheight, rwidth, inputHeight, inputWidth},
+      bestOption[0]);
+  auto outputs = tc::aten::prepareOutputs(
+      tc, "upsample", {X, rheight, rwidth, inputHeight, inputWidth});
+  auto timings = tc::aten::profile(
+      *pExecutor, {X, rheight, rwidth, inputHeight, inputWidth}, outputs);
+  std::cout << "upsample size X: " << X.sizes() << ", "
+            << " ran in: " << timings.kernelRuntime.toMicroSeconds()
+            << "us\n";
+  LOG(INFO) << "best option: " << bestOption << "\n";
+}
+
+TEST(UpSampleGPU, SimpleAutotune) {
+  testOnBackend<tc::CudaBackend>();
+}
+
+/*
+  Short run: from build dir, run with:
+    ./tc/examples/upsample --tuner_threads=10 \
+      --tuner_gen_pop_size=10 --tuner_gen_generations=3 \
+      --tuner_gen_number_elites=4 \
+      --proto_path="/tmp/upsample"
+
+  Long run: from build dir, run with:
+    ./tc/examples/upsample --tuner_threads=10 \
+      --proto_path="/tmp/upsample"
+*/
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
+  ::google::InitGoogleLogging(argv[0]);
+  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
+  return RUN_ALL_TESTS();
+}
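Note on verification: the example only times the kernel; it never checks the interpolated values. A hypothetical host-side reference for one output element, mirroring the TC above with the same `rh`/`rw` ratios (the function name and raw-pointer layout are assumptions, not part of the PR):

    // Reference bilinear sample at output (i, j) of one H x W input plane:
    // integer parts -> h1/w1, fractional parts -> lambdas, and h1p/w1p
    // guard the +1 neighbor at the bottom/right border, as in the TC.
    float referenceAt(
        const float* plane, int H, int W, float rh, float rw, int i, int j) {
      int h1 = static_cast<int>(rh * i);
      int w1 = static_cast<int>(rw * j);
      int h1p = (h1 < H - 1) ? 1 : 0;
      int w1p = (w1 < W - 1) ? 1 : 0;
      float h1l = rh * i - h1, h0l = 1.0f - h1l;
      float w1l = rw * j - w1, w0l = 1.0f - w1l;
      return h0l * (w0l * plane[h1 * W + w1] +
                    w1l * plane[h1 * W + w1 + w1p]) +
             h1l * (w0l * plane[(h1 + h1p) * W + w1] +
                    w1l * plane[(h1 + h1p) * W + w1 + w1p]);
    }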