Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Add UpSample example #413

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions tc/c2/tc_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,14 @@ class TcOp : public Operator<Context> {
compiled_ = true;
}

// Get CUDA stream id from C2
tc::CudaBackend::RuntimeInformation info = {context_.cuda_stream()};

// run
if (!check_sizes_) {
executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_);
executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_, info);
} else {
executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_);
executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_, info);
}
return true;
}
Expand Down
3 changes: 2 additions & 1 deletion tc/core/cpu/cpu_tc_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ CpuCompilationResult CpuBackend::compileWithTcMapper(

void CpuTcExecutor::uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const {
const std::vector<void*>& outputs,
typename CpuBackend::RuntimeInformation info) const {
LOG(ERROR) << "NYI: CpuTcExecutor::uncheckedRun";
}

Expand Down
3 changes: 2 additions & 1 deletion tc/core/cpu/cpu_tc_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class CpuTcExecutor : public TcExecutor<CpuBackend> {
/// doesn't then segfault will likely occur.
void uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const;
const std::vector<void*>& outputs,
typename CpuBackend::RuntimeInformation info) const;

/// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
/// (microseconds).
Expand Down
6 changes: 3 additions & 3 deletions tc/core/cuda/cuda_tc_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,16 +111,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(

void CudaTcExecutor::uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const {
const std::vector<void*>& outputs,
typename CudaBackend::RuntimeInformation info) const {
CHECK(rtcFun_) << "No rtcFun_ attached, cannot launch";
cudaStream_t stream = 0;
CHECK_NE(grid_.view[0], 0u) << "Grid dims are not set up";
CHECK_NE(block_.view[0], 0u) << "Block dims are not set up";
rtcFun_->Launch(
grid_.view.extractDefaultedArray(),
block_.view.extractDefaultedArray(),
0,
stream,
info.stream,
parameters_,
outputs,
inputs);
Expand Down
3 changes: 2 additions & 1 deletion tc/core/cuda/cuda_tc_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class CudaTcExecutor : public TcExecutor<CudaBackend> {
/// doesn't then segfault will likely occur.
void uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const;
const std::vector<void*>& outputs,
typename CudaBackend::RuntimeInformation info = {}) const;

/// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
/// (microseconds).
Expand Down
5 changes: 3 additions & 2 deletions tc/core/tc_executor-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,16 @@ inline std::pair<std::vector<const void*>, std::vector<void*>> prepareRun(
template <typename Backend>
void TcExecutor<Backend>::run(
const std::vector<const DLConstTensor*>& inputs,
const std::vector<const DLTensor*>& outputs) const {
const std::vector<const DLTensor*>& outputs,
typename Backend::RuntimeInformation info) const {
std::vector<const void*> rawInputs;
std::vector<void*> rawOutputs;
std::tie(rawInputs, rawOutputs) = detail::prepareRun(
inputs, outputs, inputsInfo_, outputsInfo_, halideComponents_);

// Static dispatch instead of virtual functions requires this cast.
static_cast<const typename Backend::ExecutorType&>(*this).uncheckedRun(
rawInputs, rawOutputs);
rawInputs, rawOutputs, info);
}

template <typename Backend>
Expand Down
3 changes: 2 additions & 1 deletion tc/core/tc_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class TcExecutor {
/// advanced aliasing) properties of the input and output tensors.
void run(
const std::vector<const DLConstTensor*>& inputs,
const std::vector<const DLTensor*>& outputs) const;
const std::vector<const DLTensor*>& outputs,
typename Backend::RuntimeInformation info = {}) const;

/// Calls run and profiles the cpu overhead and kernel runtime (microseconds).
/// \returns profiling information
Expand Down
1 change: 1 addition & 0 deletions tc/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(EXAMPLES_FILES
blockdiagperm
group_normalization
tensordot
upsample
wavenet
)
foreach(i ${EXAMPLES_FILES})
Expand Down
165 changes: 165 additions & 0 deletions tc/examples/upsample.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include "common.h"

#include "tc/aten/aten.h"
#include "tc/aten/aten_autotuner.h"
#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_search.h"
#include "tc/core/cpu/cpu_mapping_options.h"
#include "tc/core/cpu/cpu_tc_executor.h"
#include "tc/core/cuda/cuda_mapping_options.h"
#include "tc/core/cuda/cuda_tc_executor.h"
#include "tc/core/flags.h"

// Path of the protobuf cache file the autotuner loads from / stores to.
DEFINE_string(proto_path, "", "Filename to load and store proto cache ");
// When true, seed the tuner with previouslyTunedBestOptions below instead
// of naive mapping options.
DEFINE_bool(
    use_best_options,
    false,
    "Start from hardcoded best options; if false start from naive options ");

// These options were copied from GroupNormalization.
// NOTE(review): they were tuned for a different kernel and different tensor
// shapes, so treat them only as a warm start for the autotuner, not as a
// known-good configuration for upsample — confirm on the target GPU.
auto previouslyTunedBestOptions =
    tc::CudaMappingOptions::makeNaiveMappingOptions()
        // Outer (inter-tile) schedule: fuse as much as possible.
        .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
        .outerScheduleAllowSkewing(false)
        .outerSchedulePositiveOrthant(true)
        // Intra-tile schedule: keep 3 coincident dimensions for mapping.
        .intraTileScheduleFusionStrategy(
            tc::FusionStrategy::Preserve3Coincident)
        .intraTileScheduleAllowSkewing(false)
        .intraTileSchedulePositiveOrthant(true)
        .tile(6, 1, 24)
        .unroll(16)
        .tileImperfectlyNested(false)
        .matchLibraryCalls(false)
        // CUDA mapping: thread block of 48x6, grid of 256x32.
        .mapToThreads(48, 6)
        .mapToBlocks(256, 32)
        .useSharedMemory(true)
        .usePrivateMemory(true)
        .unrollCopyShared(false);

template <typename Backend>
void testOnBackend() {
// 1. Define and setup the TC compilation unit with CUDA memory
// management backed by ATen tensors.
std::string tc = R"TC(
def upsample(
float(N, C, H, W) X, float(1) rheight, float(1) rwidth, float(1) height, float(1) width)
-> (output, h1, w1, h1r, w1r, h1p, w1p, h1lambda, h0lambda, w1lambda, w0lambda)
{
h1r(i) = rheight(0) * i where i in 0:H
h1(i) = int32(h1r(i)) where i in 0:H
h1p(i) = (h1(i) < (height(0) - 1)) ? 1 : 0 where i in 0:H
h1lambda(i) = h1r(i) - h1(i) where i in 0:H
h0lambda(i) = 1.0 - h1lambda(i) where i in 0:H

w1r(j) = rwidth(0) * j where j in 0:W
w1(j) = int32(w1r(j)) where j in 0:W
w1p(j) = (w1(j) < (width(0) - 1)) ? 1 : 0 where j in 0:W
w1lambda(j) = w1r(j) - w1(j) where j in 0:W
w0lambda(j) = 1.0 - w1lambda(j) where j in 0:W

# Maybe: split kernels here if fusion does not occur

output(n, c, i, j) +=! h0lambda(i) * (w0lambda(i) * X(n, c, h1(i), w1(j)) +
w1lambda(j) * X(n, c, h1(i), w1(j) + w1p(j))) +
h1lambda(i) * (w0lambda(j) * X(n, c, h1(i) + h1p(i), w1(j)) +
w1lambda(j) * X(n, c, h1(i) + h1p(i), w1(j) + w1p(j)))
where i in 0:H, j in 0:W
}
)TC";

// 2. Allocate tensors with random data.
auto N = 8, C = 4, H = 4, W = 8;
auto widthScale = 2.0, heightScale = 2.0;

auto outH = H * heightScale;
auto outW = W * widthScale;
auto rh = (outH > 1) ? (float)(H - 1) / (outH - 1) : 0.f;
auto rw = (outW > 1) ? (float)(W - 1) / (outW - 1) : 0.f;

at::Tensor X = makeATenTensor<Backend>({N, C, H, W});
at::Tensor inputHeight = makeATenTensor<Backend>({1});
at::Tensor inputWidth = makeATenTensor<Backend>({1});
at::Tensor rheight = makeATenTensor<Backend>({1});
at::Tensor rwidth = makeATenTensor<Backend>({1});
at::Tensor h1 = makeATenTensor<Backend>({1});
at::Tensor w1 = makeATenTensor<Backend>({1});
at::Tensor h1r = makeATenTensor<Backend>({1});
at::Tensor w1r = makeATenTensor<Backend>({1});
at::Tensor h1p = makeATenTensor<Backend>({1});
at::Tensor w1p = makeATenTensor<Backend>({1});
at::Tensor h1lamada = makeATenTensor<Backend>({1});
at::Tensor h0lamada = makeATenTensor<Backend>({1});
at::Tensor w1lamada = makeATenTensor<Backend>({1});
at::Tensor w0lamada = makeATenTensor<Backend>({1});

inputHeight.fill_(H);
inputWidth.fill_(W);
rheight.fill_(rh);
rwidth.fill_(rw);

// 3. Run autotuning with evolutionary search starting from a naive option.
auto baseOptions = FLAGS_use_best_options
? previouslyTunedBestOptions
: Backend::MappingOptionsType::makeNaiveMappingOptions();
tc::aten::ATenAutotuner<Backend, tc::autotune::GeneticSearch>
geneticAutotuneATen(tc);
auto bestOption = geneticAutotuneATen.tune(
"upsample", {X, rheight, rwidth, inputHeight, inputWidth}, baseOptions, FLAGS_proto_path);
CHECK_GT(bestOption.size(), 0u);

// 4. Compile and run the TC with the best option.
auto pExecutor = tc::aten::compile<Backend>(
tc, "upsample", {X, rheight, rwidth, inputHeight, inputWidth}, bestOption[0]);
auto outputs =
tc::aten::prepareOutputs(tc, "upsample", {X, rheight, rwidth, inputHeight, inputWidth});
auto timings = tc::aten::profile(*pExecutor, {X, rheight, rwidth, inputHeight, inputWidth}, outputs);
std::cout << "upsample size X: " << X.sizes() << ", "
<< " ran in: " << timings.kernelRuntime.toMicroSeconds() << "us\n";
LOG(INFO) << "best option: " << bestOption << "\n";
}

// Autotunes and runs the upsample TC on the CUDA backend (requires a GPU).
TEST(UpSampleGPU, SimpleAutotune) {
  testOnBackend<tc::CudaBackend>();
}

/*
Short run: from build dir, run with:
./tc/examples/upsample --tuner_threads=10 \
--tuner_gen_pop_size=10 --tuner_gen_generations=3 \
--tuner_gen_number_elites=4 \
--proto_path="/tmp/upsample"

Long run: from build dir, run with:
./tc/examples/upsample --tuner_threads=10 \
--proto_path="/tmp/upsample"
*/
int main(int argc, char** argv) {
  // Initialization order matters: gtest consumes its own flags first,
  // gflags then parses the remaining tuner flags, and logging is
  // initialized last (after argv[0] is final).
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  // Seed ATen's CUDA RNG so the randomly initialized inputs are reproducible.
  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}