Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Add UpSample example #413

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions tc/c2/tc_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,14 @@ class TcOp : public Operator<Context> {
compiled_ = true;
}

// Get CUDA stream id from C2
tc::CudaBackend::RuntimeInformation info = {context_.cuda_stream()};

// run
if (!check_sizes_) {
executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_);
executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_, info);
} else {
executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_);
executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_, info);
}
return true;
}
Expand Down
3 changes: 2 additions & 1 deletion tc/core/cpu/cpu_tc_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ CpuCompilationResult CpuBackend::compileWithTcMapper(

void CpuTcExecutor::uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const {
const std::vector<void*>& outputs,
typename CpuBackend::RuntimeInformation info) const {
LOG(ERROR) << "NYI: CpuTcExecutor::uncheckedRun";
}

Expand Down
3 changes: 2 additions & 1 deletion tc/core/cpu/cpu_tc_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class CpuTcExecutor : public TcExecutor<CpuBackend> {
/// doesn't then segfault will likely occur.
void uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const;
const std::vector<void*>& outputs,
typename CpuBackend::RuntimeInformation info) const;

/// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
/// (microseconds).
Expand Down
6 changes: 3 additions & 3 deletions tc/core/cuda/cuda_tc_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,16 +111,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(

void CudaTcExecutor::uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const {
const std::vector<void*>& outputs,
typename CudaBackend::RuntimeInformation info) const {
CHECK(rtcFun_) << "No rtcFun_ attached, cannot launch";
cudaStream_t stream = 0;
CHECK_NE(grid_.view[0], 0u) << "Grid dims are not set up";
CHECK_NE(block_.view[0], 0u) << "Block dims are not set up";
rtcFun_->Launch(
grid_.view.extractDefaultedArray(),
block_.view.extractDefaultedArray(),
0,
stream,
info.stream,
parameters_,
outputs,
inputs);
Expand Down
3 changes: 2 additions & 1 deletion tc/core/cuda/cuda_tc_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class CudaTcExecutor : public TcExecutor<CudaBackend> {
/// doesn't then segfault will likely occur.
void uncheckedRun(
const std::vector<const void*>& inputs,
const std::vector<void*>& outputs) const;
const std::vector<void*>& outputs,
typename CudaBackend::RuntimeInformation info = {}) const;

/// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
/// (microseconds).
Expand Down
5 changes: 3 additions & 2 deletions tc/core/tc_executor-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,16 @@ inline std::pair<std::vector<const void*>, std::vector<void*>> prepareRun(
template <typename Backend>
void TcExecutor<Backend>::run(
const std::vector<const DLConstTensor*>& inputs,
const std::vector<const DLTensor*>& outputs) const {
const std::vector<const DLTensor*>& outputs,
typename Backend::RuntimeInformation info) const {
std::vector<const void*> rawInputs;
std::vector<void*> rawOutputs;
std::tie(rawInputs, rawOutputs) = detail::prepareRun(
inputs, outputs, inputsInfo_, outputsInfo_, halideComponents_);

// Static dispatch instead of virtual functions requires this cast.
static_cast<const typename Backend::ExecutorType&>(*this).uncheckedRun(
rawInputs, rawOutputs);
rawInputs, rawOutputs, info);
}

template <typename Backend>
Expand Down
3 changes: 2 additions & 1 deletion tc/core/tc_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class TcExecutor {
/// advanced aliasing) properties of the input and output tensors.
void run(
const std::vector<const DLConstTensor*>& inputs,
const std::vector<const DLTensor*>& outputs) const;
const std::vector<const DLTensor*>& outputs,
typename Backend::RuntimeInformation info = {}) const;

/// Calls run and profiles the cpu overhead and kernel runtime (microseconds).
/// \returns profiling information
Expand Down
1 change: 1 addition & 0 deletions tc/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(EXAMPLES_FILES
blockdiagperm
group_normalization
tensordot
upsample
wavenet
)
foreach(i ${EXAMPLES_FILES})
Expand Down
165 changes: 165 additions & 0 deletions tc/examples/upsample.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <string>
#include <vector>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include "common.h"

#include "tc/aten/aten.h"
#include "tc/aten/aten_autotuner.h"
#include "tc/aten/aten_compiler.h"
#include "tc/autotuner/genetic_search.h"
#include "tc/core/cpu/cpu_mapping_options.h"
#include "tc/core/cpu/cpu_tc_executor.h"
#include "tc/core/cuda/cuda_mapping_options.h"
#include "tc/core/cuda/cuda_tc_executor.h"
#include "tc/core/flags.h"

// Path of the protobuf cache file the autotuner loads from / stores to.
DEFINE_string(proto_path, "", "Filename to load and store proto cache ");
// When true, seed the tuner with previouslyTunedBestOptions below instead
// of naive mapping options.
DEFINE_bool(
    use_best_options,
    false,
    "Start from hardcoded best options; if false start from naive options ");

// These options were copied from GroupNormalization.
// NOTE(review): they were tuned for a different kernel and different tensor
// shapes, so treat them only as a warm start for the autotuner, not as a
// known-good configuration for upsample — confirm on the target GPU.
auto previouslyTunedBestOptions =
    tc::CudaMappingOptions::makeNaiveMappingOptions()
        // Outer (inter-tile) schedule: fuse as much as possible.
        .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
        .outerScheduleAllowSkewing(false)
        .outerSchedulePositiveOrthant(true)
        // Intra-tile schedule: keep 3 coincident dimensions for mapping.
        .intraTileScheduleFusionStrategy(
            tc::FusionStrategy::Preserve3Coincident)
        .intraTileScheduleAllowSkewing(false)
        .intraTileSchedulePositiveOrthant(true)
        .tile(6, 1, 24)
        .unroll(16)
        .tileImperfectlyNested(false)
        .matchLibraryCalls(false)
        // CUDA mapping: thread block of 48x6, grid of 256x32.
        .mapToThreads(48, 6)
        .mapToBlocks(256, 32)
        .useSharedMemory(true)
        .usePrivateMemory(true)
        .unrollCopyShared(false);

template <typename Backend>
void testOnBackend() {
// 1. Define and setup the TC compilation unit with CUDA memory
// management backed by ATen tensors.
std::string tc = R"TC(
def upsample(
float(N, C, H, W) X, float(1) rheight, float(1) rwidth, float(1) height, float(1) width)
-> (output, h1, w1, h1r, w1r, h1p, w1p, h1lambda, h0lambda, w1lambda, w0lambda)
{
h1r(i) = rheight(0) * i where i in 0:H
h1(i) = int32(h1r(i)) where i in 0:H
h1p(i) = (h1(i) < (height(0) - 1)) ? 1 : 0 where i in 0:H
h1lambda(i) = h1r(i) - h1(i) where i in 0:H
h0lambda(i) = 1.0 - h1lambda(i) where i in 0:H

w1r(j) = rwidth(0) * j where j in 0:W
w1(j) = int32(w1r(j)) where j in 0:W
w1p(j) = (w1(j) < (width(0) - 1)) ? 1 : 0 where j in 0:W
w1lambda(j) = w1r(j) - w1(j) where j in 0:W
w0lambda(j) = 1.0 - w1lambda(j) where j in 0:W

# Maybe: split kernels here if fusion does not occur

output(n, c, i, j) +=! h0lambda(i) * (w0lambda(i) * X(n, c, h1(i), w1(j)) +
w1lambda(j) * X(n, c, h1(i), w1(j) + w1p(j))) +
h1lambda(i) * (w0lambda(j) * X(n, c, h1(i) + h1p(i), w1(j)) +
w1lambda(j) * X(n, c, h1(i) + h1p(i), w1(j) + w1p(j)))
where i in 0:H, j in 0:W
}
)TC";

// 2. Allocate tensors with random data.
auto N = 8, C = 4, H = 4, W = 8;
auto widthScale = 2.0, heightScale = 2.0;

auto outH = H * heightScale;
auto outW = W * widthScale;
auto rh = (outH > 1) ? (float)(H - 1) / (outH - 1) : 0.f;
auto rw = (outW > 1) ? (float)(W - 1) / (outW - 1) : 0.f;

at::Tensor X = makeATenTensor<Backend>({N, C, H, W});
at::Tensor inputHeight = makeATenTensor<Backend>({1});
at::Tensor inputWidth = makeATenTensor<Backend>({1});
at::Tensor rheight = makeATenTensor<Backend>({1});
at::Tensor rwidth = makeATenTensor<Backend>({1});
at::Tensor h1 = makeATenTensor<Backend>({1});
at::Tensor w1 = makeATenTensor<Backend>({1});
at::Tensor h1r = makeATenTensor<Backend>({1});
at::Tensor w1r = makeATenTensor<Backend>({1});
at::Tensor h1p = makeATenTensor<Backend>({1});
at::Tensor w1p = makeATenTensor<Backend>({1});
at::Tensor h1lamada = makeATenTensor<Backend>({1});
at::Tensor h0lamada = makeATenTensor<Backend>({1});
at::Tensor w1lamada = makeATenTensor<Backend>({1});
at::Tensor w0lamada = makeATenTensor<Backend>({1});

inputHeight.fill_(H);
inputWidth.fill_(W);
rheight.fill_(rh);
rwidth.fill_(rw);

// 3. Run autotuning with evolutionary search starting from a naive option.
auto baseOptions = FLAGS_use_best_options
? previouslyTunedBestOptions
: Backend::MappingOptionsType::makeNaiveMappingOptions();
tc::aten::ATenAutotuner<Backend, tc::autotune::GeneticSearch>
geneticAutotuneATen(tc);
auto bestOption = geneticAutotuneATen.tune(
"upsample", {X, rheight, rwidth, inputHeight, inputWidth}, baseOptions, FLAGS_proto_path);
CHECK_GT(bestOption.size(), 0u);

// 4. Compile and run the TC with the best option.
auto pExecutor = tc::aten::compile<Backend>(
tc, "upsample", {X, rheight, rwidth, inputHeight, inputWidth}, bestOption[0]);
auto outputs =
tc::aten::prepareOutputs(tc, "upsample", {X, rheight, rwidth, inputHeight, inputWidth});
auto timings = tc::aten::profile(*pExecutor, {X, rheight, rwidth, inputHeight, inputWidth}, outputs);
std::cout << "upsample size X: " << X.sizes() << ", "
<< " ran in: " << timings.kernelRuntime.toMicroSeconds() << "us\n";
LOG(INFO) << "best option: " << bestOption << "\n";
}

// Autotunes and runs the upsample TC on the CUDA backend (requires a GPU).
TEST(UpSampleGPU, SimpleAutotune) {
  testOnBackend<tc::CudaBackend>();
}

/*
Short run: from build dir, run with:
./tc/examples/upsample --tuner_threads=10 \
--tuner_gen_pop_size=10 --tuner_gen_generations=3 \
--tuner_gen_number_elites=4 \
--proto_path="/tmp/upsample"

Long run: from build dir, run with:
./tc/examples/upsample --tuner_threads=10 \
--proto_path="/tmp/upsample"
*/
int main(int argc, char** argv) {
  // Initialization order matters: gtest consumes its own flags first,
  // gflags then parses the remaining tuner flags, and logging is
  // initialized last (after argv[0] is final).
  ::testing::InitGoogleTest(&argc, argv);
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  ::google::InitGoogleLogging(argv[0]);
  // Seed ATen's CUDA RNG so the randomly initialized inputs are reproducible.
  tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
  return RUN_ALL_TESTS();
}