From 3e5e3a267f90833ddf05f52a3f33a2ab5d34fa12 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Wed, 25 May 2022 11:03:11 +0200 Subject: [PATCH 1/4] atting atomic attempts with atomix --- Project.toml | 3 + examples/histogram.jl | 119 +++++++++++++++++++++++++++++ lib/CUDAKernels/Project.toml | 2 + lib/CUDAKernels/src/CUDAKernels.jl | 1 + src/KernelAbstractions.jl | 1 + 5 files changed, 126 insertions(+) create mode 100644 examples/histogram.jl diff --git a/Project.toml b/Project.toml index 81f5514a6..4bb18942a 100644 --- a/Project.toml +++ b/Project.toml @@ -5,12 +5,15 @@ version = "0.8.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" +UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] Adapt = "0.4, 1.0, 2.0, 3.0" diff --git a/examples/histogram.jl b/examples/histogram.jl new file mode 100644 index 000000000..65cc430e1 --- /dev/null +++ b/examples/histogram.jl @@ -0,0 +1,119 @@ +using KernelAbstractions, Test +using Atomix: @atomic, @atomicswap, @atomicreplace +include(joinpath(@__DIR__, "utils.jl")) # Load backend + + +# Function to use as a baseline for CPU metrics +function create_histogram(input) + histogram_output = zeros(Int, maximum(input)) + for i = 1:length(input) + histogram_output[input[i]] += 1 + end + return histogram_output +end + +# This is a 1D histogram kernel where the histogramming happens on shmem +@kernel function histogram_kernel!(histogram_output, input) + tid = @index(Global, Linear) + lid = @index(Local, Linear) + + @uniform warpsize = Int(32) + + @uniform gs = @groupsize()[1] + @uniform N = 
length(histogram_output) + + shared_histogram = @localmem Int (gs) + + # This will go through all input elements and assign them to a location in + # shmem. Note that if there is not enough shmem, we create different shmem + # blocks to write to. For example, if shmem is of size 256, but it's + # possible to get a value of 312, then we will have 2 separate shmem blocks, + # one from 1->256, and another from 257->512 + @uniform max_element = 1 + for min_element = 1:gs:N + + # Setting shared_histogram to 0 + @inbounds shared_histogram[lid] = 0 + @synchronize() + + max_element = min_element + gs + if max_element > N + max_element = N+1 + end + + # Defining bin on shared memory and writing to it if possible + bin = input[tid] + if bin >= min_element && bin < max_element + bin -= min_element-1 + GC.@preserve shared_histogram begin + @atomic shared_histogram[bin] += 1 + end + end + + @synchronize() + + if ((lid+min_element-1) <= N) + @atomic histogram_output[lid+min_element-1] += shared_histogram[lid] + end + + end + +end + +function histogram!(histogram_output, input; + numcores = 4, numthreads = 256) + + if isa(input, Array) + kernel! = histogram_kernel!(CPU(), numcores) + else + kernel! 
= histogram_kernel!(CUDADevice(), numthreads) + end + + kernel!(histogram_output, input, ndrange=size(input)) +end + +@testset "histogram tests" begin + + rand_input = [rand(1:128) for i = 1:1000] + linear_input = [i for i = 1:1024] + all_2 = [2 for i = 1:512] + + histogram_rand_baseline = create_histogram(rand_input) + histogram_linear_baseline = create_histogram(linear_input) + histogram_2_baseline = create_histogram(all_2) + + if Base.VERSION >= v"1.7.0" + CPU_rand_histogram = zeros(Int, 128) + CPU_linear_histogram = zeros(Int, 1024) + CPU_2_histogram = zeros(Int, 2) + + wait(histogram!(CPU_rand_histogram, rand_input)) + wait(histogram!(CPU_linear_histogram, linear_input)) + wait(histogram!(CPU_2_histogram, all_2)) + + @test isapprox(CPU_rand_histogram, histogram_rand_baseline) + @test isapprox(CPU_linear_histogram, histogram_linear_baseline) + @test isapprox(CPU_2_histogram, histogram_2_baseline) + end + + if has_cuda_gpu() + CUDA.allowscalar(false) + + GPU_rand_input = CuArray(rand_input) + GPU_linear_input = CuArray(linear_input) + GPU_2_input = CuArray(all_2) + + GPU_rand_histogram = CuArray(zeros(Int, 128)) + GPU_linear_histogram = CuArray(zeros(Int, 1024)) + GPU_2_histogram = CuArray(zeros(Int, 2)) + + wait(histogram!(GPU_rand_histogram, GPU_rand_input)) + wait(histogram!(GPU_linear_histogram, GPU_linear_input)) + wait(histogram!(GPU_2_histogram, GPU_2_input)) + + @test isapprox(Array(GPU_rand_histogram), histogram_rand_baseline) + @test isapprox(Array(GPU_linear_histogram), histogram_linear_baseline) + @test isapprox(Array(GPU_2_histogram), histogram_2_baseline) + end + +end diff --git a/lib/CUDAKernels/Project.toml b/lib/CUDAKernels/Project.toml index 095f37553..3b26a8315 100644 --- a/lib/CUDAKernels/Project.toml +++ b/lib/CUDAKernels/Project.toml @@ -7,7 +7,9 @@ version = "0.4.1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LLVM = 
"929cbde3-209d-540e-8aea-75f648917ca0" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] Adapt = "3.0" diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl index 43337d730..a9dae97ac 100644 --- a/lib/CUDAKernels/src/CUDAKernels.jl +++ b/lib/CUDAKernels/src/CUDAKernels.jl @@ -5,6 +5,7 @@ import StaticArrays import StaticArrays: MArray import Adapt import KernelAbstractions +import UnsafeAtomicsLLVM export CUDADevice diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 5d98efe88..b698be197 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -5,6 +5,7 @@ export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize, export Device, GPU, CPU, Event, MultiEvent, NoneEvent export async_copy! +using Atomix: @atomic, @atomicswap, @atomicreplace using LinearAlgebra using MacroTools From 7b9a7aabc47a2ae03da8683bdf745db938fb4597 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Wed, 25 May 2022 12:29:00 +0200 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Takafumi Arakaki --- Project.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Project.toml b/Project.toml index 4bb18942a..6613a8681 100644 --- a/Project.toml +++ b/Project.toml @@ -7,13 +7,11 @@ version = "0.8.0" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] Adapt = "0.4, 1.0, 2.0, 3.0" From b0f7374e5feda67cda796c4dc70d0d6e37db4f59 Mon Sep 17 00:00:00 2001 
From: James Schloss Date: Wed, 25 May 2022 14:31:03 +0200 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Valentin Churavy --- lib/CUDAKernels/Project.toml | 1 - src/KernelAbstractions.jl | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/CUDAKernels/Project.toml b/lib/CUDAKernels/Project.toml index 3b26a8315..c85ec27b3 100644 --- a/lib/CUDAKernels/Project.toml +++ b/lib/CUDAKernels/Project.toml @@ -7,7 +7,6 @@ version = "0.4.1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index b698be197..0c0084734 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -5,7 +5,7 @@ export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize, export Device, GPU, CPU, Event, MultiEvent, NoneEvent export async_copy! -using Atomix: @atomic, @atomicswap, @atomicreplace +import Atomix: @atomic, @atomicswap, @atomicreplace using LinearAlgebra using MacroTools From 93842383553c31faa66db178bfe9db4c9c82b16c Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 31 May 2022 14:26:11 -0400 Subject: [PATCH 4/4] Update examples/histogram.jl Co-authored-by: Takafumi Arakaki --- examples/histogram.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 65cc430e1..20778e92f 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -1,5 +1,5 @@ using KernelAbstractions, Test -using Atomix: @atomic, @atomicswap, @atomicreplace +using KernelAbstractions: @atomic, @atomicswap, @atomicreplace include(joinpath(@__DIR__, "utils.jl")) # Load backend