From 3e5e3a267f90833ddf05f52a3f33a2ab5d34fa12 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Wed, 25 May 2022 11:03:11 +0200
Subject: [PATCH 1/4] atting atomic attempts with atomix

---
 Project.toml                       |   3 +
 examples/histogram.jl              | 119 +++++++++++++++++++++++++++++
 lib/CUDAKernels/Project.toml       |   2 +
 lib/CUDAKernels/src/CUDAKernels.jl |   1 +
 src/KernelAbstractions.jl          |   1 +
 5 files changed, 126 insertions(+)
 create mode 100644 examples/histogram.jl

diff --git a/Project.toml b/Project.toml
index 81f5514a6..4bb18942a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,12 +5,15 @@ version = "0.8.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [compat]
 Adapt = "0.4, 1.0, 2.0, 3.0"
diff --git a/examples/histogram.jl b/examples/histogram.jl
new file mode 100644
index 000000000..65cc430e1
--- /dev/null
+++ b/examples/histogram.jl
@@ -0,0 +1,119 @@
+using KernelAbstractions, Test
+using Atomix: @atomic, @atomicswap, @atomicreplace
+include(joinpath(@__DIR__, "utils.jl")) # Load backend
+
+
+# Function to use as a baseline for CPU metrics
+function create_histogram(input)
+    histogram_output = zeros(Int, maximum(input))
+    for i = 1:length(input)
+        histogram_output[input[i]] += 1
+    end
+    return histogram_output
+end
+
+# This is a 1D histogram kernel where the histogramming happens on shmem
+@kernel function histogram_kernel!(histogram_output, input)
+    tid = @index(Global, Linear)
+    lid = @index(Local, Linear)
+
+    @uniform warpsize = Int(32)
+
+    @uniform gs = @groupsize()[1]
+    @uniform N = length(histogram_output)
+
+    shared_histogram = @localmem Int (gs)
+
+    # This will go through all input elements and assign them to a location in
+    # shmem. Note that if there is not enough shmem, we create different shmem
+    # blocks to write to. For example, if shmem is of size 256, but it's
+    # possible to get a value of 312, then we will have 2 separate shmem blocks,
+    # one from 1->256, and another from 257->512
+    @uniform max_element = 1
+    for min_element = 1:gs:N
+
+        # Setting shared_histogram to 0
+        @inbounds shared_histogram[lid] = 0
+        @synchronize()
+
+        max_element = min_element + gs
+        if max_element > N
+            max_element = N+1
+        end
+
+        # Defining bin on shared memory and writing to it if possible
+        bin = input[tid]
+        if bin >= min_element && bin < max_element
+            bin -= min_element-1
+            GC.@preserve shared_histogram begin
+                 @atomic shared_histogram[bin] += 1
+            end
+        end
+
+        @synchronize()
+
+        if ((lid+min_element-1) <= N)
+            @atomic histogram_output[lid+min_element-1] += shared_histogram[lid]
+        end
+
+    end
+
+end
+
+function histogram!(histogram_output, input;
+                    numcores = 4, numthreads = 256)
+
+    if isa(input, Array)
+        kernel! = histogram_kernel!(CPU(), numcores)
+    else
+        kernel! = histogram_kernel!(CUDADevice(), numthreads)
+    end
+
+    kernel!(histogram_output, input, ndrange=size(input))
+end
+
+@testset "histogram tests" begin
+
+    rand_input = [rand(1:128) for i = 1:1000]
+    linear_input = [i for i = 1:1024]
+    all_2 = [2 for i = 1:512]
+
+    histogram_rand_baseline = create_histogram(rand_input)
+    histogram_linear_baseline = create_histogram(linear_input)
+    histogram_2_baseline = create_histogram(all_2)
+
+    if Base.VERSION >= v"1.7.0"
+        CPU_rand_histogram = zeros(Int, 128)
+        CPU_linear_histogram = zeros(Int, 1024)
+        CPU_2_histogram = zeros(Int, 2)
+
+        wait(histogram!(CPU_rand_histogram, rand_input))
+        wait(histogram!(CPU_linear_histogram, linear_input))
+        wait(histogram!(CPU_2_histogram, all_2))
+
+        @test isapprox(CPU_rand_histogram, histogram_rand_baseline)
+        @test isapprox(CPU_linear_histogram, histogram_linear_baseline)
+        @test isapprox(CPU_2_histogram, histogram_2_baseline)
+    end
+
+    if has_cuda_gpu()
+        CUDA.allowscalar(false)
+
+        GPU_rand_input = CuArray(rand_input)
+        GPU_linear_input = CuArray(linear_input)
+        GPU_2_input = CuArray(all_2)
+
+        GPU_rand_histogram = CuArray(zeros(Int, 128))
+        GPU_linear_histogram = CuArray(zeros(Int, 1024))
+        GPU_2_histogram = CuArray(zeros(Int, 2))
+
+        wait(histogram!(GPU_rand_histogram, GPU_rand_input))
+        wait(histogram!(GPU_linear_histogram, GPU_linear_input))
+        wait(histogram!(GPU_2_histogram, GPU_2_input))
+
+        @test isapprox(Array(GPU_rand_histogram), histogram_rand_baseline)
+        @test isapprox(Array(GPU_linear_histogram), histogram_linear_baseline)
+        @test isapprox(Array(GPU_2_histogram), histogram_2_baseline)
+    end
+
+end
diff --git a/lib/CUDAKernels/Project.toml b/lib/CUDAKernels/Project.toml
index 095f37553..3b26a8315 100644
--- a/lib/CUDAKernels/Project.toml
+++ b/lib/CUDAKernels/Project.toml
@@ -7,7 +7,9 @@ version = "0.4.1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [compat]
 Adapt = "3.0"
diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 43337d730..a9dae97ac 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -5,6 +5,7 @@ import StaticArrays
 import StaticArrays: MArray
 import Adapt
 import KernelAbstractions
+import UnsafeAtomicsLLVM
 
 export CUDADevice
 
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 5d98efe88..b698be197 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -5,6 +5,7 @@ export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize,
 export Device, GPU, CPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
+using Atomix: @atomic, @atomicswap, @atomicreplace
 
 using LinearAlgebra
 using MacroTools

From 7b9a7aabc47a2ae03da8683bdf745db938fb4597 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Wed, 25 May 2022 12:29:00 +0200
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Takafumi Arakaki <takafumi.a@gmail.com>
---
 Project.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 4bb18942a..6613a8681 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,13 +7,11 @@ version = "0.8.0"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [compat]
 Adapt = "0.4, 1.0, 2.0, 3.0"

From b0f7374e5feda67cda796c4dc70d0d6e37db4f59 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Wed, 25 May 2022 14:31:03 +0200
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Valentin Churavy <vchuravy@users.noreply.github.com>
---
 lib/CUDAKernels/Project.toml | 1 -
 src/KernelAbstractions.jl    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/CUDAKernels/Project.toml b/lib/CUDAKernels/Project.toml
index 3b26a8315..c85ec27b3 100644
--- a/lib/CUDAKernels/Project.toml
+++ b/lib/CUDAKernels/Project.toml
@@ -7,7 +7,6 @@ version = "0.4.1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index b698be197..0c0084734 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -5,7 +5,7 @@ export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize,
 export Device, GPU, CPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
-using Atomix: @atomic, @atomicswap, @atomicreplace
+import Atomix: @atomic, @atomicswap, @atomicreplace
 
 using LinearAlgebra
 using MacroTools

From 93842383553c31faa66db178bfe9db4c9c82b16c Mon Sep 17 00:00:00 2001
From: Valentin Churavy <vchuravy@users.noreply.github.com>
Date: Tue, 31 May 2022 14:26:11 -0400
Subject: [PATCH 4/4] Update examples/histogram.jl

Co-authored-by: Takafumi Arakaki <takafumi.a@gmail.com>
---
 examples/histogram.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/histogram.jl b/examples/histogram.jl
index 65cc430e1..20778e92f 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -1,5 +1,5 @@
 using KernelAbstractions, Test
-using Atomix: @atomic, @atomicswap, @atomicreplace
+using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
 include(joinpath(@__DIR__, "utils.jl")) # Load backend