Skip to content

Commit c5fa0c7

Browse files
committed
adding basic atomic interface to KA
adding GC checks
1 parent fcc4f57 commit c5fa0c7

File tree

7 files changed

+594
-0
lines changed

7 files changed

+594
-0
lines changed

examples/histogram.jl

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
using KernelAbstractions, Test
2+
include(joinpath(@__DIR__, "utils.jl")) # Load backend
3+
4+
5+
"""
    create_histogram(input)

Serial reference implementation used as the CPU baseline for the kernels below.

Counts how often each value occurs in `input`, whose elements are used directly
as 1-based bin indices. The result is a `Vector{Int}` with `maximum(input)`
bins; an empty `input` produces an empty histogram.
"""
function create_histogram(input)
    # `maximum` throws on an empty collection, so handle that case up front.
    isempty(input) && return Int[]

    histogram_output = zeros(Int, maximum(input))
    for bin in input
        histogram_output[bin] += 1
    end
    return histogram_output
end
13+
14+
# This is a 1D histogram kernel where the histogramming happens on shmem
15+
# Accumulates the values of `input` (used as 1-based bin indices) into
# `histogram_output`. Each work-item reads one input element, counts it into a
# group-local histogram with `atomic_add!`, and the group-local counts are then
# added atomically to `histogram_output`.
@kernel function histogram_kernel!(histogram_output, input)
    tid = @index(Global, Linear)
    lid = @index(Local, Linear)

    @uniform gs = @groupsize()[1]
    @uniform N = length(histogram_output)

    # One slot per work-item of the group.
    shared_histogram = @localmem Int (gs)

    # This will go through all input elements and assign them to a location in
    # shmem. Note that if there is not enough shmem, we create different shmem
    # blocks to write to. For example, if shmem is of size 256, but it's
    # possible to get a value of 312, then we will have 2 separate shmem blocks,
    # one from 1->256, and another from 257->512
    @uniform max_element = 1
    for min_element = 1:gs:N

        # Setting shared_histogram to 0
        @inbounds shared_histogram[lid] = 0
        @synchronize()

        # This pass handles the bins min_element <= bin < max_element,
        # clamped so that it never reaches past the last output bin N.
        max_element = min_element + gs
        if max_element > N
            max_element = N+1
        end

        # Defining bin on shared memory and writing to it if possible
        bin = input[tid]
        if bin >= min_element && bin < max_element
            bin -= min_element-1
            GC.@preserve shared_histogram begin
                atomic_add!(pointer(shared_histogram, bin), Int(1))
            end
        end

        @synchronize()

        # Flush this work-item's slot into the matching output bin. The raw
        # pointer is only valid while `histogram_output` is kept alive, hence
        # the same GC.@preserve guard used for the shared histogram above.
        if ((lid+min_element-1) <= N)
            GC.@preserve histogram_output begin
                atomic_add!(pointer(histogram_output, lid+min_element-1),
                            shared_histogram[lid])
            end
        end

    end

end
62+
63+
# Launches `histogram_kernel!` over every element of `input`, accumulating into
# `histogram_output`, and returns whatever the kernel launch returns.
# A plain `Array` input selects the CPU backend with a workgroup size of
# `numcores`; any other input selects the CUDA backend with a workgroup size of
# `numthreads`.
function histogram!(histogram_output, input; numcores = 4, numthreads = 256)
    kernel! = input isa Array ?
        histogram_kernel!(CPU(), numcores) :
        histogram_kernel!(CUDADevice(), numthreads)

    return kernel!(histogram_output, input, ndrange=size(input))
end
74+
75+
@testset "histogram tests" begin

    rand_input = [rand(1:128) for i = 1:1000]
    linear_input = [i for i = 1:1024]
    all_2 = [2 for i = 1:512]

    # `create_histogram` sizes its result by `maximum(input)`. The random input
    # is not guaranteed to contain the value 128, so the baseline is padded with
    # empty bins up to the 128 bins the kernels write into; otherwise the
    # comparisons below would occasionally see arrays of different lengths.
    rand_counts = create_histogram(rand_input)
    histogram_rand_baseline = vcat(rand_counts, zeros(Int, 128 - length(rand_counts)))
    histogram_linear_baseline = create_histogram(linear_input)
    histogram_2_baseline = create_histogram(all_2)

    # The CPU path is only exercised on Julia 1.7.0 or newer.
    if Base.VERSION >= v"1.7.0"
        CPU_rand_histogram = zeros(Int, 128)
        CPU_linear_histogram = zeros(Int, 1024)
        CPU_2_histogram = zeros(Int, 2)

        wait(histogram!(CPU_rand_histogram, rand_input))
        wait(histogram!(CPU_linear_histogram, linear_input))
        wait(histogram!(CPU_2_histogram, all_2))

        # Bin counts are integers, so the comparison is exact.
        @test CPU_rand_histogram == histogram_rand_baseline
        @test CPU_linear_histogram == histogram_linear_baseline
        @test CPU_2_histogram == histogram_2_baseline
    end

    if has_cuda_gpu()
        CUDA.allowscalar(false)

        GPU_rand_input = CuArray(rand_input)
        GPU_linear_input = CuArray(linear_input)
        GPU_2_input = CuArray(all_2)

        GPU_rand_histogram = CuArray(zeros(Int, 128))
        GPU_linear_histogram = CuArray(zeros(Int, 1024))
        GPU_2_histogram = CuArray(zeros(Int, 2))

        wait(histogram!(GPU_rand_histogram, GPU_rand_input))
        wait(histogram!(GPU_linear_histogram, GPU_linear_input))
        wait(histogram!(GPU_2_histogram, GPU_2_input))

        # Copy the results back to the host before comparing.
        @test Array(GPU_rand_histogram) == histogram_rand_baseline
        @test Array(GPU_linear_histogram) == histogram_linear_baseline
        @test Array(GPU_2_histogram) == histogram_2_baseline
    end

end

lib/CUDAKernels/src/CUDAKernels.jl

+26
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ else
320320
end
321321

322322
import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize, __size
323+
import KernelAbstractions: atomic_add!, atomic_and!, atomic_cas!, atomic_dec!, atomic_inc!, atomic_max!, atomic_min!, atomic_op!, atomic_or!, atomic_sub!, atomic_xchg!, atomic_xor!
323324

324325
###
325326
# GPU implementation of shared memory
@@ -381,4 +382,29 @@ end
381382
CUDA.ptx_isa_version(args...)
382383
end
383384

385+
###
386+
# GPU implementation of atomics
387+
###
388+
389+
# Pairs every KernelAbstractions atomic with the CUDA function of the same name.
afxs = Dict(
    atomic_add!  => CUDA.atomic_add!,
    atomic_and!  => CUDA.atomic_and!,
    atomic_cas!  => CUDA.atomic_cas!,
    atomic_dec!  => CUDA.atomic_dec!,
    atomic_inc!  => CUDA.atomic_inc!,
    atomic_max!  => CUDA.atomic_max!,
    atomic_min!  => CUDA.atomic_min!,
    atomic_op!   => CUDA.atomic_op!,
    atomic_or!   => CUDA.atomic_or!,
    atomic_sub!  => CUDA.atomic_sub!,
    atomic_xchg! => CUDA.atomic_xchg!,
    atomic_xor!  => CUDA.atomic_xor!
)

# Inside a CUDACtx, a call to one of the atomics above is forwarded, with the
# same arguments, to its CUDA counterpart.
for (ka_atomic, cuda_atomic) in afxs
    @inline Cassette.overdub(::CUDACtx, ::typeof(ka_atomic), args...) =
        cuda_atomic(args...)
end
409+
384410
end

src/KernelAbstractions.jl

+4
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,10 @@ include("extras/extras.jl")
482482

483483
include("reflection.jl")
484484

485+
# Atomics
486+
487+
include("atomics.jl")
488+
485489
# CPU backend
486490

487491
include("cpu.jl")

src/atomics.jl

+203
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
###
2+
# Atomics
3+
###
4+
5+
export atomic_add!, atomic_sub!, atomic_and!, atomic_or!, atomic_xor!,
6+
atomic_min!, atomic_max!, atomic_inc!, atomic_dec!, atomic_xchg!,
7+
atomic_op!, atomic_cas!
8+
9+
# helper functions for inc(rement) and dec(rement)
10+
# Wrapping decrement: yields `b` when `a` is zero or already above `b`,
# otherwise `a` minus one (in the type `T`).
function dec(a::T, b::T) where T
    if (a == 0) || (a > b)
        return b
    end
    return a - T(1)
end
13+
14+
# Wrapping increment: yields zero (in the type `T`) once `a` has reached or
# passed `b`, otherwise `a` plus one.
function inc(a::T, b::T) where T
    if a >= b
        return T(0)
    end
    return a + T(1)
end
17+
18+
# arithmetic, bitwise, min/max, and inc/dec operations
19+
# Maps the name of each generated atomic to the binary operation it applies to
# the stored value and the supplied operand.
const ops = Dict(
    :atomic_add! => +,
    :atomic_sub! => -,
    :atomic_and! => &,
    :atomic_or!  => |,
    # Spelled `xor` (the named form of the `⊻` operator) so that the entry
    # does not depend on a non-ASCII character.
    :atomic_xor! => xor,
    :atomic_min! => min,
    :atomic_max! => max,
    :atomic_inc! => inc,
    :atomic_dec! => dec,
)
30+
31+
# Note: the type T prevents type conversion (for example, Float32 -> Float64)
32+
# can lead to errors if b is chosen to be of a different, compatible type
33+
# Generates one CPU method per entry of `ops`. Each method atomically replaces
# the value at `ptr` with `op(old, b)` using monotonic ordering and returns the
# previous value `old`.
for (name, op) in ops
    @eval @inline function $name(ptr::Ptr{T}, b::T) where T
        # `atomic_pointermodify` yields the pair `old => new`; only the previous
        # value is returned so the result matches the documented contract.
        modified = Core.Intrinsics.atomic_pointermodify(ptr, $op, b, :monotonic)
        return modified.first
    end
end
38+
39+
"""
40+
atomic_cas!(ptr::Ptr{T}, cmp::T, val::T)
41+
42+
This is an atomic Compare And Swap (CAS).
43+
It reads the value `old` located at address `ptr` and compares it with `cmp`.
44+
If `old` equals `cmp`, it stores `val` at the same address.
45+
Otherwise, doesn't change the value `old`.
46+
These operations are performed in one atomic transaction.
47+
The function returns `old`.
48+
49+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
50+
Additionally, on GPU hardware with compute capability 7.0+, values of type UInt16 are supported.
51+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
52+
"""
53+
function atomic_cas!(ptr::Ptr{T}, old::T, new::T) where T
    # Atomically store `new` at `ptr` if the value found there equals `old`
    # (acquire-release ordering on success, monotonic on failure).
    outcome = Core.Intrinsics.atomic_pointerreplace(ptr, old, new, :acquire_release, :monotonic)
    # The intrinsic reports `(old, success)`; only the value that was found at
    # `ptr` is returned, whether or not the swap took place.
    return outcome.old
end
56+
57+
"""
58+
atomic_xchg!(ptr::Ptr{T}, val::T)
59+
60+
This is an atomic exchange.
61+
It reads the value `old` located at address `ptr` and stores `val` at the same address.
62+
These operations are performed in one atomic transaction. The function returns `old`.
63+
64+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
65+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
66+
"""
67+
function atomic_xchg!(ptr::Ptr{T}, b::T) where T
    # Atomically (monotonic ordering) put `b` at `ptr` and hand back the value
    # that was stored there before.
    previous = Core.Intrinsics.atomic_pointerswap(ptr, b, :monotonic)
    return previous
end
70+
71+
"""
72+
atomic_op!(ptr::Ptr{T}, op, val::T)
73+
74+
This is an arbitrary atomic operation.
75+
It reads the value `old` located at address `ptr`, computes `op(old, val)` for the user-supplied function `op`, and stores the result back to memory at the same address.
76+
These operations are performed in one atomic transaction. The function returns `old`.
77+
78+
This function is somewhat experimental.
79+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
80+
"""
81+
function atomic_op!(ptr::Ptr{T}, op, b::T) where T
    # Atomically (monotonic ordering) replace the value at `ptr` with
    # `op(old, b)`.
    modified = Core.Intrinsics.atomic_pointermodify(ptr, op, b, :monotonic)
    # The intrinsic yields the pair `old => new`; only the previous value is
    # returned.
    return modified.first
end
84+
85+
# Other Documentation
86+
87+
"""
88+
atomic_add!(ptr::Ptr{T}, val::T)
89+
90+
This is an atomic addition.
91+
It reads the value `old` located at address `ptr`, computes `old + val`, and stores the result back to memory at the same address.
92+
These operations are performed in one atomic transaction.
93+
The function returns `old`.
94+
95+
This operation is supported for values of type Int32, Int64, UInt32, UInt64, and Float32.
96+
Additionally, on GPU hardware with compute capability 6.0+, values of type Float64 are supported.
97+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
98+
"""
99+
atomic_add!
100+
101+
"""
102+
atomic_sub!(ptr::Ptr{T}, val::T)
103+
104+
This is an atomic subtraction.
105+
It reads the value `old` located at address `ptr`, computes `old - val`, and stores the result back to memory at the same address.
106+
These operations are performed in one atomic transaction.
107+
The function returns `old`.
108+
109+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
110+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
111+
"""
112+
atomic_sub!
113+
114+
"""
115+
atomic_and!(ptr::Ptr{T}, val::T)
116+
117+
This is an atomic and.
118+
It reads the value `old` located at address `ptr`, computes `old & val`, and stores the result back to memory at the same address.
119+
These operations are performed in one atomic transaction.
120+
The function returns `old`.
121+
122+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
123+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
124+
"""
125+
atomic_and!
126+
127+
"""
128+
atomic_or!(ptr::Ptr{T}, val::T)
129+
130+
This is an atomic or.
131+
It reads the value `old` located at address `ptr`, computes `old | val`, and stores the result back to memory at the same address.
132+
These operations are performed in one atomic transaction.
133+
The function returns `old`.
134+
135+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
136+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
137+
"""
138+
atomic_or!
139+
140+
"""
141+
atomic_xor!(ptr::Ptr{T}, val::T)
142+
143+
This is an atomic xor.
144+
It reads the value `old` located at address `ptr`, computes `old ⊻ val`, and stores the result back to memory at the same address.
145+
These operations are performed in one atomic transaction.
146+
The function returns `old`.
147+
148+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
149+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
150+
"""
151+
atomic_xor!
152+
153+
"""
154+
atomic_min!(ptr::Ptr{T}, val::T)
155+
156+
This is an atomic min.
157+
It reads the value `old` located at address `ptr`, computes `min(old, val)`, and stores the result back to memory at the same address.
158+
These operations are performed in one atomic transaction.
159+
The function returns `old`.
160+
161+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
162+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
163+
"""
164+
atomic_min!
165+
166+
"""
167+
atomic_max!(ptr::Ptr{T}, val::T)
168+
169+
This is an atomic max.
170+
It reads the value `old` located at address `ptr`, computes `max(old, val)`, and stores the result back to memory at the same address.
171+
These operations are performed in one atomic transaction.
172+
The function returns `old`.
173+
174+
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
175+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
176+
"""
177+
atomic_max!
178+
179+
"""
180+
atomic_inc!(ptr::Ptr{T}, val::T)
181+
182+
This is an atomic increment function that counts up to a certain number before starting again at 0.
183+
It reads the value `old` located at address `ptr`, computes `((old >= val) ? 0 : (old+1))`, and stores the result back to memory at the same address.
184+
These three operations are performed in one atomic transaction.
185+
The function returns `old`.
186+
187+
This operation is only supported for values of type Int32.
188+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
189+
"""
190+
atomic_inc!
191+
192+
"""
193+
atomic_dec!(ptr::Ptr{T}, val::T)
194+
195+
This is an atomic decrement function that counts down to 0 from a defined value `val`.
196+
It reads the value `old` located at address `ptr`, computes `(((old == 0) | (old > val)) ? val : (old-1))`, and stores the result back to memory at the same address.
197+
These three operations are performed in one atomic transaction.
198+
The function returns `old`.
199+
200+
This operation is only supported for values of type Int32.
201+
Also: atomic operations for the CPU requires a Julia version of 1.7.0 or above.
202+
"""
203+
atomic_dec!

0 commit comments

Comments
 (0)