File tree 2 files changed +24
-6
lines changed
2 files changed +24
-6
lines changed Original file line number Diff line number Diff line change @@ -56,14 +56,21 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
56
56
_Pragma ("omp end declare variant" );
57
57
_Pragma ("omp end declare target" );
58
58
59
+
60
+
59
61
#if defined(__NVPTX__ )
60
- #include <nvptxintrin.h>
62
+ #include <nvptxintrin.h>
61
63
#elif defined(__AMDGPU__ )
62
- #include <amdgpuintrin.h>
64
+ #if defined(__SPIRV64__ )
65
+ // the spirv64-amd-amdhsa triple claims to be amdgpu and spirv
66
+ #include <spirvintrin.h>
67
+ #else
68
+ #include <amdgpuintrin.h>
69
+ #endif
63
70
#elif defined(__SPIRV64__ )
64
- #include <spirvintrin.h>
71
+ #include <spirvintrin.h>
65
72
#elif !defined(_OPENMP )
66
- #error "This header is only meant to be used on GPU architectures."
73
+ #error "This header is only meant to be used on GPU architectures."
67
74
#endif
68
75
69
76
_Pragma ("omp begin declare target device_type(nohost)" );
Original file line number Diff line number Diff line change 37
37
#define __gpu_kernel
38
38
39
39
// Returns the number of workgroups in the 'x' dimension of the grid.
40
- _DEFAULT_FN_ATTRS uint32_t __gpu_num_blocks_x (void );
40
+ _DEFAULT_FN_ATTRS uint32_t __gpu_num_blocks_x (void ) {
41
+ #if defined(__AMDGPU__ )
42
+ return __builtin_amdgcn_grid_size_x () / __builtin_amdgcn_workgroup_size_x ();
43
+ #else
44
+ #error "Not yet implemented"
45
+ #endif
46
+ }
41
47
42
48
// Returns the number of workgroups in the 'y' dimension of the grid.
43
49
_DEFAULT_FN_ATTRS uint32_t __gpu_num_blocks_y (void );
@@ -83,7 +89,12 @@ _DEFAULT_FN_ATTRS uint64_t __gpu_lane_mask(void);
83
89
84
90
// Copies the value from the first active thread in the wave to the rest.
85
91
_DEFAULT_FN_ATTRS uint32_t __gpu_read_first_lane_u32 (uint64_t __lane_mask ,
86
- uint32_t __x );
92
+ uint32_t __x ) {
93
+
94
+ uint64_t __gpu_read_first_lane_u32_impl (
95
+ uint64_t __lane_mask , uint32_t __x ) asm("llvm.spv.wave.readlane.i64" );
96
+ return __gpu_read_first_lane_u32_impl (__lane_mask , __x );
97
+ }
87
98
88
99
// Returns a bitmask of threads in the current lane for which \p x is true.
89
100
_DEFAULT_FN_ATTRS uint64_t __gpu_ballot (uint64_t __lane_mask , bool __x );
You can’t perform that action at this time.
0 commit comments