-
Notifications
You must be signed in to change notification settings - Fork 64
Global scan #665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Global scan #665
Changes from all commits
a34827a
d61a2aa
0eec0ee
2185b4e
2df4ad7
20a60f5
960dbf2
2be8f13
6366697
26cb75d
7117cc3
fa979c7
53e1655
0bcc325
71f4398
51b4f74
4b29049
277db27
2b0c1a2
056cbaf
3d29bc3
4f9f9ee
af438bc
6c45bc6
61d4806
44edb59
14d66a6
cdcc5d0
74f6dab
1565fb8
e16a195
734e84a
40953e3
cc43691
3a5d1ff
658ac5b
d4a947d
03ae90a
3d252d0
dd2cc09
ce78fc0
f6d7adb
0d71686
3ce449d
6a0c6c6
e467ece
5dae823
79ec513
7f4fcd5
462fec5
3f9bdd8
201636d
21bfa0f
0a3728c
1c32528
c9519c3
9f9ae44
d21e216
a53fb5c
82bd9a1
87ae80f
6e740b3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
+6 −0 | 14_ComputeReduce/CMakeLists.txt | |
+28 −0 | 14_ComputeReduce/config.json.template | |
+276 −0 | 14_ComputeReduce/main.cpp | |
+50 −0 | 14_ComputeReduce/pipeline.groovy | |
+6 −0 | 15_ComputeScan/CMakeLists.txt | |
+28 −0 | 15_ComputeScan/config.json.template | |
+340 −0 | 15_ComputeScan/main.cpp | |
+50 −0 | 15_ComputeScan/pipeline.groovy | |
+3 −1 | CMakeLists.txt | |
+2 −2 | Readme.md |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,64 @@ | ||
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ | ||
#define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ | ||
|
||
// REVIEW: Not sure if this file is needed in HLSL implementation | ||
|
||
#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/cpp_compat.hlsl" | ||
|
||
#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ | ||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
Parameters_t getParameters(); | ||
} | ||
} | ||
} | ||
#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ | ||
#ifndef NBL_BUILTIN_MAX_LEVELS | ||
#define NBL_BUILTIN_MAX_LEVELS 7 | ||
#endif | ||
|
||
#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ | ||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
template<typename Storage_t> | ||
void getData( | ||
inout Storage_t data, | ||
in uint levelInvocationIndex, | ||
in uint localWorkgroupIndex, | ||
in uint treeLevel, | ||
in uint pseudoLevel | ||
); | ||
} | ||
} | ||
} | ||
#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ | ||
#endif | ||
// REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we have |
||
struct Parameters_t { | ||
uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add documentation about what |
||
uint32_t topLevel; | ||
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_LEVELS/2]; | ||
}; | ||
|
||
Parameters_t getParameters(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why have a forward declaration for this!? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMHO you need a constructor of |
||
|
||
#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ | ||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
template<typename Storage_t> | ||
void setData( | ||
in Storage_t data, | ||
in uint levelInvocationIndex, | ||
in uint localWorkgroupIndex, | ||
in uint treeLevel, | ||
in uint pseudoLevel, | ||
in bool inRange | ||
); | ||
struct DefaultSchedulerParameters_t | ||
{ | ||
uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; | ||
uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; | ||
uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; | ||
Comment on lines
+33
to
+35
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure you need either |
||
|
||
}; | ||
|
||
DefaultSchedulerParameters_t getSchedulerParameters(); | ||
|
||
template<typename Storage_t, bool isExclusive=false> | ||
void getData( | ||
NBL_REF_ARG(Storage_t) data, | ||
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, | ||
NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, | ||
NBL_CONST_REF_ARG(uint32_t) treeLevel, | ||
NBL_CONST_REF_ARG(uint32_t) pseudoLevel | ||
); | ||
|
||
template<typename Storage_t, bool isScan> | ||
void setData( | ||
NBL_CONST_REF_ARG(Storage_t) data, | ||
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, | ||
NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, | ||
NBL_CONST_REF_ARG(uint32_t) treeLevel, | ||
NBL_CONST_REF_ARG(uint32_t) pseudoLevel, | ||
NBL_CONST_REF_ARG(bool) inRange | ||
); | ||
|
||
} | ||
} | ||
} | ||
#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ | ||
#endif | ||
|
||
#endif |
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,118 @@ | ||
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_ | ||
#define _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_ | ||
|
||
// coherent -> globallycoherent | ||
#include "nbl/builtin/hlsl/scan/declarations.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/basic.hlsl" | ||
|
||
// coherent -> globallycoherent | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
|
||
// Element type of the `scanScratchBuf` structured buffer declared below. | ||
// `dataElementCount` sizes the `data` array and defaults to SCRATCH_EL_CNT - NBL_BUILTIN_MAX_LEVELS | ||
// (SCRATCH_EL_CNT is not defined in this file and must be provided by the includer). | ||
template<uint32_t dataElementCount=SCRATCH_EL_CNT - NBL_BUILTIN_MAX_LEVELS> | ||
struct Scratch | ||
{ | ||
// Written by setData() when reducing (!isScan) and treeLevel equals params.topLevel. | ||
uint32_t reduceResult; | ||
// One counter slot per level; not read or written anywhere in this file. | ||
uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; | ||
// Storage indexed by getData()/setData() using params.temporaryStorageOffset. | ||
uint32_t data[dataElementCount]; | ||
}; | ||
|
||
[[vk::binding(0 ,0)]] RWStructuredBuffer<Storage_t> scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use Buffer Device Address There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. alternatively you can use accessor pattern |
||
[[vk::binding(1 ,0)]] RWStructuredBuffer<Scratch> /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this one you need to use globallycoherent can't be used with VMM, but DXC doesn't support/emit/upgrade to VMM. IIRC you can mark individual loads/stores as coherent even before VMM (just no acquire/release, because that's VMM) with SPIR-V intrinsics, and that should be enough. Also, because the scratch needs to be coherent, it only makes sense for it to come from a buffer, and you might as well use BDA for it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Even without VMM you can use Release/Acquire/SeqCst. The only difference between pre- and post-VMM is that Volatile used to be a Memory Operand, and in VMM it's a Memory Semantic. https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_memory_semantics_id |
||
|
||
// Loads one element for the given invocation into `data`. | ||
// - treeLevel == 0: reads scanBuffer[levelInvocationIndex]. | ||
// - treeLevel != 0: reads scanScratchBuf[0].data[levelInvocationIndex + params.temporaryStorageOffset[treeLevel-1]]. | ||
// `levelWorkgroupIndex` and `isExclusive` are only referenced by the commented-out path below. | ||
// NOTE(review): the getData declaration shown for the scan declarations header also takes a | ||
// `pseudoLevel` argument, which this definition does not have — confirm which signature is intended. | ||
template<typename Storage_t, bool isExclusive> | ||
void getData( | ||
NBL_REF_ARG(Storage_t) data, | ||
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, | ||
NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, | ||
NBL_CONST_REF_ARG(uint32_t) treeLevel | ||
) | ||
{ | ||
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders | ||
|
||
uint32_t offset = levelInvocationIndex; | ||
// True for every non-zero treeLevel; only the zero case is excluded by this test. | ||
const bool notFirstOrLastLevel = bool(treeLevel); | ||
if (notFirstOrLastLevel) | ||
offset += params.temporaryStorageOffset[treeLevel-1u]; | ||
|
||
// Disabled downsweep/scan path, kept for reference. It refers to `pseudoLevel`, | ||
// which is not a parameter of this definition. | ||
//if (pseudoLevel!=treeLevel) // downsweep/scan | ||
//{ | ||
// const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; | ||
// if (bool(levelWorkgroupIndex) && firstInvocationInGroup) | ||
// data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]]; | ||
// | ||
// if (notFirstOrLastLevel) | ||
// { | ||
// if (!firstInvocationInGroup) | ||
// data = scanScratchBuf[0].data[offset-1u]; | ||
// } | ||
// else | ||
// { | ||
// if(isExclusive) | ||
// { | ||
// if (!firstInvocationInGroup) | ||
// data += scanBuffer[offset-1u]; | ||
// } | ||
// else | ||
// { | ||
// data += scanBuffer[offset]; | ||
// } | ||
// } | ||
//} | ||
//else | ||
//{ | ||
// Active path: non-zero levels read from scratch, level zero reads from the scan buffer. | ||
if (notFirstOrLastLevel) | ||
data = scanScratchBuf[0].data[offset]; | ||
else | ||
data = scanBuffer[offset]; | ||
//} | ||
} | ||
|
||
// Stores `data` for the given invocation; the destination depends on isScan and treeLevel: | ||
// - reduce (!isScan) with treeLevel < params.topLevel: only the invocation whose | ||
//   SubgroupContiguousIndex() equals gl_WorkGroupSize().x-1 writes, to | ||
//   scanScratchBuf[0].data[levelWorkgroupIndex + params.temporaryStorageOffset[treeLevel]]; | ||
//   `inRange` is not consulted on this path. | ||
// - every other case writes nothing unless `inRange` is true, and then: | ||
//   - reduce with treeLevel == params.topLevel: writes scanScratchBuf[0].reduceResult; | ||
//   - treeLevel != 0: writes scanScratchBuf[0].data[levelInvocationIndex + params.temporaryStorageOffset[treeLevel-1]]; | ||
//   - treeLevel == 0: writes scanBuffer[levelInvocationIndex]. | ||
template<typename Storage_t, bool isScan> | ||
void setData( | ||
NBL_CONST_REF_ARG(Storage_t) data, | ||
NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, | ||
NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, | ||
NBL_CONST_REF_ARG(uint32_t) treeLevel, | ||
NBL_CONST_REF_ARG(bool) inRange | ||
) | ||
{ | ||
const Parameters_t params = getParameters(); | ||
if (!isScan && treeLevel<params.topLevel) // is reduce and we're not at the last level (i.e. we still save into scratch) | ||
{ | ||
// A single invocation per workgroup (the one with the highest contiguous index) performs the store. | ||
const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex()==(glsl::gl_WorkGroupSize().x-1u); | ||
if (lastInvocationInGroup) | ||
scanScratchBuf[0u].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]] = data; | ||
} | ||
else if (inRange) | ||
{ | ||
if (!isScan && treeLevel == params.topLevel) | ||
{ | ||
// Reduce at the top level: store into the dedicated reduceResult slot. | ||
scanScratchBuf[0u].reduceResult = data; | ||
} | ||
// The following only for isScan == true | ||
else if (bool(treeLevel)) | ||
{ | ||
const uint32_t offset = params.temporaryStorageOffset[treeLevel-1u]; | ||
scanScratchBuf[0].data[levelInvocationIndex+offset] = data; | ||
} | ||
else | ||
{ | ||
scanBuffer[levelInvocationIndex] = data; | ||
} | ||
} | ||
} | ||
|
||
} | ||
} | ||
} | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,50 +1,92 @@ | ||
#ifndef _NBL_HLSL_WORKGROUP_SIZE_ | ||
#define _NBL_HLSL_WORKGROUP_SIZE_ 256 | ||
#endif | ||
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
#pragma shader_stage(compute) | ||
|
||
#include "nbl/builtin/hlsl/scan/descriptors.hlsl" | ||
#include "nbl/builtin/hlsl/functional.hlsl" | ||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" | ||
#include "nbl/builtin/hlsl/scan/declarations.hlsl" | ||
#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" | ||
#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" | ||
|
||
// ITEMS_PER_WG = WORKGROUP_SIZE | ||
static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic<WORKGROUP_SIZE>::value; | ||
|
||
// TODO: Can we make it a static variable? | ||
groupshared uint32_t wgScratch[SharedScratchSz]; | ||
|
||
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" | ||
|
||
// Accessor over the groupshared `wgScratch` array. | ||
// Every index passed in is shifted by the compile-time `offset` before touching the array. | ||
template<uint16_t offset> | ||
struct WGScratchProxy | ||
{ | ||
// Reads wgScratch[ix+offset] into `value`. | ||
void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) | ||
{ | ||
value = wgScratch[ix+offset]; | ||
} | ||
// Writes `value` to wgScratch[ix+offset]. | ||
void set(const uint32_t ix, const uint32_t value) | ||
{ | ||
wgScratch[ix+offset] = value; | ||
} | ||
|
||
// Forwards to nbl::hlsl::glsl::atomicAdd on wgScratch[ix+offset] and returns its result. | ||
uint32_t atomicAdd(uint32_t ix, uint32_t val) | ||
{ | ||
return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); | ||
} | ||
|
||
// Issues nbl::hlsl::glsl::barrier(); no separate shared-memory barrier call is made (see note inside). | ||
void workgroupExecutionAndMemoryBarrier() | ||
{ | ||
nbl::hlsl::glsl::barrier(); | ||
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above | ||
} | ||
}; | ||
static WGScratchProxy<0> accessor; | ||
|
||
// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 | ||
uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} | ||
|
||
// Aggregate used as the type of the `spc` push-constant block declared right after it. | ||
struct ScanPushConstants | ||
{ | ||
// Returned by nbl::hlsl::scan::getParameters(). | ||
nbl::hlsl::scan::Parameters_t scanParams; | ||
// Returned by nbl::hlsl::scan::getSchedulerParameters(). | ||
nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams; | ||
}; | ||
|
||
[[vk::push_constant]] | ||
ScanPushConstants spc; | ||
Comment on lines
+48
to
+55
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. everything affecting the pipeline layout should be userspace |
||
|
||
/** | ||
* Returns gl_WorkGroupID().x*WORKGROUP_SIZE + SubgroupContiguousIndex(). | ||
* Only the x component of the workgroup ID is used. | ||
* | ||
* Required because we rely on SubgroupContiguousIndex instead of | ||
* gl_LocalInvocationIndex: to match the global index we cannot use | ||
* gl_GlobalInvocationID and instead need an index based on | ||
* SubgroupContiguousIndex. | ||
*/ | ||
uint32_t globalIndex() | ||
{ | ||
return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
} | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace scan | ||
{ | ||
#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ | ||
cbuffer PC // REVIEW: register and packoffset selection | ||
{ | ||
Parameters_t scanParams; | ||
DefaultSchedulerParameters_t schedulerParams; | ||
}; | ||
#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ | ||
#endif | ||
|
||
#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ | ||
Parameters_t getParameters() | ||
{ | ||
return pc.scanParams; | ||
return spc.scanParams; | ||
} | ||
#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ | ||
#endif | ||
|
||
#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ | ||
DefaultSchedulerParameters_t getSchedulerParameters() | ||
{ | ||
return pc.schedulerParams; | ||
return spc.schedulerParams; | ||
} | ||
#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ | ||
#endif | ||
|
||
} | ||
} | ||
} | ||
|
||
#ifndef _NBL_HLSL_MAIN_DEFINED_ | ||
[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] | ||
void CSMain() | ||
[numthreads(WORKGROUP_SIZE,1,1)] | ||
void main() | ||
{ | ||
nbl::hlsl::scan::main(); | ||
} | ||
#define _NBL_HLSL_MAIN_DEFINED_ | ||
#endif | ||
nbl::hlsl::scan::main<BINOP<Storage_t>, Storage_t, IS_SCAN, IS_EXCLUSIVE, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should be a
NBL_CONSTEXPR_STATIC_INLINE
in Parameters_t
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I keep doing the same reviews
#665 (comment)