diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5ab4d368b6..8a281b7e34 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5ab4d368b666d365217c751f5610b496b828ff96 +Subproject commit 8a281b7e34513b3c142e895b96c522810d585ad9 diff --git a/examples_tests b/examples_tests index f4cc4cd22e..359e840943 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f4cc4cd22ee4bd5506d794e63caafddf974ed7a4 +Subproject commit 359e84094383f7f0a7d5463f00d71112a8d1ce87 diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 337db5e22f..4395fb44fe 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -178,6 +178,10 @@ void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } +void memoryBarrierBuffer() { + spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask); +} + namespace impl { diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 2d2e66e66d..6e6c0f5563 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -1,66 +1,64 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ // REVIEW: Not sure if this file is needed in HLSL implementation -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - +#include "nbl/builtin/hlsl/cpp_compat.hlsl" -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - Parameters_t getParameters(); -} -} -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ +#ifndef NBL_BUILTIN_MAX_LEVELS +#define NBL_BUILTIN_MAX_LEVELS 7 #endif -#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ namespace nbl { namespace hlsl { namespace scan { - template - void getData( - inout Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel - ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif + // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. 
(https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_LEVELS/2]; + }; + + Parameters_t getParameters(); -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - template - void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange - ); + struct DefaultSchedulerParameters_t + { + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; + uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; + uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; + + }; + + DefaultSchedulerParameters_t getSchedulerParameters(); + + template + void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel + ); + + template + void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange + ); + } } } -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 450368475d..7f341b4cc2 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -1,31 +1,13 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ #define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" namespace nbl { @@ -35,187 +17,181 @@ namespace scan { namespace scheduler { - /** - * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation - * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions - * that are used for synchronization etc. - * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is - * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of - * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and - * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. 
The next level of the Blelloch Scan will - * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results - * in the downsweep phase of Blelloch Scan. - * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. - * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. - * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the - * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. - * - * Parameter meaning: - * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer - * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) - * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan - * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) - * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps - * synchronizing between workgroups with while-loop spinning. - */ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) - { + /** + * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation + * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions + * that are used for synchronization etc. + * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is + * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of + * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and + * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will + * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results + * in the downsweep phase of Blelloch Scan. + * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. + * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. + * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the + * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. + * + * Parameter meaning: + * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer + * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) + * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. 
It is used when storing the REDUCTION result of each workgroup scan + * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) + * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps + * synchronizing between workgroups with while-loop spinning. + */ + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) - _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? - - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - i = next; - } - _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); - _schedulerParams.finishedFlagOffset[0] = 0u; - switch(_scanParams.topLevel) - { - case 1u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = 1u; - - _scanParams.temporaryStorageOffset[0] = 2u; - break; - case 2u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; - // climb down - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - break; - case 3u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; - // climb down - _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; - _schedulerParams.finishedFlagOffset[5] = 
_schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); - break; - default: - break; -#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 + const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); + _scanParams.lastElement[0] = elementCount-1u; + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; + + for (uint32_t i=0; i>workgroupSizeLog2; + i = next; + } + _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); + _schedulerParams.finishedFlagOffset[0] = 0u; + switch(_scanParams.topLevel) + { + case 1u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = 1u; + + _scanParams.temporaryStorageOffset[0] = 2u; + break; + case 2u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; + // climb down + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + break; + case 3u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; + // climb down + _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; + _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); + 
break; + default: + break; +#if NBL_BUILTIN_MAX_LEVELS>7 #error "Switch needs more cases" #endif - } + } #undef WorkgroupCount - } - - /** - * treeLevel - the current level in the Blelloch Scan - * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. - */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) - { - ScratchAccessor sharedScratch; - if(SubgroupContiguousIndex() == 0u) - { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); - } - else if (SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(SubgroupContiguousIndex(), 0u); - } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; - if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) + } + + /** + * treeLevel - the current level in the Blelloch Scan + * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. + * For example, if we have dispatched 10 workgroups and we the virtu al workgroup number is 35, then the localWorkgroupIndex should be 5. + */ + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) + { + const uint32_t lastLevel = topLevel<<1u; + if(workgroup::SubgroupContiguousIndex() == 0u) { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); + treeLevel = 0; + while (originalWGs>=params.cumulativeWorkgroupCount[treeLevel]) { // doesn't work for now because PushConstant arrays can only be accessed by dynamically uniform indices + treeLevel++; + } + sharedScratch.set(1u, treeLevel); } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? - - sharedScratch.get(1u, treeLevel); - if(treeLevel>lastLevel) - return true; - - localWorkgroupIndex = globalWorkgroupIndex; - const bool dependentLevel = treeLevel != 0u; - if(dependentLevel) - { - const uint prevLevel = treeLevel - 1u; - localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(SubgroupContiguousIndex() == 0u) - { - uint dependentsCount = 1u; - if(treeLevel <= topLevel) - { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? 
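To make the level bookkeeping above concrete, here is a short trace of the rewritten computeParameters() for an assumed input of 10000 elements and a 256-invocation workgroup (both numbers are hypothetical, chosen to mirror the 10000-element example in the comment block; every value below only restates what the code computes):

    // hypothetical trace, not part of the diff: elementCount == 10000, gl_WorkGroupSize().x == 256
    const uint32_t workgroupSizeLog2 = firstbithigh(256u);                   // 8
    uint32_t lastElement0 = 10000u - 1u;                                     // 9999
    uint32_t topLevel     = firstbithigh(lastElement0) / workgroupSizeLog2;  // 13/8 == 1
    uint32_t lastElement1 = lastElement0 >> workgroupSizeLog2;               // 39, so WorkgroupCount(0) == 40
    // the `case 1u:` branch of the switch then yields
    //   cumulativeWorkgroupCount == {40, 41, 81} -> 40 upsweep, 1 top-level and 40 downsweep virtual workgroups
    //   finishedFlagOffset[0..1] == {0, 1} and temporaryStorageOffset[0] == 2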
- const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; - if (lastWorkgroup) - { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader - dependentsCount = scanParams.lastElement[treeLevel]+1u; - if (treeLeveltopLevel) // !(prevLevellastLevel) + return true; + + localWorkgroupIndex = globalWorkgroupIndex; + const bool dependentLevel = treeLevel != 0u; + if(dependentLevel) + { + const uint32_t prevLevel = treeLevel - 1u; + localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; + if(workgroup::SubgroupContiguousIndex() == 0u) + { + uint32_t dependentsCount = 1u; + if(treeLevel <= topLevel) + { + dependentsCount = glsl::gl_WorkGroupSize().x; + const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; + if (lastWorkgroup) + { + const Parameters_t scanParams = getParameters(); + dependentsCount = scanParams.lastElement[treeLevel]+1u; + if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +// coherent -> globallycoherent + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scratch +{ + uint32_t reduceResult; + uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; + uint32_t data[dataElementCount]; +}; + +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(1 ,0)]] RWStructuredBuffer /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model + +template +void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel +) +{ + const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders + + uint32_t offset = levelInvocationIndex; + const bool notFirstOrLastLevel = bool(treeLevel); + if (notFirstOrLastLevel) + offset += params.temporaryStorageOffset[treeLevel-1u]; + + //if (pseudoLevel!=treeLevel) // downsweep/scan + //{ + // const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; + // if (bool(levelWorkgroupIndex) && firstInvocationInGroup) + // data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]]; + // + // if (notFirstOrLastLevel) + // { + // if (!firstInvocationInGroup) + // data = scanScratchBuf[0].data[offset-1u]; + // } + // else + // { + // if(isExclusive) + // { + // if (!firstInvocationInGroup) + // data += scanBuffer[offset-1u]; + // } + // else + // { + // data += scanBuffer[offset]; + // } + // } + //} + //else + //{ + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanBuffer[offset]; + //} +} + +template +void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(bool) inRange +) +{ + const Parameters_t params = getParameters(); + if (!isScan && treeLevel::value; + +// TODO: Can we make it a static variable? 
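Because scanScratchBuf above cannot be declared globallycoherent together with the Vulkan memory model, cross-workgroup visibility of its contents has to come from the glsl::memoryBarrierBuffer() helper added to glsl_compat/core.hlsl earlier in this diff. A condensed sketch of the polling pattern the schedulers later in this PR build on (level, totalWorkgroupsPerLevel and virtualWorkgroupIndex are stand-ins for members of those schedulers):

    // condensed from the schedulers' getWork(), assuming the bindings declared above
    glsl::memoryBarrierBuffer(); // make other workgroups' scratch writes visible before polling
    if (scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel)
        return false; // every virtual workgroup of this level has already been grabbed
    if (workgroup::SubgroupContiguousIndex() == 0u)
        virtualWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level], 1u); // grab one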
+groupshared uint32_t wgScratch[SharedScratchSz]; + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + +template +struct WGScratchProxy +{ + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) + { + value = wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } + + uint32_t atomicAdd(uint32_t ix, uint32_t val) + { + return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static WGScratchProxy<0> accessor; + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +struct ScanPushConstants +{ + nbl::hlsl::scan::Parameters_t scanParams; + nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams; +}; + +[[vk::push_constant]] +ScanPushConstants spc; + +/** + * Required since we rely on SubgroupContiguousIndex instead of + * gl_LocalInvocationIndex which means to match the global index + * we can't use the gl_GlobalInvocationID but an index based on + * SubgroupContiguousIndex. + */ +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} namespace nbl { @@ -12,39 +71,22 @@ namespace hlsl { namespace scan { -#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ - cbuffer PC // REVIEW: register and packoffset selection - { - Parameters_t scanParams; - DefaultSchedulerParameters_t schedulerParams; - }; -#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ -#endif - -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ Parameters_t getParameters() { - return pc.scanParams; + return spc.scanParams; } -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ DefaultSchedulerParameters_t getSchedulerParameters() { - return pc.schedulerParams; + return spc.schedulerParams; } -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif + } } } -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() +[numthreads(WORKGROUP_SIZE,1,1)] +void main() { - nbl::hlsl::scan::main(); -} -#define _NBL_HLSL_MAIN_DEFINED_ -#endif \ No newline at end of file + nbl::hlsl::scan::main, Storage_t, IS_SCAN, IS_EXCLUSIVE, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); +} \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/indirect.hlsl b/include/nbl/builtin/hlsl/scan/indirect.hlsl index 1191731f65..f330b1393d 100644 --- a/include/nbl/builtin/hlsl/scan/indirect.hlsl +++ b/include/nbl/builtin/hlsl/scan/indirect.hlsl @@ -1,8 +1,3 @@ -#ifndef _NBL_HLSL_WORKGROUP_SIZE_ -#define _NBL_HLSL_WORKGROUP_SIZE_ 256 -#define _NBL_HLSL_WORKGROUP_SIZE_LOG2_ 8 -#endif - #include "nbl/builtin/hlsl/scan/descriptors.hlsl" #include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" #include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" @@ -13,36 +8,33 @@ namespace hlsl { namespace scan { -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ + +uint32_t getIndirectElementCount(); + Parameters_t scanParams; + Parameters_t getParameters() { - return scanParams; + return scanParams; } -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif - -uint getIndirectElementCount(); -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ DefaultSchedulerParameters_t schedulerParams; + DefaultSchedulerParameters_t getSchedulerParameters() 
{ - scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams); + scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams); return schedulerParams; } -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif + } } } -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() +[numthreads(WORKGROUP_SIZE,1,1)] +void main() { - if (bool(nbl::hlsl::scan::getIndirectElementCount())) - nbl::hlsl::scan::main(); + if(bool(nbl::hlsl::scan::getIndirectElementCount())) { + // TODO call main from virtual_workgroup.hlsl + } } -#define _NBL_HLSL_MAIN_DEFINED_ #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl b/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl deleted file mode 100644 index bfeba13be2..0000000000 --- a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ -#define _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ - -#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) - struct Parameters_t { - uint lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; - uint topLevel; - uint temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - } -} -} -} - -#ifdef __cplusplus -#undef uint -#endif - -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl new file mode 100644 index 0000000000..ea283a271f --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl @@ -0,0 +1,69 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
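As with the old GLSL version, indirect.hlsl leaves getIndirectElementCount() as a bare declaration and will not compile until the including shader defines it. A hypothetical user-side definition, following the same out-of-namespace style that direct.hlsl uses for gl_WorkGroupSize() (the binding index and buffer are placeholders, not part of this PR):

    // hypothetical override supplied by the user of indirect.hlsl
    [[vk::binding(2,0)]] StructuredBuffer<uint32_t> elementCountBuf;

    uint32_t nbl::hlsl::scan::getIndirectElementCount()
    {
        return elementCountBuf[0]; // e.g. written by a previous dispatch
    }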
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" + +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" + +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template struct ScanScheduler +{ + + static ScanScheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + ScanScheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() + { + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if (scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex() == 0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level], 1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) { return uberWorkgroupIndex; } + + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0u + ? schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level - 1u]; + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst +}; +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl new file mode 100644 index 0000000000..99effbd88a --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -0,0 +1,110 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_HLSL_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCHEDULER_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scheduler +{ + static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + Scheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template + bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() + { + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex()==0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); + } + + template + bool markDone(NBL_REF_ARG(Accessor) accessor) + { + if (level==(params.topLevel)) // check if we reached the lastLevel + { + // not even sure this is needed + uberWorkgroupIndex = ~0u; + return true; + } + + uint32_t prevLevel = level; + if (workgroup::SubgroupContiguousIndex()==0u) + { + // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset + const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); + //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + { + level++; + } + } + level = workgroup::BroadcastFirst(level, accessor); + return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) + { + return uberWorkgroupIndex; + } + + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0 + ? 
schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + } + + void resetWorkgroupsStartedBuffer() + { + // could do scanScratchBuf[0u].workgroupsStarted[SubgroupContiguousIndex()] = 0u but don't know how many invocations are live during this call + if(workgroup::SubgroupContiguousIndex() == 0u) + { + for(uint32_t i = 0; i < params.topLevel; i++) + { + scanScratchBuf[0u].workgroupsStarted[i] = 0u; + } + } + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst +}; + +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 488bf29012..f9bd38a9e3 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -2,91 +2,128 @@ #define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ // TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor -#include "nbl/builtin/hlsl/limits/numeric.hlsl" -#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/scheduler.hlsl" +#include "nbl/builtin/hlsl/scan/scan_scheduler.hlsl" -#include "nbl/builtin/hlsl/binops.hlsl" - -#if 0 namespace nbl { namespace hlsl { namespace scan -{ - template - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) - { - const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); +{ + /** + * Finds the index on a higher level in the reduce tree which the + * levelInvocationIndex needs to add to level 0 + */ + template + uint32_t2 findHigherLevelIndex(uint32_t level, uint32_t levelInvocationIndex) + { + const uint32_t WgSzLog2 = firstbithigh(WorkgroupSize); + uint32_t rightIndexNonInclusive = (levelInvocationIndex >> (WgSzLog2 * level)); // formula for finding the index. If it underflows it means that there's nothing to do for the level + uint32_t leftIndexInclusive = (rightIndexNonInclusive >> WgSzLog2) << WgSzLog2; + uint32_t add = (levelInvocationIndex & (WgSzLog2 * level - 1)) > 0 ? 
1u : 0u; // if the index mod WorkgroupSize*level > 0 then we must add 1u + return uint32_t2(leftIndexInclusive, rightIndexNonInclusive); // return both the index and the potential underflow information to know we must do nothing + } - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) + { + const Parameters_t params = getParameters(); + const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); + + const bool inRange = levelInvocationIndex <= params.lastElement[treeLevel]; // the lastElement array contains the lastElement's index hence the '<=' - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel + // so that it exits after reaching the top? + + // Seems that even though it's a mem barrier is must NOT diverge! + // This was called inside getData but due to inRange it can possibly diverge + // so it must be called outside! Not yet sure why a mem barrier must be uniformly called. + glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) + + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? + if(inRange) + { + getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel); + } - Storage_t data = Binop::identity(); - if(inRange) - { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); - } + bool doReduce = !isScan; + if(doReduce) + { + data = workgroup::reduction::template __call(data,accessor); + } + else + { + if (isExclusive) + { + data = workgroup::exclusive_scan::template __call(data,accessor); + } + else + { + data = workgroup::inclusive_scan::template __call(data,accessor); + } + + uint32_t reverseLevel = params.topLevel - treeLevel; + while(reverseLevel != ~0u) + { + Storage_t tempData = Binop::identity; + uint32_t2 idxRange = findHigherLevelIndex(reverseLevel--, levelInvocationIndex); + if(workgroup::SubgroupContiguousIndex() < idxRange.y - idxRange.x) + { + tempData = scanScratchBuf[0].data[params.temporaryStorageOffset[reverseLevel] + idxRange.x + workgroup::SubgroupContiguousIndex()]; + } - if(treeLevel < params.topLevel) - { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); - } - // REVIEW: missing _TYPE_ check and extra case here - else if (treeLevel != params.topLevel) - { - data = workgroup::inclusive_scan()(data); - } - else - { - data = workgroup::exclusive_scan()(data); - } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); - } -} -} -} + // we could potentially do an inclusive scan of this part and cache it for other WGs + //tempData = workgroup::inclusive_scan::template __call(tempData,accessor); + data += workgroup::reduction::template __call(tempData,accessor); + } + } + + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, inRange); + } -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? 
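For reference, the workgroup::reduction / inclusive_scan / exclusive_scan calls made by virtualWorkgroup() above all rely on the accessor contract that WGScratchProxy implements in direct.hlsl. A minimal standalone sketch of one such call, assuming the primitives are parameterised on the binop and the workgroup size the way the calls above spell out (accessor and WORKGROUP_SIZE come from direct.hlsl):

    // minimal sketch: inclusive prefix sum of one value per invocation across the workgroup
    uint32_t value = nbl::hlsl::workgroup::SubgroupContiguousIndex(); // any per-invocation input
    value = nbl::hlsl::workgroup::inclusive_scan<nbl::hlsl::plus<uint32_t>, WORKGROUP_SIZE>::template __call<WGScratchProxy<0> >(value, accessor);
    // value now holds the sum of the inputs of all invocations with SubgroupContiguousIndex() <= this one's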
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() - { - const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; - // persistent workgroups - while (true) - { - uint treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) - { - return; - } - - virtualWorkgroup(treeLevel,localWorkgroupIndex); + DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components + template + void main(NBL_REF_ARG(Accessor) accessor) + { + const Parameters_t params = getParameters(); + const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); + + if(!isScan) + { + Scheduler scheduler = Scheduler::create(params, schedulerParams); + // persistent workgroups + + while (true) + { + if (!scheduler.getWork(accessor)) + { + return; + } - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); - } - } + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + if(scheduler.markDone(accessor)) + { + return; + } + } + } + else + { + ScanScheduler scheduler = ScanScheduler::create(params, schedulerParams); + while(scheduler.getWork(accessor)) + { + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + } + } + } } } } -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl index 289c2050fa..d7c66e23d0 100644 --- a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl @@ -38,7 +38,7 @@ T Broadcast(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor, const uint template T BroadcastFirst(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor) { - return Broadcast(val,0); + return Broadcast(val, accessor, 0); } } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h new file mode 100644 index 0000000000..ebeae08181 --- /dev/null +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -0,0 +1,288 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ +#define _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ + + +#include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/utilities/IDescriptorSetCache.h" + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" + + +namespace nbl::video +{ +class CArithmeticOps : public core::IReferenceCounted +{ + public: + // Only 4 byte wide data types supported due to need to trade the via shared memory, + // different combinations of data type and operator have different identity elements. 
+ // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX` + enum E_DATA_TYPE : uint8_t + { + EDT_UINT=0u, + EDT_INT, + EDT_FLOAT, + EDT_COUNT + }; + enum E_OPERATOR : uint8_t + { + EO_AND = 0u, + EO_XOR, + EO_OR, + EO_ADD, + EO_MUL, + EO_MIN, + EO_MAX, + EO_COUNT + }; + + // This struct is only for managing where to store intermediate results of the scans + struct Parameters : nbl::hlsl::scan::Parameters_t // this struct and its methods are also available in HLSL + { + static inline constexpr uint32_t MaxLevels = NBL_BUILTIN_MAX_LEVELS; + + Parameters() + { + std::fill_n(lastElement,MaxLevels/2+1,0u); + std::fill_n(temporaryStorageOffset,MaxLevels/2,0u); + } + // build the constant tables for each level given the number of elements to scan and workgroupSize + Parameters(const uint32_t _elementCount, const uint32_t workgroupSize) : Parameters() + { + assert(_elementCount!=0u && "Input element count can't be 0!"); + const auto maxReductionLog2 = hlsl::findMSB(workgroupSize)*(MaxLevels/2u+1u); + assert(maxReductionLog2>=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); + + lastElement[0u] = _elementCount-1u; + for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) + temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = std::ceil(lastElement[topLevel] / double(workgroupSize)); + + std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); + } + // given already computed tables of lastElement indice s per level, number of levels, and storage offsets, tell us total auxillary buffer size needed + inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) + { + uint32_t uint_count = 1u; // reduceResult field + uint_count += MaxLevels; // workgroup enumerator + uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset + uint_count += lastElement[topLevel]+1u; // starting from the last temporary storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u based) + return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); + } + + inline uint32_t getWorkgroupEnumeratorSize() + { + return MaxLevels * sizeof(uint32_t); + } + }; + + // the default scheduler we provide works as described above in the big documentation block + struct SchedulerParameters : nbl::hlsl::scan::DefaultSchedulerParameters_t // this struct and its methods are also available in HLSL + { + SchedulerParameters() + { + std::fill_n(cumulativeWorkgroupCount,Parameters::MaxLevels,0u); + std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxLevels, 0u); + std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxLevels,0u); + } + + // given the number of elements and workgroup size, figure out how many atomics we need + // also account for the fact that we will want to use the same scratch buffer both for the + // scheduler's atomics and the aux data storage + SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() + { + outScanParams = Parameters(_elementCount,workgroupSize); + const auto topLevel = outScanParams.topLevel; + + std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); + std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); + cumulativeWorkgroupCount[topLevel] = 1u; // the top level will always end up with 1 
workgroup to do the final reduction + for (auto i = 0u; i <= topLevel; i++) { + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! + lastWorkgroupSetCountForLevel[i] = (cumulativeWorkgroupCount[i] - 1u) & (workgroupSize - 1u); + } + + const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, 0u); + std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, workgroupFinishFlagsOffset, 0u); + for (auto i=0u; i= elementCount / workgroupSize^2 + 1 && "Too few workgroups! The workgroup count must be at least the elementCount/(wgSize^2)"); + } + + uint32_t wg_count; + }; + + CArithmeticOps(core::smart_refctd_ptr&& device) : CArithmeticOps(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + + CArithmeticOps(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) + { + assert(core::isPoT(m_workgroupSize)); + const asset::SPushConstantRange pc_range[] = {asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; + const IGPUDescriptorSetLayout::SBinding bindings[2] = { + {.binding = 0u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr }, + { .binding = 1u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr } // scratch + }; + + m_ds_layout = m_device->createDescriptorSetLayout(bindings); + assert(m_ds_layout && "CArithmeticOps Descriptor Set Layout was not created successfully"); + m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); + assert(m_pipeline_layout && "CArithmeticOps Pipeline Layout was not created successfully"); + } + + inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } + + inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } + + inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} + + inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) + { + pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); + dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); + } + + static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) + { + IGPUDescriptorSet::SDescriptorInfo infos[2]; + infos[0].desc = input_range.buffer; + infos[0].info.buffer = {input_range.offset,input_range.size}; + infos[1].desc = scratch_range.buffer; + infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; + + video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (auto i=0u; i<2u; i++) + { + writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; + } + + device->updateDescriptorSets(2, writes, 0u, nullptr); + } + + static inline 
void dispatchHelper( + IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, + std::span bufferBarriers + ) + { + cmdbuf->pushConstants(layout, asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); + if (bufferBarriers.size() > 0) + { + for (uint32_t i = 0; i < bufferBarriers.size(); i++) + { + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, bufferBarriers[i]); + } + } + cmdbuf->dispatch(dispatchInfo.wg_count, 1u, 1u); + } + + inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchElCount) const + { + auto system = m_device->getPhysicalDevice()->getSystem(); + core::smart_refctd_ptr hlsl; + { + auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr + { + nbl::system::ISystem::future_t> future; + system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); + if (future.wait()) + return future.copy(); + return nullptr; + }; + + // hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? + hlsl = loadBuiltinData(shaderFile); + } + auto buffer = asset::ICPUBuffer::create({ {.size = hlsl->getSize()} }); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // REVIEW: Probably combine the commons parts of CReduce and CScanner + const char* storageType = nullptr; + switch (dataType) + { + case EDT_UINT: + storageType = "uint32_t"; + break; + case EDT_INT: + storageType = "int32_t"; + break; + case EDT_FLOAT: + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; + break; + default: + assert(false); + break; + } + + return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_EL_CNT %d\n", m_workgroupSize, storageType, binop, scratchElCount); + } + + inline ILogicalDevice* getDevice() const {return m_device.get();} + protected: + // REVIEW: Does it need an empty destructor? + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_ds_layout; + core::smart_refctd_ptr m_pipeline_layout; + + const uint32_t m_workgroupSize; +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h new file mode 100644 index 0000000000..4a311aa738 --- /dev/null +++ b/include/nbl/video/utilities/CReduce.h @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_REDUCE_H_INCLUDED_ +#define _NBL_VIDEO_C_REDUCE_H_INCLUDED_ + +#include "nbl/video/utilities/CArithmeticOps.h" + +namespace nbl::video +{ + + class NBL_API2 CReduce final : public CArithmeticOps + { + + public: + CReduce(core::smart_refctd_ptr&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CReduce(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const; + + protected: + ~CReduce() + { + // all drop themselves automatically + } + + core::smart_refctd_ptr m_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr m_pipelines[EDT_COUNT][EO_COUNT]; + }; +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 2ae67163eb..d705113f1a 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -8,15 +8,14 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/video/utilities/CArithmeticOps.h" +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!"); namespace nbl::video { -#include "nbl/builtin/glsl/scan/parameters_struct.glsl" -#include "nbl/builtin/glsl/scan/default_scheduler.glsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS&0x1,"NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); - /** Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU. @@ -143,270 +142,41 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... 
**/ -#if 0 // legacy & port to HLSL -class CScanner final : public core::IReferenceCounted +class NBL_API2 CScanner final : public CArithmeticOps { public: enum E_SCAN_TYPE : uint8_t { // computes output[n] = Sum_{i<=n}(input[i]) - EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, + EST_INCLUSIVE = 0u, // computes output[n] = Sum_{i=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); - - lastElement[0u] = _elementCount-1u; - for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) - temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; - - std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); - } - // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed - inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) - { - uint32_t uint_count = 1u; // workgroup enumerator - uint_count += temporaryStorageOffset[MaxScanLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // and its size - return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); - } - }; - // the default scheduler we provide works as described above in the big documentation block - struct SchedulerParameters : nbl_glsl_scan_DefaultSchedulerParameters_t // this struct and its methods are also available in GLSL so you can launch indirect dispatches - { - SchedulerParameters() - { - std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); - std::fill_n(cumulativeWorkgroupCount,Parameters::MaxScanLevels,0u); - } - // given the number of elements and workgroup size, figure out how many atomics we need - // also account for the fact that we will want to use the same scratch buffer both for the - // scheduler's atomics and the aux data storage - SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() - { - outScanParams = Parameters(_elementCount,workgroupSize); - const auto topLevel = outScanParams.topLevel; - - std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); - for (auto i=0u; i<=topLevel; i++) - cumulativeWorkgroupCount[i] += 1u; - std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - - std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); - std::copy_n(cumulativeWorkgroupCount+topLevel,topLevel,finishedFlagOffset+topLevel); - - const auto finishedFlagCount = sizeof(finishedFlagOffset)/sizeof(uint32_t); - const auto finishedFlagsSize = std::accumulate(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,0u); - std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u); - for (auto i=0u; i&& device) : CScanner(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} - // - CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) - { - assert(core::isPoT(m_workgroupSize)); - - const asset::SPushConstantRange pc_range = { asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants) }; - const IGPUDescriptorSetLayout::SBinding bindings[2] = { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, 
IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer - { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch - }; - - m_ds_layout = m_device->createDescriptorSetLayout(bindings,bindings+sizeof(bindings)/sizeof(IGPUDescriptorSetLayout::SBinding)); - m_pipeline_layout = m_device->createPipelineLayout(&pc_range,&pc_range+1,core::smart_refctd_ptr(m_ds_layout)); - } - - // - inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } - - // - inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } - + // You need to override this shader with your own defintion of `nbl_glsl_scan_getIndirectElementCount` for it to even compile, so we always give you a new shader - core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const - { - return createShader(true,scanType,dataType,op); - } + //core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + //{ + // return createShader(true,scanType,dataType,op,scratchSz); + //} // - inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) - { - if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op); - return m_shaders[scanType][dataType][op].get(); - } - // - inline IGPUSpecializedShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) - { - if (!m_specialized_shaders[scanType][dataType][op]) - { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op)); - cpuShader->setFilePathHint("nbl/builtin/glsl/scan/direct.comp"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - - auto gpushader = m_device->createShader(std::move(cpuShader)); - - m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( - gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/glsl/scan/direct.comp" - } - return m_specialized_shaders[scanType][dataType][op].get(); - } - - // - inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) - { - // ondemand - if (!m_pipelines[scanType][dataType][op]) - m_pipelines[scanType][dataType][op] = m_device->createComputePipeline( - nullptr,core::smart_refctd_ptr(m_pipeline_layout), - core::smart_refctd_ptr(getDefaultSpecializedShader(scanType,dataType,op)) - ); - return m_pipelines[scanType][dataType][op].get(); - } - - // - inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} - - // - inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) - { - pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); - dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); - } - - // - static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) - { - IGPUDescriptorSet::SDescriptorInfo infos[2]; - infos[0].desc = input_range.buffer; - infos[0].info.buffer = 
{input_range.offset,input_range.size}; - infos[1].desc = scratch_range.buffer; - infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; - - video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (auto i=0u; i<2u; i++) - { - writes[i].dstSet = set; - writes[i].binding = i; - writes[i].arrayElement = 0u; - writes[i].count = 1u; - writes[i].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER; - writes[i].info = infos+i; - } - - device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - // Half and sizeof(uint32_t) of the scratch buffer need to be cleared to 0s - static inline void dispatchHelper( - IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, - const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, - const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers - ) - { - cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = srcBufferBarrierCount; - info.bufBarriers = srcBufferBarriers; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = dstBufferBarrierCount; - info.bufBarriers = dstBufferBarriers; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - } - - inline ILogicalDevice* getDevice() const {return m_device.get();} - + CScanner(core::smart_refctd_ptr&& device) : CScanner(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUComputePipeline* getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + core::smart_refctd_ptr createShader(/*const bool indirect, */const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; protected: ~CScanner() { // all drop themselves automatically } - core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const; - - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_ds_layout; - core::smart_refctd_ptr m_pipeline_layout; core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; - core::smart_refctd_ptr m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; - const uint32_t m_workgroupSize; }; -#endif } #endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 8ef38fd080..ce2bb22299 100644 --- 
a/include/nbl/video/utilities/IUtilities.h
+++ b/include/nbl/video/utilities/IUtilities.h
@@ -11,6 +11,7 @@
 #include "nbl/video/alloc/StreamingTransientDataBuffer.h"
 #include "nbl/video/utilities/SIntendedSubmitInfo.h"
 #include "nbl/video/utilities/CPropertyPoolHandler.h"
+#include "nbl/video/utilities/CReduce.h"
 #include "nbl/video/utilities/CScanner.h"
 #include "nbl/video/utilities/CComputeBlit.h"
@@ -132,9 +133,10 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
             m_propertyPoolHandler = core::make_smart_refctd_ptr<CPropertyPoolHandler>(core::smart_refctd_ptr(m_device));
             // smaller workgroups fill occupancy gaps better, especially on new Nvidia GPUs, but we don't want too small workgroups on mobile
             // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile
+#endif
             const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u);
+            m_reducer = core::make_smart_refctd_ptr<CReduce>(core::smart_refctd_ptr(m_device), scan_workgroup_size);
             m_scanner = core::make_smart_refctd_ptr<CScanner>(core::smart_refctd_ptr(m_device), scan_workgroup_size);
-#endif
         }
 
         inline ~IUtilities()
@@ -163,13 +165,16 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
         {
             return m_propertyPoolHandler.get();
         }
+#endif
+        virtual CReduce* getDefaultReducer() const
+        {
+            return m_reducer.get();
+        }
 
-        //!
         virtual CScanner* getDefaultScanner() const
         {
             return m_scanner.get();
         }
-#endif
 
         //! This function provides some guards against streamingBuffer fragmentation or allocation failure
         static uint32_t getAllocationSizeForStreamingBuffer(const size_t size, const uint64_t alignment, uint32_t maxFreeBlock, const uint32_t optimalTransferAtom)
         {
@@ -716,8 +721,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 #if 0 // TODO: port
         core::smart_refctd_ptr<CPropertyPoolHandler> m_propertyPoolHandler;
-        core::smart_refctd_ptr<CScanner> m_scanner;
 #endif
+        core::smart_refctd_ptr<CReduce> m_reducer;
+        core::smart_refctd_ptr<CScanner> m_scanner;
 };
 
 }
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index ad464bd035..c6df0949de 100755
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -227,6 +227,7 @@ set(NBL_VIDEO_SOURCES
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/IUtilities.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CPropertyPoolHandler.cpp
+	${NBL_ROOT_PATH}/src/nbl/video/utilities/CReduce.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CScanner.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CComputeBlit.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CAssetConverter.cpp
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 9333a0d3b4..023c673d3c 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -334,13 +334,21 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h")
 #workgroup
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/arithmetic.hlsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl")
+#global scan
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/declarations.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/default_scheduler.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/descriptors.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/direct.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scan_scheduler.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scheduler.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/virtual_workgroup.hlsl")
 #Extensions
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl")
diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp
new file mode 100644
index 0000000000..16b5cc5e29
--- /dev/null
+++ b/src/nbl/video/utilities/CReduce.cpp
@@ -0,0 +1,56 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/video/utilities/CReduce.h"
+
+namespace nbl::video
+{
+
+asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount)
+{
+    if (!m_shaders[dataType][op])
+        m_shaders[dataType][op] = createShader(dataType,op,scratchElCount);
+    return m_shaders[dataType][op].get();
+}
+
+IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount)
+{
+    if (!m_specialized_shaders[dataType][op])
+    {
+        auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(dataType,op,scratchElCount));
+        cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl");
+        cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE);
+
+        auto gpushader = m_device->createShader(cpuShader.get());
+
+        m_specialized_shaders[dataType][op] = gpushader;
+    }
+    return m_specialized_shaders[dataType][op].get();
+}
+
+IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount)
+{
+    // ondemand
+    if (!m_pipelines[dataType][op]) {
+        IGPUComputePipeline::SCreationParams params = {};
+        params.layout = m_pipeline_layout.get();
+        // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
+        params.shader.entryPoint = "main";
+        params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchElCount);
+
+        m_device->createComputePipelines(
+            nullptr, { &params,1 },
+            &m_pipelines[dataType][op]
+        );
+    }
+    return m_pipelines[dataType][op].get();
+}
+
+core::smart_refctd_ptr<asset::ICPUShader> CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const
+{
+    core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchElCount);
+    return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false");
+}
+
+}
\ No newline at end of file
diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp
index ec31272358..3cdcc82ee7 100644
--- a/src/nbl/video/utilities/CScanner.cpp
+++ b/src/nbl/video/utilities/CScanner.cpp
@@ -1,52 +1,52 @@
 #include "nbl/video/utilities/CScanner.h"
-using namespace nbl;
-using namespace video;
+namespace nbl::video
+{
+
+asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
+{
+    if (!m_shaders[scanType][dataType][op])
+        m_shaders[scanType][dataType][op] = createShader(scanType,dataType,op,scratchSz);
+    return m_shaders[scanType][dataType][op].get();
+}
-#if 0 // TODO: port
-core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const
+IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
 {
-    auto system = m_device->getPhysicalDevice()->getSystem();
-    core::smart_refctd_ptr glsl;
-    {
-        auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr
-        {
-            nbl::system::ISystem::future_t> future;
-            system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE);
-            if (future.wait())
-                return future.copy();
-            return nullptr;
-        };
+    if (!m_specialized_shaders[scanType][dataType][op])
+    {
+        auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(scanType,dataType,op,scratchSz));
+        cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl");
+        cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE);
+
+        auto gpushader = m_device->createShader(cpuShader.get());
+
+        m_specialized_shaders[scanType][dataType][op] = gpushader;
+    }
+    return m_specialized_shaders[scanType][dataType][op].get();
+}
-        if(indirect)
-            glsl = loadBuiltinData("nbl/builtin/glsl/scan/indirect.comp");
-        else
-            glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp");
-    }
-    auto buffer = core::make_smart_refctd_ptr(glsl->getSize());
-    memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize());
-    auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????");
-    const char* storageType = nullptr;
-    switch (dataType)
-    {
-        case EDT_UINT:
-            storageType = "uint";
-            break;
-        case EDT_INT:
-            storageType = "int";
-            break;
-        case EDT_FLOAT:
-            storageType = "float";
-            break;
-        default:
-            assert(false);
-            break;
-    }
+IGPUComputePipeline* CScanner::getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
+{
+    // ondemand
+    if (!m_pipelines[scanType][dataType][op]) {
+        IGPUComputePipeline::SCreationParams params = {};
+        params.layout = m_pipeline_layout.get();
+        // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
+        params.shader.entryPoint = "main";
+        params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz);
-    return asset::CGLSLCompiler::createOverridenCopy(
-        cpushader.get(),
-        "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ %d\n#define _NBL_GLSL_SCAN_TYPE_ %d\n#define _NBL_GLSL_SCAN_STORAGE_TYPE_ %s\n#define _NBL_GLSL_SCAN_BIN_OP_ %d\n",
-        m_workgroupSize,hlsl::findMSB(m_workgroupSize),uint32_t(scanType),storageType,uint32_t(op)
-    );
+        m_device->createComputePipelines(
+            nullptr, { &params,1 },
+            &m_pipelines[scanType][dataType][op]
+        );
+    }
+    return m_pipelines[scanType][dataType][op].get();
 }
-#endif
\ No newline at end of file
+
+core::smart_refctd_ptr<asset::ICPUShader> CScanner::createShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const
+{
+    core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz);
+    return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? "true" : "false", "true");
+}
+
+}
\ No newline at end of file
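
A rough usage sketch of the new reduce/scan entry points introduced by this patch, not part of the diff itself. Assumptions are hedged: the `EDT_UINT`/`EO_ADD` enumerator names are taken from the old CScanner conventions and are presumed to live on `CArithmeticOps`, `utils` is an already-constructed `IUtilities`, and the scratch element count plus descriptor/dispatch setup are left to the caller.

    #include "nbl/video/utilities/IUtilities.h"

    // Sketch only: fetch the lazily-created default pipelines exposed by the new
    // CReduce/CScanner utilities. `scratchElCount` is whatever the CArithmeticOps
    // scratch layout requires for the element count being processed.
    void getArithmeticPipelines(
        nbl::video::IUtilities* utils, const uint32_t scratchElCount,
        nbl::video::IGPUComputePipeline*& outReduce, nbl::video::IGPUComputePipeline*& outScan)
    {
        using namespace nbl::video;
        CReduce* reducer = utils->getDefaultReducer();
        CScanner* scanner = utils->getDefaultScanner();
        // CReduce::createShader overrides the HLSL prelude with IS_EXCLUSIVE=false, IS_SCAN=false
        outReduce = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, scratchElCount);
        // CScanner::createShader sets IS_SCAN=true and IS_EXCLUSIVE according to the scan type
        outScan = scanner->getDefaultPipeline(CScanner::EST_EXCLUSIVE, CScanner::EDT_UINT, CScanner::EO_ADD, scratchElCount);
        // binding the pipelines, updating descriptor sets and recording the dispatch are unchanged and omitted here
    }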