Devsh-Graphics-Programming · kpentaris · Jan 29, 2024 · Jan 30, 2024 · Feb 11, 2024 · Feb 12, 2024
diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc
diff --git a/examples_tests b/examples_tests
diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl
@@ -178,6 +178,10 @@ void memoryBarrierShared() {
     spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask);
 }
 
+// Calls spirv::memoryBarrier at device scope, combining acquire-release ordering with the
+// UniformMemory semantics bit.
+// NOTE(review): presumably the counterpart of GLSL's memoryBarrierBuffer() — confirm the
+// scope and semantics against what the GLSL front end emits for it.
+void memoryBarrierBuffer() {
+    spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask);
+}
+
 namespace impl 
 {
 

diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl
@@ -1,66 +1,64 @@
+// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
 #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
 #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
 
 // REVIEW: Not sure if this file is needed in HLSL implementation
 
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-	Parameters_t getParameters();
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
+#ifndef NBL_BUILTIN_MAX_LEVELS
+#define NBL_BUILTIN_MAX_LEVELS 7
 #endif
 
-#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
 namespace nbl
 {
 namespace hlsl
 {
 namespace scan
 {
-	template<typename Storage_t>
-	void getData(
-		inout Storage_t data,
-		in uint levelInvocationIndex,
-		in uint localWorkgroupIndex,
-		in uint treeLevel,
-		in uint pseudoLevel
-	);
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-#endif
+    // Parameters of one scan/reduce dispatch.
+    // With the default NBL_BUILTIN_MAX_LEVELS of 7, `lastElement` has 4 entries and
+    // `temporaryStorageOffset` has 3.
+    // REVIEW: `topLevel` is placed second so that, assuming `lastElement` has exactly 4 entries, the
+    // members pack on better-aligned boundaries under the HLSL constant packing rules
+    // (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules).
+    struct Parameters_t {
+        uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; // NOTE(review): presumably the last valid element index per level — not read in this header, confirm
+        uint32_t topLevel; // level at which a reduce writes its final result (see setData in scan/descriptors.hlsl)
+        uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_LEVELS/2]; // offsets into the scratch `data` array used by levels above 0
+    };
+
+    // Only declared here; scan/direct.hlsl supplies a definition that returns its push-constant copy.
+    Parameters_t getParameters();
 
-#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-	template<typename Storage_t>
-	void setData(
-		in Storage_t data,
-		in uint levelInvocationIndex,
-		in uint localWorkgroupIndex,
-		in uint treeLevel,
-		in uint pseudoLevel,
-		in bool inRange
-	);
+    // Parameters for the default scheduler: three arrays with one entry per level
+    // (NBL_BUILTIN_MAX_LEVELS entries each).
+    // NOTE(review): none of these fields is read in this header; the per-field meanings below are
+    // taken from the names only — confirm against scan/default_scheduler.hlsl.
+    struct DefaultSchedulerParameters_t
+    {
+        uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; // presumably a running total of workgroups up to each level
+        uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; // presumably where each level's finish flags start
+        uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; // presumably the size of the final workgroup set of each level
+
+    };
+
+    // Only declared here; scan/direct.hlsl supplies a definition that returns its push-constant copy.
+    DefaultSchedulerParameters_t getSchedulerParameters();
+
+    // Loads into `data` the value one invocation has to process at a given tree level.
+    // Declaration only; `isExclusive` defaults to false.
+    // NOTE(review): the getData defined in scan/descriptors.hlsl takes four arguments (there is no
+    // `pseudoLevel`, and the third is called `levelWorkgroupIndex`), which makes it a separate
+    // overload rather than the definition of this five-argument declaration. Confirm which
+    // signature the callers use and align the two.
+    template<typename Storage_t, bool isExclusive=false>
+    void getData(
+        NBL_REF_ARG(Storage_t) data,
+        NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
+        NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex,
+        NBL_CONST_REF_ARG(uint32_t) treeLevel,
+        NBL_CONST_REF_ARG(uint32_t) pseudoLevel
+    );
+
+    // Stores the value `data` produced by one invocation at a given tree level; `isScan` selects
+    // between scan and reduce behaviour. Declaration only.
+    // NOTE(review): the setData defined in scan/descriptors.hlsl takes five arguments (there is no
+    // `pseudoLevel`, and the third is called `levelWorkgroupIndex`), which makes it a separate
+    // overload rather than the definition of this six-argument declaration. Confirm which
+    // signature the callers use and align the two.
+    template<typename Storage_t, bool isScan>
+    void setData(
+        NBL_CONST_REF_ARG(Storage_t) data,
+        NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
+        NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex,
+        NBL_CONST_REF_ARG(uint32_t) treeLevel,
+        NBL_CONST_REF_ARG(uint32_t) pseudoLevel,
+        NBL_CONST_REF_ARG(bool) inRange
+    );
+
 }
 }
 }
-#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_
-#endif
 
 #endif
diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl
@@ -1,3 +1,118 @@
+// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
 
+#ifndef _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_
+#define _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_
 
-// choerent -> globallycoherent
+#include "nbl/builtin/hlsl/scan/declarations.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+
+// GLSL `coherent` corresponds to HLSL `globallycoherent`.
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+// Element type of the scratch buffer: one reduce result, one `workgroupsStarted` word per level,
+// then `dataElementCount` words of intermediate data.
+// NOTE(review): SCRATCH_EL_CNT is not defined in this header — presumably provided by the including
+// shader or the compile options. The default subtracts only the NBL_BUILTIN_MAX_LEVELS words of
+// `workgroupsStarted` and not the `reduceResult` word; confirm whether SCRATCH_EL_CNT already
+// accounts for it.
+template<uint32_t dataElementCount=SCRATCH_EL_CNT - NBL_BUILTIN_MAX_LEVELS>
+struct Scratch
+{
+    uint32_t reduceResult; // written by setData when a reduce reaches Parameters_t::topLevel
+    uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; // not accessed in this header
+    uint32_t data[dataElementCount]; // intermediate values, addressed through Parameters_t::temporaryStorageOffset
+};
+
+// Set 0, binding 0: the buffer that level 0 of getData/setData reads from and writes to.
+// `Storage_t` is not declared in this header, so it has to be defined before this file is included.
+// Set 0, binding 1: the scratch buffer; only element 0 of it is accessed in this header.
+// NOTE(review): `Scratch` is a template with a defaulted argument and is named here without an
+// argument list — verify that the compiler accepts this, or spell it `Scratch<>`.
+[[vk::binding(0 ,0)]] RWStructuredBuffer<Storage_t> scanBuffer; // (REVIEW): Make the element type configurable from outside this header. Decide how (#define?)
+[[vk::binding(1 ,0)]] RWStructuredBuffer<Scratch> /*globallycoherent (seems it cannot be combined with the Vulkan Memory Model)*/ scanScratchBuf; // (REVIEW): Check whether globallycoherent can be used with the Vulkan Memory Model
+
+/**
+ * Loads into `data` the input of one invocation for the given tree level.
+ *
+ * Level 0 reads `scanBuffer[levelInvocationIndex]`. Any other level reads the scratch buffer's
+ * `data` array at `levelInvocationIndex + temporaryStorageOffset[treeLevel-1]`.
+ *
+ * `levelWorkgroupIndex` and `isExclusive` are referenced only by the disabled downsweep path
+ * below, so they currently have no effect.
+ */
+template<typename Storage_t, bool isExclusive>
+void getData(
+    NBL_REF_ARG(Storage_t) data,
+    NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
+    NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex,
+    NBL_CONST_REF_ARG(uint32_t) treeLevel
+)
+{
+    const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders
+
+    uint32_t offset = levelInvocationIndex;
+    // Despite the name, this is true for every level except level 0.
+    const bool notFirstOrLastLevel = bool(treeLevel);
+    if (notFirstOrLastLevel)
+        offset += params.temporaryStorageOffset[treeLevel-1u];
+
+    // Disabled downsweep/scan path, kept for reference.
+    // NOTE(review): it refers to `pseudoLevel`, which is not a parameter of this function.
+    //if (pseudoLevel!=treeLevel) // downsweep/scan
+    //{
+    //    const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u;
+    //    if (bool(levelWorkgroupIndex) && firstInvocationInGroup)
+    //        data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]];
+    //
+    //    if (notFirstOrLastLevel)
+    //    {
+    //        if (!firstInvocationInGroup)
+    //            data = scanScratchBuf[0].data[offset-1u];
+    //    }
+    //    else
+    //    {
+    //        if(isExclusive)
+    //        {
+    //            if (!firstInvocationInGroup)
+    //                data += scanBuffer[offset-1u];
+    //        }
+    //        else
+    //        {
+    //            data += scanBuffer[offset];
+    //        }
+    //    }
+    //}
+    //else
+    //{
+        // Live path: levels above 0 read from scratch, level 0 reads the scan buffer.
+        if (notFirstOrLastLevel)
+            data = scanScratchBuf[0].data[offset];
+        else
+            data = scanBuffer[offset];
+    //}
+}
+
+/**
+ * Stores `data` for one invocation at the given tree level.
+ *
+ *  - Reduce (`!isScan`) below `topLevel`: only the last invocation of the workgroup writes, to
+ *    scratch slot `levelWorkgroupIndex + temporaryStorageOffset[treeLevel]`; `inRange` is not
+ *    consulted on this path.
+ *  - Otherwise nothing is written unless `inRange` is set, and the destination is
+ *      reduce at `topLevel` -> `reduceResult`,
+ *      any level above 0    -> scratch slot `levelInvocationIndex + temporaryStorageOffset[treeLevel-1]`,
+ *      level 0              -> `scanBuffer[levelInvocationIndex]`.
+ */
+template<typename Storage_t, bool isScan>
+void setData(
+    NBL_CONST_REF_ARG(Storage_t) data,
+    NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
+    NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex,
+    NBL_CONST_REF_ARG(uint32_t) treeLevel,
+    NBL_CONST_REF_ARG(bool) inRange
+)
+{
+    const Parameters_t parameters = getParameters();
+    const bool isReduce = !isScan;
+
+    // Reduce below the top level: the result still goes to scratch, one value per workgroup.
+    if (isReduce && treeLevel<parameters.topLevel)
+    {
+        const uint32_t lastInvocation = glsl::gl_WorkGroupSize().x-1u;
+        if (workgroup::SubgroupContiguousIndex()==lastInvocation)
+        {
+            const uint32_t slot = levelWorkgroupIndex+parameters.temporaryStorageOffset[treeLevel];
+            scanScratchBuf[0u].data[slot] = data;
+        }
+        return;
+    }
+
+    // Every remaining destination is written by in-range invocations only.
+    if (!inRange)
+        return;
+
+    if (isReduce && treeLevel==parameters.topLevel)
+    {
+        scanScratchBuf[0u].reduceResult = data;
+        return;
+    }
+
+    // What follows is meant for isScan == true.
+    if (treeLevel!=0u)
+    {
+        const uint32_t slot = levelInvocationIndex+parameters.temporaryStorageOffset[treeLevel-1u];
+        scanScratchBuf[0].data[slot] = data;
+    }
+    else
+    {
+        scanBuffer[levelInvocationIndex] = data;
+    }
+}
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl
@@ -1,50 +1,92 @@
-#ifndef _NBL_HLSL_WORKGROUP_SIZE_
-#define _NBL_HLSL_WORKGROUP_SIZE_ 256
-#endif
+// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#pragma shader_stage(compute)
 
-#include "nbl/builtin/hlsl/scan/descriptors.hlsl"
+#include "nbl/builtin/hlsl/functional.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
+#include "nbl/builtin/hlsl/scan/declarations.hlsl"
 #include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl"
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
+
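+// Items per workgroup equals WORKGROUP_SIZE (ITEMS_PER_WG = WORKGROUP_SIZE), so the shared scratch below is sized from WORKGROUP_SIZE.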
+static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic<WORKGROUP_SIZE>::value;
+
+// TODO: Can we make it a static variable?
+groupshared uint32_t wgScratch[SharedScratchSz];
+
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+
+/**
+ * Accessor over the groupshared `wgScratch` array in which every index is shifted by the
+ * compile-time `offset`. An instance of it is passed to nbl::hlsl::scan::main by this shader.
+ */
+template<uint16_t offset>
+struct WGScratchProxy
+{
+    // Reads the scratch word at `ix` (shifted by `offset`) into `value`.
+    void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
+    {
+        const uint32_t slot = ix+offset;
+        value = wgScratch[slot];
+    }
+
+    // Overwrites the scratch word at `ix` (shifted by `offset`) with `value`.
+    void set(const uint32_t ix, const uint32_t value)
+    {
+        const uint32_t slot = ix+offset;
+        wgScratch[slot] = value;
+    }
+
+    // Atomically adds `val` to the scratch word at `ix` (shifted by `offset`) and forwards
+    // whatever glsl::atomicAdd returns.
+    uint32_t atomicAdd(uint32_t ix, uint32_t val)
+    {
+        const uint32_t slot = ix+offset;
+        return nbl::hlsl::glsl::atomicAdd(wgScratch[slot], val);
+    }
+
+    // Workgroup barrier; no separate nbl::hlsl::glsl::memoryBarrierShared() is issued because
+    // barrier() already implies it.
+    void workgroupExecutionAndMemoryBarrier()
+    {
+        nbl::hlsl::glsl::barrier();
+    }
+};
+static WGScratchProxy<0> accessor;
+
+// Out-of-namespace definition of nbl::hlsl::glsl::gl_WorkGroupSize(); it reports the same
+// WORKGROUP_SIZE x 1 x 1 extent that main() passes to [numthreads].
+// NOTE(review): presumably declared by glsl_compat/core.hlsl and left for each shader to define —
+// see the linked issue for the background:
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
+uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}
+
+// Push-constant block of this shader: the scan parameters followed by the default scheduler's
+// parameters. getParameters() and getSchedulerParameters() below return copies of these members.
+// NOTE(review): with NBL_BUILTIN_MAX_LEVELS == 7 this is 8 + 21 = 29 uint32_t values, i.e. 116 bytes
+// if the arrays are tightly packed — confirm the actual layout against the device's push-constant limit.
+struct ScanPushConstants
+{
+    nbl::hlsl::scan::Parameters_t scanParams;
+    nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams;
+};
+
+[[vk::push_constant]]
+ScanPushConstants spc;
+
+/**
+ * Index of the current invocation across the whole dispatch, along x.
+ *
+ * Invocations within a workgroup are numbered by SubgroupContiguousIndex rather than by
+ * gl_LocalInvocationIndex, so gl_GlobalInvocationID would not agree with that numbering.
+ * The global index is therefore rebuilt here from the workgroup ID and
+ * SubgroupContiguousIndex.
+ */
+uint32_t globalIndex()
+{
+    const uint32_t workgroupBase = nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE;
+    return workgroupBase+nbl::hlsl::workgroup::SubgroupContiguousIndex();
+}
 
 namespace nbl
 {
 namespace hlsl
 {
 namespace scan
 {
-#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_
-	cbuffer PC // REVIEW: register and packoffset selection
-	{
-		Parameters_t scanParams;
-		DefaultSchedulerParameters_t schedulerParams;
-	};
-#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
+// This shader's definition of scan::getParameters(): returns the scan parameters held in the
+// push-constant block `spc`.
 Parameters_t getParameters()
 {
-	return pc.scanParams;
+    return spc.scanParams;
 }
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-#endif
 
-#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
+// This shader's definition of scan::getSchedulerParameters(): returns the scheduler parameters
+// held in the push-constant block `spc`.
 DefaultSchedulerParameters_t getSchedulerParameters()
 {
-	return pc.schedulerParams;
+    return spc.schedulerParams;
 }
-#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-#endif
+
 }
 }
 }
 
-#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
+// Compute entry point: a WORKGROUP_SIZE x 1 x 1 workgroup that forwards to nbl::hlsl::scan::main,
+// handing it the WGScratchProxy<0> `accessor` declared above.
+// NOTE(review): BINOP, Storage_t, IS_SCAN, IS_EXCLUSIVE and WORKGROUP_SIZE are not defined in this
+// file — presumably supplied as preprocessor definitions when the shader is compiled; confirm.
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
 {
-	nbl::hlsl::scan::main();
-}
-#define _NBL_HLSL_MAIN_DEFINED_
-#endif
+    nbl::hlsl::scan::main<BINOP<Storage_t>, Storage_t, IS_SCAN, IS_EXCLUSIVE, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor);
+}
+6 −0		14_ComputeReduce/CMakeLists.txt
+28 −0		14_ComputeReduce/config.json.template
+276 −0		14_ComputeReduce/main.cpp
+50 −0		14_ComputeReduce/pipeline.groovy
+6 −0		15_ComputeScan/CMakeLists.txt
+28 −0		15_ComputeScan/config.json.template
+340 −0		15_ComputeScan/main.cpp
+50 −0		15_ComputeScan/pipeline.groovy
+3 −1		CMakeLists.txt
+2 −2		Readme.md