diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5ab4d368b6..8a281b7e34 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5ab4d368b666d365217c751f5610b496b828ff96 +Subproject commit 8a281b7e34513b3c142e895b96c522810d585ad9 diff --git a/examples_tests b/examples_tests index f4cc4cd22e..359e840943 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f4cc4cd22ee4bd5506d794e63caafddf974ed7a4 +Subproject commit 359e84094383f7f0a7d5463f00d71112a8d1ce87 diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 337db5e22f..4395fb44fe 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -178,6 +178,10 @@ void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } +void memoryBarrierBuffer() { + spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask); +} + namespace impl { diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 2d2e66e66d..6e6c0f5563 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -1,66 +1,64 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ // REVIEW: Not sure if this file is needed in HLSL implementation -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - +#include "nbl/builtin/hlsl/cpp_compat.hlsl" -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - Parameters_t getParameters(); -} -} -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ +#ifndef NBL_BUILTIN_MAX_LEVELS +#define NBL_BUILTIN_MAX_LEVELS 7 #endif -#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ namespace nbl { namespace hlsl { namespace scan { - template - void getData( - inout Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel - ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif + // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. 
(https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_LEVELS/2]; + }; + + Parameters_t getParameters(); -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - template - void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange - ); + struct DefaultSchedulerParameters_t + { + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; + uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; + uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; + + }; + + DefaultSchedulerParameters_t getSchedulerParameters(); + + template + void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel + ); + + template + void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange + ); + } } } -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 450368475d..7f341b4cc2 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -1,31 +1,13 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ #define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" namespace nbl { @@ -35,187 +17,181 @@ namespace scan { namespace scheduler { - /** - * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation - * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions - * that are used for synchronization etc. - * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is - * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of - * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and - * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. 
The next level of the Blelloch Scan will - * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results - * in the downsweep phase of Blelloch Scan. - * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. - * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. - * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the - * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. - * - * Parameter meaning: - * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer - * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) - * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan - * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) - * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps - * synchronizing between workgroups with while-loop spinning. - */ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) - { + /** + * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation + * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions + * that are used for synchronization etc. + * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is + * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of + * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and + * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will + * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results + * in the downsweep phase of Blelloch Scan. + * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. + * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. + * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the + * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. + * + * Parameter meaning: + * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer + * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) + * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. 
It is used when storing the REDUCTION result of each workgroup scan + * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) + * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps + * synchronizing between workgroups with while-loop spinning. + */ + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) - _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? - - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - i = next; - } - _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); - _schedulerParams.finishedFlagOffset[0] = 0u; - switch(_scanParams.topLevel) - { - case 1u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = 1u; - - _scanParams.temporaryStorageOffset[0] = 2u; - break; - case 2u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; - // climb down - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - break; - case 3u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; - // climb down - _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; - _schedulerParams.finishedFlagOffset[5] = 
_schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); - break; - default: - break; -#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 + const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); + _scanParams.lastElement[0] = elementCount-1u; + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; + + for (uint32_t i=0; i>workgroupSizeLog2; + i = next; + } + _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); + _schedulerParams.finishedFlagOffset[0] = 0u; + switch(_scanParams.topLevel) + { + case 1u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = 1u; + + _scanParams.temporaryStorageOffset[0] = 2u; + break; + case 2u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; + // climb down + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + break; + case 3u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; + // climb down + _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; + _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); + 
break; + default: + break; +#if NBL_BUILTIN_MAX_LEVELS>7 #error "Switch needs more cases" #endif - } + } #undef WorkgroupCount - } - - /** - * treeLevel - the current level in the Blelloch Scan - * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. - */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) - { - ScratchAccessor sharedScratch; - if(SubgroupContiguousIndex() == 0u) - { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); - } - else if (SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(SubgroupContiguousIndex(), 0u); - } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; - if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) + } + + /** + * treeLevel - the current level in the Blelloch Scan + * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. + * For example, if we have dispatched 10 workgroups and we the virtu al workgroup number is 35, then the localWorkgroupIndex should be 5. + */ + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) + { + const uint32_t lastLevel = topLevel<<1u; + if(workgroup::SubgroupContiguousIndex() == 0u) { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); + treeLevel = 0; + while (originalWGs>=params.cumulativeWorkgroupCount[treeLevel]) { // doesn't work for now because PushConstant arrays can only be accessed by dynamically uniform indices + treeLevel++; + } + sharedScratch.set(1u, treeLevel); } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? - - sharedScratch.get(1u, treeLevel); - if(treeLevel>lastLevel) - return true; - - localWorkgroupIndex = globalWorkgroupIndex; - const bool dependentLevel = treeLevel != 0u; - if(dependentLevel) - { - const uint prevLevel = treeLevel - 1u; - localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(SubgroupContiguousIndex() == 0u) - { - uint dependentsCount = 1u; - if(treeLevel <= topLevel) - { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? 
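To make the level bookkeeping above concrete, here is a short trace of the rewritten computeParameters() for an assumed input of 10000 elements and a 256-invocation workgroup (both numbers are hypothetical, chosen to mirror the 10000-element example in the comment block; every value below only restates what the code computes):

    // hypothetical trace, not part of the diff: elementCount == 10000, gl_WorkGroupSize().x == 256
    const uint32_t workgroupSizeLog2 = firstbithigh(256u);                   // 8
    uint32_t lastElement0 = 10000u - 1u;                                     // 9999
    uint32_t topLevel     = firstbithigh(lastElement0) / workgroupSizeLog2;  // 13/8 == 1
    uint32_t lastElement1 = lastElement0 >> workgroupSizeLog2;               // 39, so WorkgroupCount(0) == 40
    // the `case 1u:` branch of the switch then yields
    //   cumulativeWorkgroupCount == {40, 41, 81} -> 40 upsweep, 1 top-level and 40 downsweep virtual workgroups
    //   finishedFlagOffset[0..1] == {0, 1} and temporaryStorageOffset[0] == 2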
- const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; - if (lastWorkgroup) - { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader - dependentsCount = scanParams.lastElement[treeLevel]+1u; - if (treeLeveltopLevel) // !(prevLevellastLevel) + return true; + + localWorkgroupIndex = globalWorkgroupIndex; + const bool dependentLevel = treeLevel != 0u; + if(dependentLevel) + { + const uint32_t prevLevel = treeLevel - 1u; + localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; + if(workgroup::SubgroupContiguousIndex() == 0u) + { + uint32_t dependentsCount = 1u; + if(treeLevel <= topLevel) + { + dependentsCount = glsl::gl_WorkGroupSize().x; + const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; + if (lastWorkgroup) + { + const Parameters_t scanParams = getParameters(); + dependentsCount = scanParams.lastElement[treeLevel]+1u; + if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +// coherent -> globallycoherent + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scratch +{ + uint32_t reduceResult; + uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; + uint32_t data[dataElementCount]; +}; + +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(1 ,0)]] RWStructuredBuffer /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model + +template +void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel +) +{ + const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders + + uint32_t offset = levelInvocationIndex; + const bool notFirstOrLastLevel = bool(treeLevel); + if (notFirstOrLastLevel) + offset += params.temporaryStorageOffset[treeLevel-1u]; + + //if (pseudoLevel!=treeLevel) // downsweep/scan + //{ + // const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; + // if (bool(levelWorkgroupIndex) && firstInvocationInGroup) + // data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]]; + // + // if (notFirstOrLastLevel) + // { + // if (!firstInvocationInGroup) + // data = scanScratchBuf[0].data[offset-1u]; + // } + // else + // { + // if(isExclusive) + // { + // if (!firstInvocationInGroup) + // data += scanBuffer[offset-1u]; + // } + // else + // { + // data += scanBuffer[offset]; + // } + // } + //} + //else + //{ + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanBuffer[offset]; + //} +} + +template +void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(bool) inRange +) +{ + const Parameters_t params = getParameters(); + if (!isScan && treeLevel::value; + +// TODO: Can we make it a static variable? 
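Because scanScratchBuf above cannot be declared globallycoherent together with the Vulkan memory model, cross-workgroup visibility of its contents has to come from the glsl::memoryBarrierBuffer() helper added to glsl_compat/core.hlsl earlier in this diff. A condensed sketch of the polling pattern the schedulers later in this PR build on (level, totalWorkgroupsPerLevel and virtualWorkgroupIndex are stand-ins for members of those schedulers):

    // condensed from the schedulers' getWork(), assuming the bindings declared above
    glsl::memoryBarrierBuffer(); // make other workgroups' scratch writes visible before polling
    if (scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel)
        return false; // every virtual workgroup of this level has already been grabbed
    if (workgroup::SubgroupContiguousIndex() == 0u)
        virtualWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level], 1u); // grab one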
+groupshared uint32_t wgScratch[SharedScratchSz]; + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + +template +struct WGScratchProxy +{ + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) + { + value = wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } + + uint32_t atomicAdd(uint32_t ix, uint32_t val) + { + return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static WGScratchProxy<0> accessor; + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +struct ScanPushConstants +{ + nbl::hlsl::scan::Parameters_t scanParams; + nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams; +}; + +[[vk::push_constant]] +ScanPushConstants spc; + +/** + * Required since we rely on SubgroupContiguousIndex instead of + * gl_LocalInvocationIndex which means to match the global index + * we can't use the gl_GlobalInvocationID but an index based on + * SubgroupContiguousIndex. + */ +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} namespace nbl { @@ -12,39 +71,22 @@ namespace hlsl { namespace scan { -#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ - cbuffer PC // REVIEW: register and packoffset selection - { - Parameters_t scanParams; - DefaultSchedulerParameters_t schedulerParams; - }; -#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ -#endif - -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ Parameters_t getParameters() { - return pc.scanParams; + return spc.scanParams; } -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ DefaultSchedulerParameters_t getSchedulerParameters() { - return pc.schedulerParams; + return spc.schedulerParams; } -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif + } } } -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() +[numthreads(WORKGROUP_SIZE,1,1)] +void main() { - nbl::hlsl::scan::main(); -} -#define _NBL_HLSL_MAIN_DEFINED_ -#endif \ No newline at end of file + nbl::hlsl::scan::main, Storage_t, IS_SCAN, IS_EXCLUSIVE, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); +} \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/indirect.hlsl b/include/nbl/builtin/hlsl/scan/indirect.hlsl index 1191731f65..f330b1393d 100644 --- a/include/nbl/builtin/hlsl/scan/indirect.hlsl +++ b/include/nbl/builtin/hlsl/scan/indirect.hlsl @@ -1,8 +1,3 @@ -#ifndef _NBL_HLSL_WORKGROUP_SIZE_ -#define _NBL_HLSL_WORKGROUP_SIZE_ 256 -#define _NBL_HLSL_WORKGROUP_SIZE_LOG2_ 8 -#endif - #include "nbl/builtin/hlsl/scan/descriptors.hlsl" #include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" #include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" @@ -13,36 +8,33 @@ namespace hlsl { namespace scan { -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ + +uint32_t getIndirectElementCount(); + Parameters_t scanParams; + Parameters_t getParameters() { - return scanParams; + return scanParams; } -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif - -uint getIndirectElementCount(); -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ DefaultSchedulerParameters_t schedulerParams; + DefaultSchedulerParameters_t getSchedulerParameters() 
{ - scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams); + scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams); return schedulerParams; } -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif + } } } -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() +[numthreads(WORKGROUP_SIZE,1,1)] +void main() { - if (bool(nbl::hlsl::scan::getIndirectElementCount())) - nbl::hlsl::scan::main(); + if(bool(nbl::hlsl::scan::getIndirectElementCount())) { + // TODO call main from virtual_workgroup.hlsl + } } -#define _NBL_HLSL_MAIN_DEFINED_ #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl b/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl deleted file mode 100644 index bfeba13be2..0000000000 --- a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ -#define _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ - -#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) - struct Parameters_t { - uint lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; - uint topLevel; - uint temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - } -} -} -} - -#ifdef __cplusplus -#undef uint -#endif - -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl new file mode 100644 index 0000000000..ea283a271f --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl @@ -0,0 +1,69 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
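As with the old GLSL version, indirect.hlsl leaves getIndirectElementCount() as a bare declaration and will not compile until the including shader defines it. A hypothetical user-side definition, following the same out-of-namespace style that direct.hlsl uses for gl_WorkGroupSize() (the binding index and buffer are placeholders, not part of this PR):

    // hypothetical override supplied by the user of indirect.hlsl
    [[vk::binding(2,0)]] StructuredBuffer<uint32_t> elementCountBuf;

    uint32_t nbl::hlsl::scan::getIndirectElementCount()
    {
        return elementCountBuf[0]; // e.g. written by a previous dispatch
    }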
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" + +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" + +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template struct ScanScheduler +{ + + static ScanScheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + ScanScheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() + { + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if (scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex() == 0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level], 1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) { return uberWorkgroupIndex; } + + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0u + ? schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level - 1u]; + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst +}; +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl new file mode 100644 index 0000000000..99effbd88a --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -0,0 +1,110 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_HLSL_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCHEDULER_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scheduler +{ + static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + Scheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template + bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() + { + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex()==0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); + } + + template + bool markDone(NBL_REF_ARG(Accessor) accessor) + { + if (level==(params.topLevel)) // check if we reached the lastLevel + { + // not even sure this is needed + uberWorkgroupIndex = ~0u; + return true; + } + + uint32_t prevLevel = level; + if (workgroup::SubgroupContiguousIndex()==0u) + { + // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset + const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); + //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + { + level++; + } + } + level = workgroup::BroadcastFirst(level, accessor); + return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) + { + return uberWorkgroupIndex; + } + + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0 + ? 
schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + } + + void resetWorkgroupsStartedBuffer() + { + // could do scanScratchBuf[0u].workgroupsStarted[SubgroupContiguousIndex()] = 0u but don't know how many invocations are live during this call + if(workgroup::SubgroupContiguousIndex() == 0u) + { + for(uint32_t i = 0; i < params.topLevel; i++) + { + scanScratchBuf[0u].workgroupsStarted[i] = 0u; + } + } + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst +}; + +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 488bf29012..f9bd38a9e3 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -2,91 +2,128 @@ #define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ // TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor -#include "nbl/builtin/hlsl/limits/numeric.hlsl" -#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/scheduler.hlsl" +#include "nbl/builtin/hlsl/scan/scan_scheduler.hlsl" -#include "nbl/builtin/hlsl/binops.hlsl" - -#if 0 namespace nbl { namespace hlsl { namespace scan -{ - template - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) - { - const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); +{ + /** + * Finds the index on a higher level in the reduce tree which the + * levelInvocationIndex needs to add to level 0 + */ + template + uint32_t2 findHigherLevelIndex(uint32_t level, uint32_t levelInvocationIndex) + { + const uint32_t WgSzLog2 = firstbithigh(WorkgroupSize); + uint32_t rightIndexNonInclusive = (levelInvocationIndex >> (WgSzLog2 * level)); // formula for finding the index. If it underflows it means that there's nothing to do for the level + uint32_t leftIndexInclusive = (rightIndexNonInclusive >> WgSzLog2) << WgSzLog2; + uint32_t add = (levelInvocationIndex & (WgSzLog2 * level - 1)) > 0 ? 
1u : 0u; // if the index mod WorkgroupSize*level > 0 then we must add 1u + return uint32_t2(leftIndexInclusive, rightIndexNonInclusive); // return both the index and the potential underflow information to know we must do nothing + } - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) + { + const Parameters_t params = getParameters(); + const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); + + const bool inRange = levelInvocationIndex <= params.lastElement[treeLevel]; // the lastElement array contains the lastElement's index hence the '<=' - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel + // so that it exits after reaching the top? + + // Seems that even though it's a mem barrier is must NOT diverge! + // This was called inside getData but due to inRange it can possibly diverge + // so it must be called outside! Not yet sure why a mem barrier must be uniformly called. + glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) + + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? + if(inRange) + { + getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel); + } - Storage_t data = Binop::identity(); - if(inRange) - { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); - } + bool doReduce = !isScan; + if(doReduce) + { + data = workgroup::reduction::template __call(data,accessor); + } + else + { + if (isExclusive) + { + data = workgroup::exclusive_scan::template __call(data,accessor); + } + else + { + data = workgroup::inclusive_scan::template __call(data,accessor); + } + + uint32_t reverseLevel = params.topLevel - treeLevel; + while(reverseLevel != ~0u) + { + Storage_t tempData = Binop::identity; + uint32_t2 idxRange = findHigherLevelIndex(reverseLevel--, levelInvocationIndex); + if(workgroup::SubgroupContiguousIndex() < idxRange.y - idxRange.x) + { + tempData = scanScratchBuf[0].data[params.temporaryStorageOffset[reverseLevel] + idxRange.x + workgroup::SubgroupContiguousIndex()]; + } - if(treeLevel < params.topLevel) - { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); - } - // REVIEW: missing _TYPE_ check and extra case here - else if (treeLevel != params.topLevel) - { - data = workgroup::inclusive_scan()(data); - } - else - { - data = workgroup::exclusive_scan()(data); - } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); - } -} -} -} + // we could potentially do an inclusive scan of this part and cache it for other WGs + //tempData = workgroup::inclusive_scan::template __call(tempData,accessor); + data += workgroup::reduction::template __call(tempData,accessor); + } + } + + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, inRange); + } -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? 
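For reference, the workgroup::reduction / inclusive_scan / exclusive_scan calls made by virtualWorkgroup() above all rely on the accessor contract that WGScratchProxy implements in direct.hlsl. A minimal standalone sketch of one such call, assuming the primitives are parameterised on the binop and the workgroup size the way the calls above spell out (accessor and WORKGROUP_SIZE come from direct.hlsl):

    // minimal sketch: inclusive prefix sum of one value per invocation across the workgroup
    uint32_t value = nbl::hlsl::workgroup::SubgroupContiguousIndex(); // any per-invocation input
    value = nbl::hlsl::workgroup::inclusive_scan<nbl::hlsl::plus<uint32_t>, WORKGROUP_SIZE>::template __call<WGScratchProxy<0> >(value, accessor);
    // value now holds the sum of the inputs of all invocations with SubgroupContiguousIndex() <= this one's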
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() - { - const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; - // persistent workgroups - while (true) - { - uint treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) - { - return; - } - - virtualWorkgroup(treeLevel,localWorkgroupIndex); + DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components + template + void main(NBL_REF_ARG(Accessor) accessor) + { + const Parameters_t params = getParameters(); + const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); + + if(!isScan) + { + Scheduler scheduler = Scheduler::create(params, schedulerParams); + // persistent workgroups + + while (true) + { + if (!scheduler.getWork(accessor)) + { + return; + } - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); - } - } + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + if(scheduler.markDone(accessor)) + { + return; + } + } + } + else + { + ScanScheduler scheduler = ScanScheduler::create(params, schedulerParams); + while(scheduler.getWork(accessor)) + { + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + } + } + } } } } -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl index 289c2050fa..d7c66e23d0 100644 --- a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl @@ -38,7 +38,7 @@ T Broadcast(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor, const uint template T BroadcastFirst(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor) { - return Broadcast(val,0); + return Broadcast(val, accessor, 0); } } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h new file mode 100644 index 0000000000..ebeae08181 --- /dev/null +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -0,0 +1,288 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ +#define _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ + + +#include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/utilities/IDescriptorSetCache.h" + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" + + +namespace nbl::video +{ +class CArithmeticOps : public core::IReferenceCounted +{ + public: + // Only 4 byte wide data types supported due to need to trade the via shared memory, + // different combinations of data type and operator have different identity elements. 
+ // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX` + enum E_DATA_TYPE : uint8_t + { + EDT_UINT=0u, + EDT_INT, + EDT_FLOAT, + EDT_COUNT + }; + enum E_OPERATOR : uint8_t + { + EO_AND = 0u, + EO_XOR, + EO_OR, + EO_ADD, + EO_MUL, + EO_MIN, + EO_MAX, + EO_COUNT + }; + + // This struct is only for managing where to store intermediate results of the scans + struct Parameters : nbl::hlsl::scan::Parameters_t // this struct and its methods are also available in HLSL + { + static inline constexpr uint32_t MaxLevels = NBL_BUILTIN_MAX_LEVELS; + + Parameters() + { + std::fill_n(lastElement,MaxLevels/2+1,0u); + std::fill_n(temporaryStorageOffset,MaxLevels/2,0u); + } + // build the constant tables for each level given the number of elements to scan and workgroupSize + Parameters(const uint32_t _elementCount, const uint32_t workgroupSize) : Parameters() + { + assert(_elementCount!=0u && "Input element count can't be 0!"); + const auto maxReductionLog2 = hlsl::findMSB(workgroupSize)*(MaxLevels/2u+1u); + assert(maxReductionLog2>=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); + + lastElement[0u] = _elementCount-1u; + for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) + temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = std::ceil(lastElement[topLevel] / double(workgroupSize)); + + std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); + } + // given already computed tables of lastElement indice s per level, number of levels, and storage offsets, tell us total auxillary buffer size needed + inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) + { + uint32_t uint_count = 1u; // reduceResult field + uint_count += MaxLevels; // workgroup enumerator + uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset + uint_count += lastElement[topLevel]+1u; // starting from the last temporary storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u based) + return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); + } + + inline uint32_t getWorkgroupEnumeratorSize() + { + return MaxLevels * sizeof(uint32_t); + } + }; + + // the default scheduler we provide works as described above in the big documentation block + struct SchedulerParameters : nbl::hlsl::scan::DefaultSchedulerParameters_t // this struct and its methods are also available in HLSL + { + SchedulerParameters() + { + std::fill_n(cumulativeWorkgroupCount,Parameters::MaxLevels,0u); + std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxLevels, 0u); + std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxLevels,0u); + } + + // given the number of elements and workgroup size, figure out how many atomics we need + // also account for the fact that we will want to use the same scratch buffer both for the + // scheduler's atomics and the aux data storage + SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() + { + outScanParams = Parameters(_elementCount,workgroupSize); + const auto topLevel = outScanParams.topLevel; + + std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); + std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); + cumulativeWorkgroupCount[topLevel] = 1u; // the top level will always end up with 1 
workgroup to do the final reduction + for (auto i = 0u; i <= topLevel; i++) { + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! + lastWorkgroupSetCountForLevel[i] = (cumulativeWorkgroupCount[i] - 1u) & (workgroupSize - 1u); + } + + const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, 0u); + std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, workgroupFinishFlagsOffset, 0u); + for (auto i=0u; i= elementCount / workgroupSize^2 + 1 && "Too few workgroups! The workgroup count must be at least the elementCount/(wgSize^2)"); + } + + uint32_t wg_count; + }; + + CArithmeticOps(core::smart_refctd_ptr&& device) : CArithmeticOps(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + + CArithmeticOps(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) + { + assert(core::isPoT(m_workgroupSize)); + const asset::SPushConstantRange pc_range[] = {asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; + const IGPUDescriptorSetLayout::SBinding bindings[2] = { + {.binding = 0u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr }, + { .binding = 1u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr } // scratch + }; + + m_ds_layout = m_device->createDescriptorSetLayout(bindings); + assert(m_ds_layout && "CArithmeticOps Descriptor Set Layout was not created successfully"); + m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); + assert(m_pipeline_layout && "CArithmeticOps Pipeline Layout was not created successfully"); + } + + inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } + + inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } + + inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} + + inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) + { + pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); + dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); + } + + static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) + { + IGPUDescriptorSet::SDescriptorInfo infos[2]; + infos[0].desc = input_range.buffer; + infos[0].info.buffer = {input_range.offset,input_range.size}; + infos[1].desc = scratch_range.buffer; + infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; + + video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (auto i=0u; i<2u; i++) + { + writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; + } + + device->updateDescriptorSets(2, writes, 0u, nullptr); + } + + static inline 
void dispatchHelper( + IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, + std::span bufferBarriers + ) + { + cmdbuf->pushConstants(layout, asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); + if (bufferBarriers.size() > 0) + { + for (uint32_t i = 0; i < bufferBarriers.size(); i++) + { + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, bufferBarriers[i]); + } + } + cmdbuf->dispatch(dispatchInfo.wg_count, 1u, 1u); + } + + inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchElCount) const + { + auto system = m_device->getPhysicalDevice()->getSystem(); + core::smart_refctd_ptr hlsl; + { + auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr + { + nbl::system::ISystem::future_t> future; + system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); + if (future.wait()) + return future.copy(); + return nullptr; + }; + + // hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? + hlsl = loadBuiltinData(shaderFile); + } + auto buffer = asset::ICPUBuffer::create({ {.size = hlsl->getSize()} }); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // REVIEW: Probably combine the commons parts of CReduce and CScanner + const char* storageType = nullptr; + switch (dataType) + { + case EDT_UINT: + storageType = "uint32_t"; + break; + case EDT_INT: + storageType = "int32_t"; + break; + case EDT_FLOAT: + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; + break; + default: + assert(false); + break; + } + + return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_EL_CNT %d\n", m_workgroupSize, storageType, binop, scratchElCount); + } + + inline ILogicalDevice* getDevice() const {return m_device.get();} + protected: + // REVIEW: Does it need an empty destructor? + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_ds_layout; + core::smart_refctd_ptr m_pipeline_layout; + + const uint32_t m_workgroupSize; +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h new file mode 100644 index 0000000000..4a311aa738 --- /dev/null +++ b/include/nbl/video/utilities/CReduce.h @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_REDUCE_H_INCLUDED_ +#define _NBL_VIDEO_C_REDUCE_H_INCLUDED_ + +#include "nbl/video/utilities/CArithmeticOps.h" + +namespace nbl::video +{ + + class NBL_API2 CReduce final : public CArithmeticOps + { + + public: + CReduce(core::smart_refctd_ptr&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CReduce(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const; + + protected: + ~CReduce() + { + // all drop themselves automatically + } + + core::smart_refctd_ptr m_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr m_pipelines[EDT_COUNT][EO_COUNT]; + }; +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 2ae67163eb..d705113f1a 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -8,15 +8,14 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/video/utilities/CArithmeticOps.h" +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!"); namespace nbl::video { -#include "nbl/builtin/glsl/scan/parameters_struct.glsl" -#include "nbl/builtin/glsl/scan/default_scheduler.glsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS&0x1,"NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); - /** Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU. @@ -143,270 +142,41 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... 
**/ -#if 0 // legacy & port to HLSL -class CScanner final : public core::IReferenceCounted +class NBL_API2 CScanner final : public CArithmeticOps { public: enum E_SCAN_TYPE : uint8_t { // computes output[n] = Sum_{i<=n}(input[i]) - EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, + EST_INCLUSIVE = 0u, // computes output[n] = Sum_{i=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); - - lastElement[0u] = _elementCount-1u; - for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) - temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; - - std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); - } - // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed - inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) - { - uint32_t uint_count = 1u; // workgroup enumerator - uint_count += temporaryStorageOffset[MaxScanLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // and its size - return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); - } - }; - // the default scheduler we provide works as described above in the big documentation block - struct SchedulerParameters : nbl_glsl_scan_DefaultSchedulerParameters_t // this struct and its methods are also available in GLSL so you can launch indirect dispatches - { - SchedulerParameters() - { - std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); - std::fill_n(cumulativeWorkgroupCount,Parameters::MaxScanLevels,0u); - } - // given the number of elements and workgroup size, figure out how many atomics we need - // also account for the fact that we will want to use the same scratch buffer both for the - // scheduler's atomics and the aux data storage - SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() - { - outScanParams = Parameters(_elementCount,workgroupSize); - const auto topLevel = outScanParams.topLevel; - - std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); - for (auto i=0u; i<=topLevel; i++) - cumulativeWorkgroupCount[i] += 1u; - std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - - std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); - std::copy_n(cumulativeWorkgroupCount+topLevel,topLevel,finishedFlagOffset+topLevel); - - const auto finishedFlagCount = sizeof(finishedFlagOffset)/sizeof(uint32_t); - const auto finishedFlagsSize = std::accumulate(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,0u); - std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u); - for (auto i=0u; i&& device) : CScanner(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} - // - CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) - { - assert(core::isPoT(m_workgroupSize)); - - const asset::SPushConstantRange pc_range = { asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants) }; - const IGPUDescriptorSetLayout::SBinding bindings[2] = { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, 
IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer - { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch - }; - - m_ds_layout = m_device->createDescriptorSetLayout(bindings,bindings+sizeof(bindings)/sizeof(IGPUDescriptorSetLayout::SBinding)); - m_pipeline_layout = m_device->createPipelineLayout(&pc_range,&pc_range+1,core::smart_refctd_ptr(m_ds_layout)); - } - - // - inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } - - // - inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } - + // You need to override this shader with your own defintion of `nbl_glsl_scan_getIndirectElementCount` for it to even compile, so we always give you a new shader - core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const - { - return createShader(true,scanType,dataType,op); - } + //core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + //{ + // return createShader(true,scanType,dataType,op,scratchSz); + //} // - inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) - { - if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op); - return m_shaders[scanType][dataType][op].get(); - } - // - inline IGPUSpecializedShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) - { - if (!m_specialized_shaders[scanType][dataType][op]) - { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op)); - cpuShader->setFilePathHint("nbl/builtin/glsl/scan/direct.comp"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - - auto gpushader = m_device->createShader(std::move(cpuShader)); - - m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( - gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/glsl/scan/direct.comp" - } - return m_specialized_shaders[scanType][dataType][op].get(); - } - - // - inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) - { - // ondemand - if (!m_pipelines[scanType][dataType][op]) - m_pipelines[scanType][dataType][op] = m_device->createComputePipeline( - nullptr,core::smart_refctd_ptr(m_pipeline_layout), - core::smart_refctd_ptr(getDefaultSpecializedShader(scanType,dataType,op)) - ); - return m_pipelines[scanType][dataType][op].get(); - } - - // - inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} - - // - inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) - { - pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); - dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); - } - - // - static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) - { - IGPUDescriptorSet::SDescriptorInfo infos[2]; - infos[0].desc = input_range.buffer; - infos[0].info.buffer = 
{input_range.offset,input_range.size}; - infos[1].desc = scratch_range.buffer; - infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; - - video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (auto i=0u; i<2u; i++) - { - writes[i].dstSet = set; - writes[i].binding = i; - writes[i].arrayElement = 0u; - writes[i].count = 1u; - writes[i].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER; - writes[i].info = infos+i; - } - - device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - // Half and sizeof(uint32_t) of the scratch buffer need to be cleared to 0s - static inline void dispatchHelper( - IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, - const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, - const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers - ) - { - cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = srcBufferBarrierCount; - info.bufBarriers = srcBufferBarriers; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = dstBufferBarrierCount; - info.bufBarriers = dstBufferBarriers; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - } - - inline ILogicalDevice* getDevice() const {return m_device.get();} - + CScanner(core::smart_refctd_ptr&& device) : CScanner(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUComputePipeline* getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + core::smart_refctd_ptr createShader(/*const bool indirect, */const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; protected: ~CScanner() { // all drop themselves automatically } - core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const; - - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_ds_layout; - core::smart_refctd_ptr m_pipeline_layout; core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; - core::smart_refctd_ptr m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; - const uint32_t m_workgroupSize; }; -#endif } #endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 8ef38fd080..ce2bb22299 100644 --- 
a/include/nbl/video/utilities/IUtilities.h
+++ b/include/nbl/video/utilities/IUtilities.h
@@ -11,6 +11,7 @@
 #include "nbl/video/alloc/StreamingTransientDataBuffer.h"
 #include "nbl/video/utilities/SIntendedSubmitInfo.h"
 #include "nbl/video/utilities/CPropertyPoolHandler.h"
+#include "nbl/video/utilities/CReduce.h"
 #include "nbl/video/utilities/CScanner.h"
 #include "nbl/video/utilities/CComputeBlit.h"
@@ -132,9 +133,10 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
             m_propertyPoolHandler = core::make_smart_refctd_ptr<CPropertyPoolHandler>(core::smart_refctd_ptr(m_device));
             // smaller workgroups fill occupancy gaps better, especially on new Nvidia GPUs, but we don't want too small workgroups on mobile
             // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile
+#endif
             const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u);
+            m_reducer = core::make_smart_refctd_ptr<CReduce>(core::smart_refctd_ptr(m_device), scan_workgroup_size);
             m_scanner = core::make_smart_refctd_ptr<CScanner>(core::smart_refctd_ptr(m_device), scan_workgroup_size);
-#endif
         }
 
         inline ~IUtilities()
@@ -163,13 +165,16 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
         {
             return m_propertyPoolHandler.get();
         }
+#endif
+        virtual CReduce* getDefaultReducer() const
+        {
+            return m_reducer.get();
+        }
 
-        //!
         virtual CScanner* getDefaultScanner() const
         {
             return m_scanner.get();
         }
-#endif
 
         //! This function provides some guards against streamingBuffer fragmentation or allocation failure
         static uint32_t getAllocationSizeForStreamingBuffer(const size_t size, const uint64_t alignment, uint32_t maxFreeBlock, const uint32_t optimalTransferAtom)
         {
@@ -716,8 +721,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted
 #if 0 // TODO: port
         core::smart_refctd_ptr<CPropertyPoolHandler> m_propertyPoolHandler;
-        core::smart_refctd_ptr<CScanner> m_scanner;
 #endif
+        core::smart_refctd_ptr<CReduce> m_reducer;
+        core::smart_refctd_ptr<CScanner> m_scanner;
 };
 
 }
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index ad464bd035..c6df0949de 100755
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -227,6 +227,7 @@ set(NBL_VIDEO_SOURCES
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/IUtilities.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CPropertyPoolHandler.cpp
+	${NBL_ROOT_PATH}/src/nbl/video/utilities/CReduce.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CScanner.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CComputeBlit.cpp
 	${NBL_ROOT_PATH}/src/nbl/video/utilities/CAssetConverter.cpp
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 9333a0d3b4..023c673d3c 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -334,13 +334,21 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h")
 #workgroup
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/arithmetic.hlsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl")
+#global scan
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/declarations.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/default_scheduler.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/descriptors.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/direct.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scan_scheduler.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scheduler.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/virtual_workgroup.hlsl")
 #Extensions
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl")
diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp
new file mode 100644
index 0000000000..16b5cc5e29
--- /dev/null
+++ b/src/nbl/video/utilities/CReduce.cpp
@@ -0,0 +1,56 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/video/utilities/CReduce.h"
+
+namespace nbl::video
+{
+
+asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount)
+{
+    if (!m_shaders[dataType][op])
+        m_shaders[dataType][op] = createShader(dataType,op,scratchElCount);
+    return m_shaders[dataType][op].get();
+}
+
+IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount)
+{
+    if (!m_specialized_shaders[dataType][op])
+    {
+        auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(dataType,op,scratchElCount));
+        cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl");
+        cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE);
+
+        auto gpushader = m_device->createShader(cpuShader.get());
+
+        m_specialized_shaders[dataType][op] = gpushader;
+    }
+    return m_specialized_shaders[dataType][op].get();
+}
+
+IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount)
+{
+    // ondemand
+    if (!m_pipelines[dataType][op]) {
+        IGPUComputePipeline::SCreationParams params = {};
+        params.layout = m_pipeline_layout.get();
+        // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
+        params.shader.entryPoint = "main";
+        params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchElCount);
+
+        m_device->createComputePipelines(
+            nullptr, { &params,1 },
+            &m_pipelines[dataType][op]
+        );
+    }
+    return m_pipelines[dataType][op].get();
+}
+
+core::smart_refctd_ptr<asset::ICPUShader> CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const
+{
+    core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchElCount);
+    return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false");
+}
+
+}
\ No newline at end of file
diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp
index ec31272358..3cdcc82ee7 100644
--- a/src/nbl/video/utilities/CScanner.cpp
+++ b/src/nbl/video/utilities/CScanner.cpp
@@ -1,52 +1,52 @@
 #include "nbl/video/utilities/CScanner.h"
-using namespace nbl;
-using namespace video;
+namespace nbl::video
+{
+
+asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
+{
+    if (!m_shaders[scanType][dataType][op])
+        m_shaders[scanType][dataType][op] = createShader(scanType,dataType,op,scratchSz);
+    return m_shaders[scanType][dataType][op].get();
+}
-#if 0 // TODO: port
-core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const
+IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
 {
-    auto system = m_device->getPhysicalDevice()->getSystem();
-    core::smart_refctd_ptr glsl;
-    {
-        auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr
-        {
-            nbl::system::ISystem::future_t> future;
-            system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE);
-            if (future.wait())
-                return future.copy();
-            return nullptr;
-        };
+    if (!m_specialized_shaders[scanType][dataType][op])
+    {
+        auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(scanType,dataType,op,scratchSz));
+        cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl");
+        cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE);
+
+        auto gpushader = m_device->createShader(cpuShader.get());
+
+        m_specialized_shaders[scanType][dataType][op] = gpushader;
+    }
+    return m_specialized_shaders[scanType][dataType][op].get();
+}
-        if(indirect)
-            glsl = loadBuiltinData("nbl/builtin/glsl/scan/indirect.comp");
-        else
-            glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp");
-    }
-    auto buffer = core::make_smart_refctd_ptr(glsl->getSize());
-    memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize());
-    auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????");
-    const char* storageType = nullptr;
-    switch (dataType)
-    {
-        case EDT_UINT:
-            storageType = "uint";
-            break;
-        case EDT_INT:
-            storageType = "int";
-            break;
-        case EDT_FLOAT:
-            storageType = "float";
-            break;
-        default:
-            assert(false);
-            break;
-    }
+IGPUComputePipeline* CScanner::getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
+{
+    // ondemand
+    if (!m_pipelines[scanType][dataType][op]) {
+        IGPUComputePipeline::SCreationParams params = {};
+        params.layout = m_pipeline_layout.get();
+        // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
+        params.shader.entryPoint = "main";
+        params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz);
-    return asset::CGLSLCompiler::createOverridenCopy(
-        cpushader.get(),
-        "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ %d\n#define _NBL_GLSL_SCAN_TYPE_ %d\n#define _NBL_GLSL_SCAN_STORAGE_TYPE_ %s\n#define _NBL_GLSL_SCAN_BIN_OP_ %d\n",
-        m_workgroupSize,hlsl::findMSB(m_workgroupSize),uint32_t(scanType),storageType,uint32_t(op)
-    );
+        m_device->createComputePipelines(
+            nullptr, { &params,1 },
+            &m_pipelines[scanType][dataType][op]
+        );
+    }
+    return m_pipelines[scanType][dataType][op].get();
 }
-#endif
\ No newline at end of file
+
+core::smart_refctd_ptr<asset::ICPUShader> CScanner::createShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const
+{
+    core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz);
+    return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? "true" : "false", "true");
+}
+
+}
\ No newline at end of file
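
A rough usage sketch of the new reduce/scan entry points introduced by this patch, not part of the diff itself. Assumptions are hedged: the `EDT_UINT`/`EO_ADD` enumerator names are taken from the old CScanner conventions and are presumed to live on `CArithmeticOps`, `utils` is an already-constructed `IUtilities`, and the scratch element count plus descriptor/dispatch setup are left to the caller.

    #include "nbl/video/utilities/IUtilities.h"

    // Sketch only: fetch the lazily-created default pipelines exposed by the new
    // CReduce/CScanner utilities. `scratchElCount` is whatever the CArithmeticOps
    // scratch layout requires for the element count being processed.
    void getArithmeticPipelines(
        nbl::video::IUtilities* utils, const uint32_t scratchElCount,
        nbl::video::IGPUComputePipeline*& outReduce, nbl::video::IGPUComputePipeline*& outScan)
    {
        using namespace nbl::video;
        CReduce* reducer = utils->getDefaultReducer();
        CScanner* scanner = utils->getDefaultScanner();
        // CReduce::createShader overrides the HLSL prelude with IS_EXCLUSIVE=false, IS_SCAN=false
        outReduce = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, scratchElCount);
        // CScanner::createShader sets IS_SCAN=true and IS_EXCLUSIVE according to the scan type
        outScan = scanner->getDefaultPipeline(CScanner::EST_EXCLUSIVE, CScanner::EDT_UINT, CScanner::EO_ADD, scratchElCount);
        // binding the pipelines, updating descriptor sets and recording the dispatch are unchanged and omitted here
    }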