diff --git a/examples_tests b/examples_tests
index e5d5ae2ca9..2949212cba 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit e5d5ae2ca9137a6966d00aa039f3e6dae7c23fb9
+Subproject commit 2949212cbacb5e31ea213808c545e93df91d9a59
diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h
new file mode 100644
index 0000000000..355d792782
--- /dev/null
+++ b/include/nbl/asset/utils/IMeshPacker.h
@@ -0,0 +1,636 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__
+#define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__
+
+#include "nbl/asset/utils/IMeshManipulator.h"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+
+namespace nbl
+{
+namespace asset
+{
+
+class IMeshPackerBase : public virtual core::IReferenceCounted
+{
+    public:
+        constexpr static uint32_t MAX_TRIANGLES_IN_BATCH_CNT = 21845u;
+
+        struct ReservedAllocationMeshBuffersBase
+        {
+            uint32_t mdiAllocationOffset;
+            uint32_t mdiAllocationReservedCnt;
+            uint32_t indexAllocationOffset;
+            uint32_t indexAllocationReservedCnt;
+
+            inline bool isValid()
+            {
+                return this->mdiAllocationOffset!=core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address;
+            }
+        };
+        struct PackedMeshBufferData
+        {
+            uint32_t mdiParameterOffset; // add to `CCPUMeshPacker::getMultiDrawIndirectBuffer()->getPointer()` to get `DrawElementsIndirectCommand_t` address
+            uint32_t mdiParameterCount;
+
+            inline bool isValid()
+            {
+                return this->mdiParameterOffset != core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address;
+            }
+        };
+
+        inline uint16_t getMinTriangleCountPerMDI() const { return m_minTriangleCountPerMDIData; }
+        inline uint16_t getMaxTriangleCountPerMDI() const { return m_maxTriangleCountPerMDIData; }
+
+    protected:
+        using alctrTraits = core::address_allocator_traits<core::GeneralpurposeAddressAllocator<uint32_t>>;
+
+        IMeshPackerBase(uint16_t minTriangleCountPerMDIData, uint16_t maxTriangleCountPerMDIData)
+            :m_maxTriangleCountPerMDIData(maxTriangleCountPerMDIData),
+             m_minTriangleCountPerMDIData(minTriangleCountPerMDIData)
+        {
+            assert(minTriangleCountPerMDIData <= MAX_TRIANGLES_IN_BATCH_CNT);
+            assert(maxTriangleCountPerMDIData <= MAX_TRIANGLES_IN_BATCH_CNT);
+            assert(minTriangleCountPerMDIData <= maxTriangleCountPerMDIData);
+            assert(minTriangleCountPerMDIData > 0u);
+            assert(maxTriangleCountPerMDIData > 0u);
+        };
+
+        virtual ~IMeshPackerBase()
+        {
+            _NBL_ALIGNED_FREE(const_cast<void*>(alctrTraits::getReservedSpacePtr(m_MDIDataAlctr)));
+            _NBL_ALIGNED_FREE(const_cast<void*>(alctrTraits::getReservedSpacePtr(m_idxBuffAlctr)));
+            _NBL_ALIGNED_FREE(const_cast<void*>(alctrTraits::getReservedSpacePtr(m_vtxBuffAlctr)));
+        }
+
+        struct AllocationParamsCommon
+        {
+            // Maximum number of 16 bit indices that may be allocated
+            size_t indexBuffSupportedCnt = 67108864ull; /* 128MB */
+
+            /* Maximum byte size for vertex data allocation
+               For `CCPUMeshPackerV1` this will be the maximum byte size of a buffer containing only attributes with EVIR_PER_VERTEX input rate.
+               For `CCPUMeshPackerV2` this will be the maximum byte size of a buffer containing attributes with both EVIR_PER_VERTEX and EVIR_PER_INSTANCE input rates.
+            */
+            size_t vertexBuffSupportedByteSize = 134217728ull; /* 128MB */
+
+            // Maximum number of MDI structs that may be allocated
+            size_t MDIDataBuffSupportedCnt = 16777216ull; /* 16MB assuming MDIStructType is DrawElementsIndirectCommand_t */
+
+            // Minimum count of 16 bit indices allocated per allocation
+            size_t indexBufferMinAllocCnt = 256ull;
+
+            // Minimum bytes of vertex data allocated per allocation
+            size_t vertexBufferMinAllocByteSize = 32ull;
+
+            // Minimum count of MDI structs allocated per allocation
+            size_t MDIDataBuffMinAllocCnt = 32ull;
+        };
+
+        void initializeCommonAllocators(const AllocationParamsCommon& allocParams)
+        {
+            if (allocParams.indexBuffSupportedCnt)
+            {
+                void* resSpcTmp = _NBL_ALIGNED_MALLOC(core::GeneralpurposeAddressAllocator<uint32_t>::reserved_size(alignof(uint16_t), allocParams.indexBuffSupportedCnt, allocParams.indexBufferMinAllocCnt), _NBL_SIMD_ALIGNMENT);
+                assert(resSpcTmp != nullptr);
+                m_idxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(resSpcTmp, 0u, 0u, alignof(uint16_t), allocParams.indexBuffSupportedCnt, allocParams.indexBufferMinAllocCnt);
+            }
+
+            if (allocParams.vertexBuffSupportedByteSize)
+            {
+                void* resSpcTmp = _NBL_ALIGNED_MALLOC(core::GeneralpurposeAddressAllocator<uint32_t>::reserved_size(32u, allocParams.vertexBuffSupportedByteSize, allocParams.vertexBufferMinAllocByteSize), _NBL_SIMD_ALIGNMENT);
+                assert(resSpcTmp != nullptr);
+                m_vtxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(resSpcTmp, 0u, 0u, 32u, allocParams.vertexBuffSupportedByteSize, allocParams.vertexBufferMinAllocByteSize);
+            }
+
+            if (allocParams.MDIDataBuffSupportedCnt)
+            {
+                void* resSpcTmp = _NBL_ALIGNED_MALLOC(core::GeneralpurposeAddressAllocator<uint32_t>::reserved_size(alignof(std::max_align_t), allocParams.MDIDataBuffSupportedCnt, allocParams.MDIDataBuffMinAllocCnt), _NBL_SIMD_ALIGNMENT);
+                assert(resSpcTmp != nullptr);
+                m_MDIDataAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(resSpcTmp, 0u, 0u, alignof(std::max_align_t), allocParams.MDIDataBuffSupportedCnt, allocParams.MDIDataBuffMinAllocCnt);
+            }
+        }
+
+        void initializeCommonAllocators(
+            const core::GeneralpurposeAddressAllocator<uint32_t>& mdiAlctr,
+            const core::GeneralpurposeAddressAllocator<uint32_t>& idxAlctr,
+            const core::GeneralpurposeAddressAllocator<uint32_t>& vtxAlctr
+        )
+        {
+            uint32_t alctrBuffSz = alctrTraits::get_total_size(mdiAlctr);
+            void* resSpcTmp = _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(alctrBuffSz, mdiAlctr), _NBL_SIMD_ALIGNMENT);
+            m_MDIDataAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(alctrBuffSz, mdiAlctr, resSpcTmp);
+
+            alctrBuffSz = alctrTraits::get_total_size(idxAlctr);
+            resSpcTmp = _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(alctrBuffSz, idxAlctr), _NBL_SIMD_ALIGNMENT);
+            m_idxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(alctrBuffSz, idxAlctr, resSpcTmp);
+
+            alctrBuffSz = alctrTraits::get_total_size(vtxAlctr);
+            resSpcTmp = _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(alctrBuffSz, vtxAlctr), _NBL_SIMD_ALIGNMENT);
+            m_vtxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(alctrBuffSz, vtxAlctr, resSpcTmp);
+        }
+
+        void free(const ReservedAllocationMeshBuffersBase& rambb)
+        {
+            if (rambb.indexAllocationOffset != INVALID_ADDRESS)
+                m_idxBuffAlctr.free_addr(rambb.indexAllocationOffset,rambb.indexAllocationReservedCnt);
+
+            if (rambb.mdiAllocationOffset != INVALID_ADDRESS)
+                m_MDIDataAlctr.free_addr(rambb.mdiAllocationOffset,rambb.mdiAllocationReservedCnt);
+        }
+
+        //
+        _NBL_STATIC_INLINE_CONSTEXPR uint32_t INVALID_ADDRESS = core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address;
+
+        core::GeneralpurposeAddressAllocator<uint32_t> m_vtxBuffAlctr;
+        core::GeneralpurposeAddressAllocator<uint32_t> m_idxBuffAlctr;
+        core::GeneralpurposeAddressAllocator<uint32_t> m_MDIDataAlctr;
+
+        const uint16_t m_minTriangleCountPerMDIData;
+        const uint16_t m_maxTriangleCountPerMDIData;
+
+};
+
+#if 0 // REWRITE
+template <typename MeshBufferType, typename MDIStructType = DrawElementsIndirectCommand_t>
+class IMeshPacker : public IMeshPackerBase
+{
+    static_assert(std::is_base_of<DrawElementsIndirectCommand_t, MDIStructType>::value);
+
+public:
+    /*
+    @param minTriangleCountPerMDIData must be <= 21845
+    @param maxTriangleCountPerMDIData must be <= 21845
+    */
+    IMeshPacker(uint16_t minTriangleCountPerMDIData, uint16_t maxTriangleCountPerMDIData)
+        :IMeshPackerBase(minTriangleCountPerMDIData, maxTriangleCountPerMDIData)
+    {
+    }
+
+    //! shrinks byte size of all output buffers, so they are large enough to fit currently allocated contents. Call this function before `instantiateDataStorage`
+    virtual void shrinkOutputBuffersSize()
+    {
+        uint32_t mdiDataBuffNewSize = m_MDIDataAlctr.safe_shrink_size(0u, alctrTraits::max_alignment(m_MDIDataAlctr));
+        uint32_t idxBuffNewSize = m_idxBuffAlctr.safe_shrink_size(0u, alctrTraits::max_alignment(m_idxBuffAlctr));
+        uint32_t vtxBuffNewSize = m_vtxBuffAlctr.safe_shrink_size(0u, alctrTraits::max_alignment(m_vtxBuffAlctr));
+
+        const void* oldReserved = alctrTraits::getReservedSpacePtr(m_MDIDataAlctr);
+        m_MDIDataAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(mdiDataBuffNewSize, std::move(m_MDIDataAlctr), _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(mdiDataBuffNewSize, m_MDIDataAlctr), _NBL_SIMD_ALIGNMENT));
+        _NBL_ALIGNED_FREE(const_cast<void*>(oldReserved));
+
+        oldReserved = alctrTraits::getReservedSpacePtr(m_idxBuffAlctr);
+        m_idxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(idxBuffNewSize, std::move(m_idxBuffAlctr), _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(idxBuffNewSize, m_idxBuffAlctr), _NBL_SIMD_ALIGNMENT));
+        _NBL_ALIGNED_FREE(const_cast<void*>(oldReserved));
+
+        oldReserved = alctrTraits::getReservedSpacePtr(m_vtxBuffAlctr);
+        m_vtxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(vtxBuffNewSize, std::move(m_vtxBuffAlctr), _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(vtxBuffNewSize, m_vtxBuffAlctr), _NBL_SIMD_ALIGNMENT));
+        _NBL_ALIGNED_FREE(const_cast<void*>(oldReserved));
+    }
+
+    //! Returns the maximum number of MDI structs needed to draw the range of mesh buffers described by mbBegin .. mbEnd, the actual number of MDI structs needed may differ
+    template <typename MeshBufferIterator>
+    uint32_t calcMDIStructMaxCount(const MeshBufferIterator mbBegin, const MeshBufferIterator mbEnd)
+    {
+        uint32_t acc = 0u;
+        for (auto mbIt = mbBegin; mbIt != mbEnd; mbIt++)
+        {
+            auto mb = *mbIt;
+            const size_t idxCnt = calcIdxCntAfterConversionToTriangleList(mb);
+            const uint32_t triCnt = idxCnt / 3;
+            assert(idxCnt % 3 == 0);
+
+            acc += calcBatchCountBound(triCnt);
+        }
+
+        return acc;
+    }
+
+protected:
+    virtual ~IMeshPacker() {}
+
+    static inline size_t calcVertexSize(const SVertexInputParams& vtxInputParams, const E_VERTEX_INPUT_RATE inputRate)
+    {
+        size_t size = 0ull;
+        for (size_t i = 0; i < SVertexInputParams::MAX_VERTEX_ATTRIB_COUNT; ++i)
+        {
+            if (vtxInputParams.enabledAttribFlags & (1u << i))
+                if (vtxInputParams.bindings[i].inputRate == inputRate)
+                    size += asset::getTexelOrBlockBytesize(static_cast<E_FORMAT>(vtxInputParams.attributes[i].format));
+        }
+
+        return size;
+    }
+
+    static inline uint32_t calcVertexCountBoundWithBatchDuplication(const MeshBufferType* meshBuffer)
+    {
+        uint32_t triCnt;
+        if (IMeshManipulator::getPolyCount(triCnt,meshBuffer))
+            return triCnt * 3u;
+        return 0u;
+    }
+
+    inline uint32_t calcBatchCountBound(uint32_t triCnt) const
+    {
+        if (triCnt!=0u)
+            return (triCnt-1u)/m_minTriangleCountPerMDIData+1u;
+        return 0u;
+    }
+
+    struct Triangle
+    {
+        uint32_t oldIndices[3];
+    };
+
+    struct TriangleBatches
+    {
+        TriangleBatches(uint32_t triCnt)
+        {
+            triangles = core::vector<Triangle>(triCnt);
+        }
+
+        core::vector<Triangle> triangles;
+        core::vector<Triangle*> ranges;
+    };
+
+    struct IdxBufferParams
+    {
+        SBufferBinding<ICPUBuffer> idxBuffer = { 0u, nullptr };
+        E_INDEX_TYPE idxType = EIT_UNKNOWN;
+    };
+
+    //TODO: functions: constructTriangleBatches, convertIdxBufferToTriangles, deinterleaveAndCopyAttribute and deinterleaveAndCopyPerInstanceAttribute
+    //will not work with IGPUMeshBuffer as MeshBufferType, move them to a new `ICPUMeshPacker`
+
+    TriangleBatches constructTriangleBatches(const MeshBufferType* meshBuffer, IdxBufferParams idxBufferParams, core::aabbox3df*& aabbs) const
+    {
+        uint32_t triCnt;
+        const bool success = IMeshManipulator::getPolyCount(triCnt,meshBuffer);
+        assert(success);
+
+        const uint32_t batchCnt = calcBatchCountBound(triCnt);
+        assert(batchCnt != 0u);
+
+        struct MortonTriangle
+        {
+            MortonTriangle() = default;
+
+            MortonTriangle(uint16_t fixedPointPos[3], float area)
+            {
+                auto tmp = reinterpret_cast<uint16_t*>(&key);
+                std::copy_n(fixedPointPos,3u,tmp);
+                tmp[3] = core::Float16Compressor::compress(area);
+            }
+
+            void complete(float maxArea)
+            {
+                auto tmp = reinterpret_cast<uint16_t*>(&key);
+                const float area = core::Float16Compressor::decompress(tmp[3]);
+                const float scale = 0.5f; // square root
+                uint16_t logRelArea = uint16_t(65535.5f+core::clamp(scale*std::log2f(area/maxArea),-65535.5f,0.f));
+                key = core::morton4d_encode(tmp[0],tmp[1],tmp[2],logRelArea);
+            }
+
+            uint64_t key;
+        };
+
+        //TODO: use SoA instead (with core::radix_sort):
+        //core::vector<Triangle> triangles;
+        //core::vector<uint64_t> triangleMortonCodes;
+        //where `triangles` is a member of the `TriangleBatch` struct
+        struct TriangleMortonCodePair
+        {
+            Triangle triangle;
+            MortonTriangle mortonCode;
+
+            inline bool operator<(const TriangleMortonCodePair& other)
+            {
+                return this->mortonCode.key < other.mortonCode.key;
+            }
+        };
+
+        TriangleBatches triangleBatches(triCnt);
+        core::vector<TriangleMortonCodePair> triangles(triCnt); //#1
+
+        core::smart_refctd_ptr<MeshBufferType> mbTmp = core::smart_refctd_ptr_static_cast<MeshBufferType>(meshBuffer->clone());
+        mbTmp->setIndexBufferBinding(std::move(idxBufferParams.idxBuffer));
+        mbTmp->setIndexType(idxBufferParams.idxType);
+        mbTmp->getPipeline()->getPrimitiveAssemblyParams().primitiveType = EPT_TRIANGLE_LIST;
+
+        //triangle reordering
+        {
+            const core::aabbox3df aabb = IMeshManipulator::calculateBoundingBox(mbTmp.get());
+
+            uint32_t ix = 0u;
+            float maxTriangleArea = 0.0f;
+            for (auto it = triangles.begin(); it != triangles.end(); it++)
+            {
+                auto triangleIndices = IMeshManipulator::getTriangleIndices(mbTmp.get(), ix++);
+                //have to copy here
+                std::copy(triangleIndices.begin(), triangleIndices.end(), it->triangle.oldIndices);
+
+                core::vectorSIMDf trianglePos[3];
+                trianglePos[0] = mbTmp->getPosition(it->triangle.oldIndices[0]);
+                trianglePos[1] = mbTmp->getPosition(it->triangle.oldIndices[1]);
+                trianglePos[2] = mbTmp->getPosition(it->triangle.oldIndices[2]);
+
+                const core::vectorSIMDf centroid = ((trianglePos[0] + trianglePos[1] + trianglePos[2]) / 3.0f) - core::vectorSIMDf(aabb.MinEdge.X, aabb.MinEdge.Y, aabb.MinEdge.Z);
+                uint16_t fixedPointPos[3];
+                fixedPointPos[0] = uint16_t(centroid.x * 65535.5f / aabb.getExtent().X);
+                fixedPointPos[1] = uint16_t(centroid.y * 65535.5f / aabb.getExtent().Y);
+                fixedPointPos[2] = uint16_t(centroid.z * 65535.5f / aabb.getExtent().Z);
+
+                float area = core::cross(trianglePos[1] - trianglePos[0], trianglePos[2] - trianglePos[0]).x;
+                it->mortonCode = MortonTriangle(fixedPointPos, area);
+
+                if (area > maxTriangleArea)
+                    maxTriangleArea = area;
+            }
+
+            //complete morton code
+            for (auto it = triangles.begin(); it != triangles.end(); it++)
+                it->mortonCode.complete(maxTriangleArea);
+
+            std::sort(triangles.begin(), triangles.end());
+        }
+
+        //copying, after radix_sort this will be removed
+        //TODO during radix_sort integration:
+        //since there will be distinct arrays for triangles and their morton codes use `triangleBatches.triangles` instead of #1
+        for (uint32_t i = 0u; i < triCnt; i++)
+            triangleBatches.triangles[i] = triangles[i].triangle;
+
+        //set ranges
+        Triangle* triangleArrayBegin = triangleBatches.triangles.data();
+        Triangle* triangleArrayEnd = triangleArrayBegin + triangleBatches.triangles.size();
+        const uint32_t triangleCnt = triangleBatches.triangles.size();
+
+        //aabb batch division
+        {
+            triangleBatches.ranges.push_back(triangleArrayBegin);
+            for (auto nextTriangle = triangleArrayBegin; nextTriangle < triangleArrayEnd; )
+            {
+                const Triangle* batchBegin = *(triangleBatches.ranges.end() - 1u);
+                const Triangle* batchEnd = batchBegin + m_minTriangleCountPerMDIData;
+
+                //find min and max edge
+                core::vector3df_SIMD min(std::numeric_limits<float>::max());
+                core::vector3df_SIMD max(-std::numeric_limits<float>::max());
+
+                auto extendAABB = [&min, &max, &meshBuffer](auto triangleIt) -> void
+                {
+                    for (uint32_t i = 0u; i < 3u; i++)
+                    {
+                        auto vxPos = meshBuffer->getPosition(triangleIt->oldIndices[i]);
+                        min = core::min(vxPos, min);
+                        max = core::max(vxPos, max);
+                    }
+                };
+
+                for (uint32_t i = 0u; i < m_minTriangleCountPerMDIData && nextTriangle != triangleArrayEnd; i++)
+                    extendAABB(nextTriangle++);
+
+                auto halfAreaAABB = [&min, &max]() -> float
+                {
+                    auto extent = max - min;
+                    return extent.x * extent.y + extent.x * extent.z + extent.y * extent.z;
+                };
+
+                constexpr float kGrowthLimit = 1.025f;
+                float batchArea = halfAreaAABB();
+                for (uint16_t i = m_minTriangleCountPerMDIData; nextTriangle != triangleArrayEnd && i < m_maxTriangleCountPerMDIData; i++)
+                {
+                    if (aabbs)
+                        *aabbs = core::aabbox3df(core::vector3df(min.x, min.y, min.z), core::vector3df(max.x, max.y, max.z));
+
+                    extendAABB(nextTriangle);
+                    float newBatchArea = halfAreaAABB();
+                    if (newBatchArea > kGrowthLimit*batchArea)
+                        break;
+                    nextTriangle++;
+                    batchArea = newBatchArea;
+                }
+
+                if (aabbs)
+                {
+                    if (nextTriangle == triangleArrayEnd || m_minTriangleCountPerMDIData == m_maxTriangleCountPerMDIData)
+                        *aabbs = core::aabbox3df(core::vector3df(min.x, min.y, min.z), core::vector3df(max.x, max.y, max.z));
+                    aabbs++;
+                }
+
+                triangleBatches.ranges.push_back(nextTriangle);
+            }
+
+        }
+
+        return triangleBatches;
+    }
+
+    static core::unordered_map<uint32_t, uint16_t> constructNewIndicesFromTriangleBatchAndUpdateUnifiedIndexBuffer(TriangleBatches& batches, uint32_t batchIdx, uint16_t*& indexBuffPtr)
+    {
+        core::unordered_map<uint32_t, uint16_t> usedVertices;
+        core::vector<Triangle> newIdxTris = batches.triangles;
+
+        auto batchBegin = batches.ranges[batchIdx];
+        auto batchEnd = batches.ranges[batchIdx + 1];
+
+        const uint32_t triangleInBatchCnt = std::distance(batchBegin, batchEnd);
+        const uint32_t idxInBatchCnt = 3u * triangleInBatchCnt;
+
+        uint32_t newIdx = 0u;
+        for (uint32_t i = 0u; i < triangleInBatchCnt; i++)
+        {
+            const Triangle* const triangle = batchBegin + i;
+            for (int32_t j = 0; j < 3; j++)
+            {
+                const uint32_t oldIndex = triangle->oldIndices[j];
+                auto result = usedVertices.insert(std::make_pair(oldIndex, newIdx));
+
+                newIdxTris[i].oldIndices[j] = result.second ? newIdx++ : result.first->second;
+            }
+        }
+
+        //TODO: cache optimization
+        //copy indices into the unified index buffer
+        for (size_t i = 0; i < triangleInBatchCnt; i++)
+        {
+            for (int j = 0; j < 3; j++)
+            {
+                *indexBuffPtr = newIdxTris[i].oldIndices[j];
+                indexBuffPtr++;
+            }
+        }
+
+        return usedVertices;
+    }
+
+    static void deinterleaveAndCopyAttribute(MeshBufferType* meshBuffer, uint16_t attrLocation, const core::unordered_map<uint32_t, uint16_t>& usedVertices, uint8_t* dstAttrPtr)
+    {
+        const uint8_t* const srcAttrPtr = meshBuffer->getAttribPointer(attrLocation);
+        SVertexInputParams& mbVtxInputParams = meshBuffer->getPipeline()->getVertexInputParams();
+        SVertexInputAttribParams MBAttrib = mbVtxInputParams.attributes[attrLocation];
+        SVertexInputBindingParams attribBinding = mbVtxInputParams.bindings[MBAttrib.binding];
+        const size_t attrSize = asset::getTexelOrBlockBytesize(static_cast<E_FORMAT>(MBAttrib.format));
+        const size_t stride = (attribBinding.stride) == 0 ? attrSize : attribBinding.stride;
+
+        for (auto index : usedVertices)
+        {
+            const uint8_t* attrSrc = srcAttrPtr + (index.first * stride);
+            uint8_t* attrDest = dstAttrPtr + (index.second * attrSize);
+            memcpy(attrDest, attrSrc, attrSize);
+        }
+    }
+
+    static void deinterleaveAndCopyPerInstanceAttribute(MeshBufferType* meshBuffer, uint16_t attrLocation, uint8_t* dstAttrPtr)
+    {
+        const uint8_t* const srcAttrPtr = meshBuffer->getAttribPointer(attrLocation);
+        SVertexInputParams& mbVtxInputParams = meshBuffer->getPipeline()->getVertexInputParams();
+        SVertexInputAttribParams MBAttrib = mbVtxInputParams.attributes[attrLocation];
+        SVertexInputBindingParams attribBinding = mbVtxInputParams.bindings[MBAttrib.binding];
+        const size_t attrSize = asset::getTexelOrBlockBytesize(static_cast<E_FORMAT>(MBAttrib.format));
+        const size_t stride = (attribBinding.stride) == 0 ? attrSize : attribBinding.stride;
+
+        const uint32_t insCnt = meshBuffer->getInstanceCount();
+        for (uint32_t i = 0u; i < insCnt; i++)
+        {
+            const uint8_t* attrSrc = srcAttrPtr + (i * stride);
+            uint8_t* attrDest = dstAttrPtr + (i * attrSize);
+            memcpy(attrDest, attrSrc, attrSize);
+        }
+    }
+
+    inline uint32_t calcIdxCntAfterConversionToTriangleList(const MeshBufferType* meshBuffer)
+    {
+        const auto& params = meshBuffer->getPipeline()->getPrimitiveAssemblyParams();
+
+        switch (params.primitiveType)
+        {
+        case EPT_TRIANGLE_LIST:
+        case EPT_TRIANGLE_STRIP:
+        case EPT_TRIANGLE_FAN:
+            break;
+        case EPT_POINT_LIST:
+        case EPT_LINE_LIST:
+        case EPT_LINE_STRIP:
+        case EPT_LINE_LIST_WITH_ADJACENCY:
+        case EPT_LINE_STRIP_WITH_ADJACENCY:
+        case EPT_TRIANGLE_LIST_WITH_ADJACENCY:
+        case EPT_TRIANGLE_STRIP_WITH_ADJACENCY:
+        case EPT_PATCH_LIST:
+        default:
+            assert(false);
+            break;
+        }
+
+        uint32_t triCnt;
+        const bool success = IMeshManipulator::getPolyCount(triCnt, meshBuffer);
+        assert(success);
+
+        return triCnt * 3;
+    }
+    inline uint32_t calcIdxCntAfterConversionToTriangleList(const core::smart_refctd_ptr<MeshBufferType>& meshBuffer)
+    {
+        return calcIdxCntAfterConversionToTriangleList(meshBuffer.get());
+    }
+    inline uint32_t calcIdxCntAfterConversionToTriangleList(const core::smart_refctd_ptr<const MeshBufferType>& meshBuffer)
+    {
+        return calcIdxCntAfterConversionToTriangleList(meshBuffer.get());
+    }
+
+    std::pair<uint32_t, core::smart_refctd_ptr<ICPUBuffer>> convertIdxBufferToTriangles(MeshBufferType* meshBuffer)
+    {
+        const auto mbIdxBuffer = meshBuffer->getIndexBufferBinding().buffer;
+        E_INDEX_TYPE idxType = meshBuffer->getIndexType();
+        const uint32_t idxCount = meshBuffer->getIndexCount();
+        if (idxCount == 0)
+            return { 0u, nullptr };
+
+        const bool iota = idxType == EIT_UNKNOWN || !mbIdxBuffer;
+        core::smart_refctd_ptr<ICPUBuffer> idxBufferToProcess;
+        if (iota)
+        {
+            idxBufferToProcess = core::make_smart_refctd_ptr<ICPUBuffer>(sizeof(uint32_t) * idxCount);
+            auto ptr = reinterpret_cast<uint32_t*>(idxBufferToProcess->getPointer());
+            std::iota(ptr, ptr + idxCount, 0u);
+            idxType = EIT_32BIT;
+        }
+        else
+        {
+            idxBufferToProcess = mbIdxBuffer;
+        }
+
+        std::pair<uint32_t, core::smart_refctd_ptr<ICPUBuffer>> output;
+        output.first = meshBuffer->getIndexCount();
+
+        const auto& params = meshBuffer->getPipeline()->getPrimitiveAssemblyParams();
+        switch (params.primitiveType)
+        {
+        case EPT_TRIANGLE_STRIP:
+            output.second = IMeshManipulator::idxBufferFromTriangleStripsToTriangles(idxBufferToProcess->getPointer(), output.first, idxType, idxType);
+            return output;
+
+        case EPT_TRIANGLE_FAN:
+            output.second = IMeshManipulator::idxBufferFromTrianglesFanToTriangles(idxBufferToProcess->getPointer(), output.first, idxType, idxType);
+            return output;
+
+        //TODO: the packer should return when there is a mesh buffer with one of the following:
+        case EPT_TRIANGLE_LIST:
+        case EPT_POINT_LIST:
+        case EPT_LINE_LIST:
+        case EPT_LINE_STRIP:
+        case EPT_LINE_LIST_WITH_ADJACENCY:
+        case EPT_LINE_STRIP_WITH_ADJACENCY:
+        case EPT_TRIANGLE_LIST_WITH_ADJACENCY:
+        case EPT_TRIANGLE_STRIP_WITH_ADJACENCY:
+        case EPT_PATCH_LIST:
+        default:
+            assert(false);
+            return { 0u, nullptr };
+        }
+    }
+
+    IdxBufferParams createNewIdxBufferParamsForNonTriangleListTopologies(MeshBufferType* meshBuffer)
+    {
+        IdxBufferParams output;
+
+        const auto& mbPrimitiveType = meshBuffer->getPipeline()->getPrimitiveAssemblyParams().primitiveType;
+        if (mbPrimitiveType == EPT_TRIANGLE_LIST)
+        {
+            const auto& mbIdxBuff = meshBuffer->getIndexBufferBinding();
+            output.idxBuffer.offset = mbIdxBuff.offset;
+            output.idxBuffer.buffer = core::smart_refctd_ptr(mbIdxBuff.buffer);
+            output.idxType = meshBuffer->getIndexType();
+        }
+        else
+        {
+            auto newIdxBuffer = convertIdxBufferToTriangles(meshBuffer);
+            output.idxBuffer.offset = 0u;
+            output.idxBuffer.buffer = newIdxBuffer.second;
+            output.idxType = EIT_32BIT;
+        }
+
+        return output;
+    }
+
+protected:
+    template <typename BufferType>
+    struct PackerDataStoreCommon
+    {
+        static_assert(std::is_base_of<core::IBuffer, BufferType>::value);
+
+        core::smart_refctd_ptr<BufferType> MDIDataBuffer;
+
+        inline bool isValid()
+        {
+            return this->MDIDataBuffer->getPointer() != nullptr;
+        }
+    };
+
+};
+#endif
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
new file mode 100644
index 0000000000..55d1713619
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -0,0 +1,35 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+struct MeteringWindow
+{
+    using this_t = MeteringWindow;
+    float32_t2 meteringWindowScale;
+    float32_t2 meteringWindowOffset;
+
+    static this_t create(float32_t2 scale, float32_t2 offset) {
+        this_t retval;
+        retval.meteringWindowScale = scale;
+        retval.meteringWindowOffset = offset;
+        return retval;
+    }
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
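The `MeteringWindow` above is plain crop/offset state consumed by the meters in `luma_meter.hlsl` below. A minimal sketch of building one for a centered 80% crop (the free function is hypothetical, everything else comes from the header):

```hlsl
#include "nbl/builtin/hlsl/luma_meter/common.hlsl"

// Meter only the middle 80% of the screen: the scale shrinks the sampled UV
// range and the offset re-centers it so the crop stays symmetric.
nbl::hlsl::luma_meter::MeteringWindow makeCenteredWindow()
{
    return nbl::hlsl::luma_meter::MeteringWindow::create(
        nbl::hlsl::float32_t2(0.8f, 0.8f),  // meteringWindowScale
        nbl::hlsl::float32_t2(0.1f, 0.1f)); // meteringWindowOffset
}
```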
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
new file mode 100644
index 0000000000..20af804603
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -0,0 +1,287 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/luma_meter/common.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+struct geom_meter {
+    using float_t = typename SharedAccessor::type;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
+    {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
+        return retval;
+    }
+
+    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    {
+        return workgroup::reduction<plus<float_t>, GroupSize>::
+            template __call<SharedAccessor>(value, sdata);
+    }
+
+    float_t __computeLumaLog2(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    )
+    {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
+
+        return log2(luma);
+    }
+
+    void __uploadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
+        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+    }
+
+    float_t __downloadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        return luma / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(ValueAccessor) val,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    )
+    {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        uint32_t2 coord = {
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
+        };
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
+
+        if (tid == 0) {
+            __uploadFloat(
+                val,
+                lumaLog2Sum,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            );
+        }
+    }
+
+    float_t gatherLuma(
+        NBL_REF_ARG(ValueAccessor) val
+    )
+    {
+        uint32_t tid = glsl::gl_SubgroupInvocationID();
+        float_t luma = glsl::subgroupAdd(
+            __downloadFloat(
+                val,
+                tid,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            )
+        );
+
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
+    }
+
+    float_t sampleCount;
+    float_t2 lumaMinMax;
+};
+
+template<uint32_t GroupSize, uint32_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+struct median_meter {
+    using int_t = typename SharedAccessor::type;
+    using float_t = float32_t;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax) {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        return retval;
+    }
+
+    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+        return workgroup::inclusive_scan<plus<int_t>, GroupSize>::
+            template __call<SharedAccessor>(value, sdata);
+    }
+
+    float_t __computeLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    ) {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
+    }
+
+    int_t __float2Int(
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+    }
+
+    float_t __int2Float(
+        int_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        return val / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(vid, 0);
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t2 coord = {
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
+        };
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = __computeLuma(window, tex, shiftedCoord);
+
+        float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
+        uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
+
+        sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t histogram_value;
+        sdata.get(tid, histogram_value);
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t sum = __inclusive_scan(histogram_value, sdata);
+        histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        const bool is_last_wg_invocation = tid == (GroupSize - 1);
+        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
+
+        for (int i = 1; i < RoundedBinCount; i++) {
+            uint32_t keyBucketStart = GroupSize * i;
+            uint32_t vid = tid + keyBucketStart;
+
+            // no if statement about the last iteration needed
+            if (is_last_wg_invocation) {
+                float_t beforeSum;
+                sdata.get(keyBucketStart, beforeSum);
+                sdata.set(keyBucketStart, beforeSum + sum);
+            }
+
+            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
+            sdata.workgroupExecutionAndMemoryBarrier();
+
+            // no aliasing anymore
+            float_t atVid;
+            sdata.get(vid, atVid);
+            sum = __inclusive_scan(atVid, sdata);
+            if (vid < BinCount) {
+                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+            }
+        }
+    }
+
+    float_t gatherLuma(
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(SharedAccessor) sdata
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(
+                vid,
+                histo.get(vid & (BinCount - 1))
+            );
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t percentile40, percentile60;
+        sdata.get(BinCount * 0.4, percentile40);
+        sdata.get(BinCount * 0.6, percentile60);
+
+        return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+    }
+
+    float_t2 lumaMinMax;
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
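Both meters size their fixed-point accumulators from the dispatch: `32 - ceil(log2(workgroupCount)) + subgroupSizeLog2` bits remain usable, because the partial sums are spread across `2^subgroupSizeLog2` counters and each counter therefore only needs headroom for `workgroupCount / 2^subgroupSizeLog2` additions. A worked sketch of that arithmetic (standalone helper, the name is mine):

```hlsl
// For a 16x16x1 dispatch (256 workgroups) and a subgroup size of 32:
// 32 - ceil(log2(256)) + 5 = 32 - 8 + 5 = 29 bits stay usable per accumulator,
// since each of the 2^5 = 32 counters receives only 256/32 = 8 partial sums
// and thus needs just log2(8) = 3 bits of overflow headroom.
uint32_t fixedPointBitsLeft(nbl::hlsl::uint32_t3 workGroupCount, uint32_t subgroupSizeLog2)
{
    const uint32_t wgTotal = workGroupCount.x * workGroupCount.y * workGroupCount.z;
    return 32u - uint32_t(ceil(log2(float(wgTotal)))) + subgroupSizeLog2;
}
```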
diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
new file mode 100644
index 0000000000..c0769fc88b
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -0,0 +1,160 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+
+#ifdef __HLSL_VERSION
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#else
+#include <cstdint>
+#endif
+
+namespace nbl
+{
+namespace hlsl
+{
+
+namespace impl
+{
+
+template<typename T>
+NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n)
+{
+    const static uint64_t mask[5] =
+    {
+        0x5555555555555555ull,
+        0x3333333333333333ull,
+        0x0F0F0F0F0F0F0F0Full,
+        0x00FF00FF00FF00FFull,
+        0x0000FFFF0000FFFFull
+    };
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
+}
+
+template<typename T>
+NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n)
+{
+    const static uint64_t mask[5] =
+    {
+        0x1249249249249249ull,
+        0x10C30C30C30C30C3ull,
+        0x100F00F00F00F00Full,
+        0x001F0000FF0000FFull,
+        0x001F00000000FFFFull
+    };
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
+}
+template<typename T>
+NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n)
+{
+    const static uint64_t mask[4] =
+    {
+        0x1111111111111111ull,
+        0x0303030303030303ull,
+        0x000F000F000F000Full,
+        0x000000FF000000FFull
+    };
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
+}
+
+template<typename T, uint16_t bitDepth>
+inline T morton2d_decode(T x)
+{
+    x = x & morton2d_mask<T>(0);
+    x = (x | (x >> 1)) & morton2d_mask<T>(1);
+    x = (x | (x >> 2)) & morton2d_mask<T>(2);
+    if (bitDepth > 8u)
+    {
+        x = (x | (x >> 4)) & morton2d_mask<T>(3);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x >> 8)) & morton2d_mask<T>(4);
+    }
+    if (bitDepth > 32u)
+    {
+        x = (x | (x >> 16));
+    }
+    return x;
+}
+
+//! Puts bits on even positions filling gaps with 0s
+template<typename T, uint16_t bitDepth>
+inline T separate_bits_2d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 16)) & morton2d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 8)) & morton2d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 4)) & morton2d_mask<T>(2);
+    }
+    x = (x | (x << 2)) & morton2d_mask<T>(1);
+    x = (x | (x << 1)) & morton2d_mask<T>(0);
+
+    return x;
+}
+template<typename T, uint16_t bitDepth>
+inline T separate_bits_3d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 32)) & morton3d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 16)) & morton3d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 8)) & morton3d_mask<T>(2);
+    }
+    x = (x | (x << 4)) & morton3d_mask<T>(1);
+    x = (x | (x << 2)) & morton3d_mask<T>(0);
+
+    return x;
+}
+template<typename T, uint16_t bitDepth>
+inline T separate_bits_4d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 24)) & morton4d_mask<T>(3);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 12)) & morton4d_mask<T>(2);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 6)) & morton4d_mask<T>(1);
+    }
+    x = (x | (x << 3)) & morton4d_mask<T>(0);
+
+    return x;
+}
+}
+
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }
+
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
+
+}
+}
+
+#endif
\ No newline at end of file
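A quick round-trip sketch of the API above (assuming the `bitDepth` template defaults as reconstructed, the helper function itself is hypothetical):

```hlsl
#include "nbl/builtin/hlsl/math/morton.hlsl"

// Interleave x into even and y into odd bits, then recover both coordinates.
// For x = 3 (0b011) and y = 5 (0b101) the code is 0b100111 = 39.
nbl::hlsl::uint32_t2 mortonRoundTrip(uint32_t x, uint32_t y)
{
    const uint32_t code = nbl::hlsl::morton2d_encode(x, y);
    return nbl::hlsl::uint32_t2(
        nbl::hlsl::morton2d_decode_x(code),  // -> x
        nbl::hlsl::morton2d_decode_y(code)); // -> y
}
```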
diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index a7614469dd..97077dffdd 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -140,37 +140,45 @@ enable_if_t<is_spirv_type_v<Ptr_U>/* && !is_same_v<Ptr_T,Ptr_U>*/,T> copyLogical(Ptr_U v);
 
 // Here's the thing with atomics, it's not only the data type that dictates whether you can do an atomic or not.
 // It's the storage class that has the most effect (shared vs storage vs image) and we can't check that easily
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
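The added annotations matter because DXC only emits `OpCapability` declarations that the intrinsics carry; an atomic on memory reached through buffer device address needs `PhysicalStorageBufferAddresses`, and the 64-bit overloads additionally need `Int64Atomics`. A hypothetical call site (the function is mine, and the scope/semantics constants are only illustrative values from the SPIR-V headers this file relies on):

```hlsl
#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"

// Bump a device-scope 64-bit counter; `counterRef` stands for any uint64_t
// l-value living in a storage buffer. Relies on both capabilities declared above.
uint64_t bumpCounter(inout uint64_t counterRef)
{
    return nbl::hlsl::spirv::atomicIAdd<uint64_t>(
        counterRef, spv::ScopeDevice, spv::MemorySemanticsMaskNone, uint64_t(1));
}
```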
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
new file mode 100644
index 0000000000..46d241c76c
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -0,0 +1,106 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace tonemapper
+{
+
+template<typename T>
+struct Reinhard
+{
+    using float_t = enable_if_t<is_floating_point<T>::value, T>;
+    using float_t3 = vector<float_t, 3>;
+    using this_t = Reinhard<T>;
+
+    static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
+    {
+        this_t retval;
+
+        const float_t unit = 1.0;
+        retval.keyAndManualLinearExposure = key * exp2(EV);
+        retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV);
+
+        return retval;
+    }
+
+    float_t3 operator()(float_t3 rawCIEXYZcolor) {
+        const float_t unit = 1.0;
+        float_t exposureFactors = keyAndManualLinearExposure;
+        float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
+        float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma));
+        return rawCIEXYZcolor * colorMultiplier;
+    }
+
+    float_t keyAndManualLinearExposure;
+    float_t rcpWhite2;
+};
+
+template<typename T>
+struct ACES
+{
+    using float_t = enable_if_t<is_floating_point<T>::value, T>;
+    using float_t3 = vector<float_t, 3>;
+    using float_t3x3 = matrix<float_t, 3, 3>;
+
+    using this_t = ACES<T>;
+    static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
+        this_t retval;
+        retval.gamma = Contrast;
+        const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
+        retval.exposure = EV + log2(key * reinhardMatchCorrection);
+        return retval;
+    }
+
+    float_t3 operator()(float_t3 rawCIEXYZcolor) {
+        const float_t unit = 1.0;
+        float_t3 tonemapped = rawCIEXYZcolor;
+        if (tonemapped.y > bit_cast<float_t>(numeric_limits<float_t>::min))
+            tonemapped *= exp2(log2(tonemapped.y) * (gamma - unit) + (exposure) * gamma);
+
+        // XYZ => RRT_SAT
+        // this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
+        const float_t3x3 XYZ_RRT_Input = float_t3x3(
+            float_t3(1.594168310, -0.262608051, -0.231993079),
+            float_t3(-0.6332771780, 1.5840380200, 0.0164147373),
+            float_t3(0.00892840419, 0.03648501260, 0.87711471300)
+        );
+
+        // this is obviously fitted to some particular simulated sensor/film and display
+        float_t3 v = mul(XYZ_RRT_Input, tonemapped);
+        float_t3 a = v * (v + promote<float_t3>(0.0245786)) - promote<float_t3>(0.000090537);
+        float_t3 b = v * (v * promote<float_t3>(0.983729) + promote<float_t3>(0.4329510)) + promote<float_t3>(0.238081);
+        v = a / b;
+
+        // ODT_SAT => XYZ
+        // this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB, BT2020 and ACEScc(t)
+        const float_t3x3 ODT_XYZ_Output = float_t3x3(
+            float_t3(0.624798000, 0.164064825, 0.161605373),
+            float_t3(0.268048108, 0.674283803, 0.057667464),
+            float_t3(0.0157514643, 0.0526682511, 1.0204007600)
+        );
+        return mul(ODT_XYZ_Output, v);
+    }
+
+    float_t gamma; // 1.0
+    float_t exposure; // actualExposure+midGrayLog2
+};
+
+// ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php
+// or get proper ACES RRT and ODTs
+// https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use-
+
+}
+}
+}
+
+#endif
\ No newline at end of file
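Both operators take and return CIE XYZ, so a caller tone-maps first and converts to the display colorspace afterwards. A minimal sketch at EV 0 with the default key and white point (the wrapper function is hypothetical):

```hlsl
#include "nbl/builtin/hlsl/tonemapper/operators.hlsl"

nbl::hlsl::float32_t3 tonemapXYZ(nbl::hlsl::float32_t3 rawCIEXYZ)
{
    using reinhard_t = nbl::hlsl::tonemapper::Reinhard<nbl::hlsl::float32_t>;
    reinhard_t op = reinhard_t::create(0.f); // key = 0.18, white point 16 EV above it
    return op(rawCIEXYZ); // still CIE XYZ, convert for display afterwards
}
```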
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index e8798499f9..74b91f05c6 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -27,6 +27,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/struct_declare.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
+# luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
+# tonemapper
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/legacy_bda_accessor.hlsl")
 # bump mapping
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl`
@@ -231,6 +236,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl")
 #extra math
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl")
+#morton
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/morton.hlsl")
 #acceleration structures
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl")
 #colorspace