diff --git a/examples_tests b/examples_tests
index e5d5ae2ca9..2949212cba 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit e5d5ae2ca9137a6966d00aa039f3e6dae7c23fb9
+Subproject commit 2949212cbacb5e31ea213808c545e93df91d9a59
diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h
new file mode 100644
index 0000000000..355d792782
--- /dev/null
+++ b/include/nbl/asset/utils/IMeshPacker.h
@@ -0,0 +1,636 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__
+#define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__
+
+#include "nbl/asset/utils/IMeshManipulator.h"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+
+namespace nbl
+{
+namespace asset
+{
+
+class IMeshPackerBase : public virtual core::IReferenceCounted
+{
+    public:
+        constexpr static uint32_t MAX_TRIANGLES_IN_BATCH_CNT = 21845u;
+
+        struct ReservedAllocationMeshBuffersBase
+        {
+            uint32_t mdiAllocationOffset;
+            uint32_t mdiAllocationReservedCnt;
+            uint32_t indexAllocationOffset;
+            uint32_t indexAllocationReservedCnt;
+
+            inline bool isValid()
+            {
+                return this->mdiAllocationOffset!=core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address;
+            }
+        };
+        struct PackedMeshBufferData
+        {
+            uint32_t mdiParameterOffset; // add to `CCPUMeshPacker::getMultiDrawIndirectBuffer()->getPointer()` to get `DrawElementsIndirectCommand_t` address
+            uint32_t mdiParameterCount;
+
+            inline bool isValid()
+            {
+                return this->mdiParameterOffset != core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address;
+            }
+        };
+
+        inline uint16_t getMinTriangleCountPerMDI() const { return m_minTriangleCountPerMDIData; }
+        inline uint16_t getMaxTriangleCountPerMDI() const { return m_maxTriangleCountPerMDIData; }
+
+    protected:
+        using alctrTraits = core::address_allocator_traits<core::GeneralpurposeAddressAllocator<uint32_t>>;
+
+        IMeshPackerBase(uint16_t minTriangleCountPerMDIData, uint16_t maxTriangleCountPerMDIData)
+            :m_maxTriangleCountPerMDIData(maxTriangleCountPerMDIData),
+             m_minTriangleCountPerMDIData(minTriangleCountPerMDIData)
+        {
+            assert(minTriangleCountPerMDIData <= MAX_TRIANGLES_IN_BATCH_CNT);
+            assert(maxTriangleCountPerMDIData <= MAX_TRIANGLES_IN_BATCH_CNT);
+            assert(minTriangleCountPerMDIData <= maxTriangleCountPerMDIData);
+            assert(minTriangleCountPerMDIData > 0u);
+            assert(maxTriangleCountPerMDIData > 0u);
+        };
+
+        virtual ~IMeshPackerBase()
+        {
+            _NBL_ALIGNED_FREE(const_cast<void*>(alctrTraits::getReservedSpacePtr(m_MDIDataAlctr)));
+            _NBL_ALIGNED_FREE(const_cast<void*>(alctrTraits::getReservedSpacePtr(m_idxBuffAlctr)));
+            _NBL_ALIGNED_FREE(const_cast<void*>(alctrTraits::getReservedSpacePtr(m_vtxBuffAlctr)));
+        }
+
+        struct AllocationParamsCommon
+        {
+            // Maximum number of 16 bit indices that may be allocated
+            size_t indexBuffSupportedCnt = 67108864ull; /* 128MB */
+
+            /* Maximum byte size for vertex data allocation
+               For `CCPUMeshPackerV1` this will be the maximum byte size of a buffer containing only attributes with EVIR_PER_VERTEX input rate.
+               For `CCPUMeshPackerV2` this will be the maximum byte size of a buffer containing attributes with both EVIR_PER_VERTEX and EVIR_PER_INSTANCE input rates.
+            */
+            size_t vertexBuffSupportedByteSize = 134217728ull; /* 128MB */
+
+            // Maximum number of MDI structs that may be allocated
+            size_t MDIDataBuffSupportedCnt = 16777216ull; /* 16MB assuming MDIStructType is DrawElementsIndirectCommand_t */
+
+            // Minimum count of 16 bit indices allocated per allocation
+            size_t indexBufferMinAllocCnt = 256ull;
+
+            // Minimum bytes of vertex data allocated per allocation
+            size_t vertexBufferMinAllocByteSize = 32ull;
+
+            // Minimum count of MDI structs allocated per allocation
+            size_t MDIDataBuffMinAllocCnt = 32ull;
+        };
+
+        void initializeCommonAllocators(const AllocationParamsCommon& allocParams)
+        {
+            if (allocParams.indexBuffSupportedCnt)
+            {
+                void* resSpcTmp = _NBL_ALIGNED_MALLOC(core::GeneralpurposeAddressAllocator<uint32_t>::reserved_size(alignof(uint16_t), allocParams.indexBuffSupportedCnt, allocParams.indexBufferMinAllocCnt), _NBL_SIMD_ALIGNMENT);
+                assert(resSpcTmp != nullptr);
+                m_idxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(resSpcTmp, 0u, 0u, alignof(uint16_t), allocParams.indexBuffSupportedCnt, allocParams.indexBufferMinAllocCnt);
+            }
+
+            if (allocParams.vertexBuffSupportedByteSize)
+            {
+                void* resSpcTmp = _NBL_ALIGNED_MALLOC(core::GeneralpurposeAddressAllocator<uint32_t>::reserved_size(32u, allocParams.vertexBuffSupportedByteSize, allocParams.vertexBufferMinAllocByteSize), _NBL_SIMD_ALIGNMENT);
+                assert(resSpcTmp != nullptr);
+                m_vtxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(resSpcTmp, 0u, 0u, 32u, allocParams.vertexBuffSupportedByteSize, allocParams.vertexBufferMinAllocByteSize);
+            }
+
+            if (allocParams.MDIDataBuffSupportedCnt)
+            {
+                void* resSpcTmp = _NBL_ALIGNED_MALLOC(core::GeneralpurposeAddressAllocator<uint32_t>::reserved_size(alignof(std::max_align_t), allocParams.MDIDataBuffSupportedCnt, allocParams.MDIDataBuffMinAllocCnt), _NBL_SIMD_ALIGNMENT);
+                assert(resSpcTmp != nullptr);
+                m_MDIDataAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(resSpcTmp, 0u, 0u, alignof(std::max_align_t), allocParams.MDIDataBuffSupportedCnt, allocParams.MDIDataBuffMinAllocCnt);
+            }
+        }
+
+        void initializeCommonAllocators(
+            const core::GeneralpurposeAddressAllocator<uint32_t>& mdiAlctr,
+            const core::GeneralpurposeAddressAllocator<uint32_t>& idxAlctr,
+            const core::GeneralpurposeAddressAllocator<uint32_t>& vtxAlctr
+        )
+        {
+            uint32_t alctrBuffSz = alctrTraits::get_total_size(mdiAlctr);
+            void* resSpcTmp = _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(alctrBuffSz, mdiAlctr), _NBL_SIMD_ALIGNMENT);
+            m_MDIDataAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(alctrBuffSz, mdiAlctr, resSpcTmp);
+
+            alctrBuffSz = alctrTraits::get_total_size(idxAlctr);
+            resSpcTmp = _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(alctrBuffSz, idxAlctr), _NBL_SIMD_ALIGNMENT);
+            m_idxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(alctrBuffSz, idxAlctr, resSpcTmp);
+
+            alctrBuffSz = alctrTraits::get_total_size(vtxAlctr);
+            resSpcTmp = _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(alctrBuffSz, vtxAlctr), _NBL_SIMD_ALIGNMENT);
+            m_vtxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(alctrBuffSz, vtxAlctr, resSpcTmp);
+        }
+
+        void free(const ReservedAllocationMeshBuffersBase& rambb)
+        {
+            if (rambb.indexAllocationOffset != INVALID_ADDRESS)
+                m_idxBuffAlctr.free_addr(rambb.indexAllocationOffset,rambb.indexAllocationReservedCnt);
+
+            if (rambb.mdiAllocationOffset != INVALID_ADDRESS)
+                m_MDIDataAlctr.free_addr(rambb.mdiAllocationOffset,rambb.mdiAllocationReservedCnt);
+        }
+
+        //
+        _NBL_STATIC_INLINE_CONSTEXPR uint32_t INVALID_ADDRESS = core::GeneralpurposeAddressAllocator<uint32_t>::invalid_address;
+
+        core::GeneralpurposeAddressAllocator<uint32_t> m_vtxBuffAlctr;
+        core::GeneralpurposeAddressAllocator<uint32_t> m_idxBuffAlctr;
+        core::GeneralpurposeAddressAllocator<uint32_t> m_MDIDataAlctr;
+
+        const uint16_t m_minTriangleCountPerMDIData;
+        const uint16_t m_maxTriangleCountPerMDIData;
+
+};
+
+#if 0 // REWRITE
+template <typename MeshBufferType, typename MDIStructType = DrawElementsIndirectCommand_t>
+class IMeshPacker : public IMeshPackerBase
+{
+    static_assert(std::is_base_of<DrawElementsIndirectCommand_t, MDIStructType>::value);
+
+public:
+    /*
+    @param minTriangleCountPerMDIData must be <= 21845
+    @param maxTriangleCountPerMDIData must be <= 21845
+    */
+    IMeshPacker(uint16_t minTriangleCountPerMDIData, uint16_t maxTriangleCountPerMDIData)
+        :IMeshPackerBase(minTriangleCountPerMDIData, maxTriangleCountPerMDIData)
+    {
+    }
+
+    //! shrinks byte size of all output buffers, so they are large enough to fit currently allocated contents. Call this function before `instantiateDataStorage`
+    virtual void shrinkOutputBuffersSize()
+    {
+        uint32_t mdiDataBuffNewSize = m_MDIDataAlctr.safe_shrink_size(0u, alctrTraits::max_alignment(m_MDIDataAlctr));
+        uint32_t idxBuffNewSize = m_idxBuffAlctr.safe_shrink_size(0u, alctrTraits::max_alignment(m_idxBuffAlctr));
+        uint32_t vtxBuffNewSize = m_vtxBuffAlctr.safe_shrink_size(0u, alctrTraits::max_alignment(m_vtxBuffAlctr));
+
+        const void* oldReserved = alctrTraits::getReservedSpacePtr(m_MDIDataAlctr);
+        m_MDIDataAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(mdiDataBuffNewSize, std::move(m_MDIDataAlctr), _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(mdiDataBuffNewSize, m_MDIDataAlctr), _NBL_SIMD_ALIGNMENT));
+        _NBL_ALIGNED_FREE(const_cast<void*>(oldReserved));
+
+        oldReserved = alctrTraits::getReservedSpacePtr(m_idxBuffAlctr);
+        m_idxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(idxBuffNewSize, std::move(m_idxBuffAlctr), _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(idxBuffNewSize, m_idxBuffAlctr), _NBL_SIMD_ALIGNMENT));
+        _NBL_ALIGNED_FREE(const_cast<void*>(oldReserved));
+
+        oldReserved = alctrTraits::getReservedSpacePtr(m_vtxBuffAlctr);
+        m_vtxBuffAlctr = core::GeneralpurposeAddressAllocator<uint32_t>(vtxBuffNewSize, std::move(m_vtxBuffAlctr), _NBL_ALIGNED_MALLOC(alctrTraits::reserved_size(vtxBuffNewSize, m_vtxBuffAlctr), _NBL_SIMD_ALIGNMENT));
+        _NBL_ALIGNED_FREE(const_cast<void*>(oldReserved));
+    }
+
+    //! Returns the maximum number of MDI structs needed to draw the range of mesh buffers described by mbBegin .. mbEnd, the actual number of MDI structs needed may differ
+    template <typename MeshBufferIterator>
+    uint32_t calcMDIStructMaxCount(const MeshBufferIterator mbBegin, const MeshBufferIterator mbEnd)
+    {
+        uint32_t acc = 0u;
+        for (auto mbIt = mbBegin; mbIt != mbEnd; mbIt++)
+        {
+            auto mb = *mbIt;
+            const size_t idxCnt = calcIdxCntAfterConversionToTriangleList(mb);
+            const uint32_t triCnt = idxCnt / 3;
+            assert(idxCnt % 3 == 0);
+
+            acc += calcBatchCountBound(triCnt);
+        }
+
+        return acc;
+    }
+
+protected:
+    virtual ~IMeshPacker() {}
+
+    static inline size_t calcVertexSize(const SVertexInputParams& vtxInputParams, const E_VERTEX_INPUT_RATE inputRate)
+    {
+        size_t size = 0ull;
+        for (size_t i = 0; i < SVertexInputParams::MAX_VERTEX_ATTRIB_COUNT; ++i)
+        {
+            if (vtxInputParams.enabledAttribFlags & (1u << i))
+                if (vtxInputParams.bindings[i].inputRate == inputRate)
+                    size += asset::getTexelOrBlockBytesize(static_cast<E_FORMAT>(vtxInputParams.attributes[i].format));
+        }
+
+        return size;
+    }
+
+    static inline uint32_t calcVertexCountBoundWithBatchDuplication(const MeshBufferType* meshBuffer)
+    {
+        uint32_t triCnt;
+        if (IMeshManipulator::getPolyCount(triCnt,meshBuffer))
+            return triCnt * 3u;
+        return 0u;
+    }
+
+    inline uint32_t calcBatchCountBound(uint32_t triCnt) const
+    {
+        if (triCnt!=0u)
+            return (triCnt-1u)/m_minTriangleCountPerMDIData+1u;
+        return 0u;
+    }
+
+    struct Triangle
+    {
+        uint32_t oldIndices[3];
+    };
+
+    struct TriangleBatches
+    {
+        TriangleBatches(uint32_t triCnt)
+        {
+            triangles = core::vector<Triangle>(triCnt);
+        }
+
+        core::vector<Triangle> triangles;
+        core::vector<Triangle*> ranges;
+    };
+
+    struct IdxBufferParams
+    {
+        SBufferBinding<ICPUBuffer> idxBuffer = { 0u, nullptr };
+        E_INDEX_TYPE idxType = EIT_UNKNOWN;
+    };
+
+    //TODO: functions: constructTriangleBatches, convertIdxBufferToTriangles, deinterleaveAndCopyAttribute and deinterleaveAndCopyPerInstanceAttribute
+    //will not work with IGPUMeshBuffer as MeshBufferType, move them to a new `ICPUMeshPacker`
+
+    TriangleBatches constructTriangleBatches(const MeshBufferType* meshBuffer, IdxBufferParams idxBufferParams, core::aabbox3df*& aabbs) const
+    {
+        uint32_t triCnt;
+        const bool success = IMeshManipulator::getPolyCount(triCnt,meshBuffer);
+        assert(success);
+
+        const uint32_t batchCnt = calcBatchCountBound(triCnt);
+        assert(batchCnt != 0u);
+
+        struct MortonTriangle
+        {
+            MortonTriangle() = default;
+
+            MortonTriangle(uint16_t fixedPointPos[3], float area)
+            {
+                auto tmp = reinterpret_cast<uint16_t*>(&key);
+                std::copy_n(fixedPointPos,3u,tmp);
+                tmp[3] = core::Float16Compressor::compress(area);
+            }
+
+            void complete(float maxArea)
+            {
+                auto tmp = reinterpret_cast<uint16_t*>(&key);
+                const float area = core::Float16Compressor::decompress(tmp[3]);
+                const float scale = 0.5f; // square root
+                uint16_t logRelArea = uint16_t(65535.5f+core::clamp(scale*std::log2f(area/maxArea),-65535.5f,0.f));
+                key = core::morton4d_encode(tmp[0],tmp[1],tmp[2],logRelArea);
+            }
+
+            uint64_t key;
+        };
+
+        //TODO: use SoA instead (with core::radix_sort):
+        //core::vector<Triangle> triangles;
+        //core::vector<uint64_t> triangleMortonCodes;
+        //where `triangles` is a member of the `TriangleBatch` struct
+        struct TriangleMortonCodePair
+        {
+            Triangle triangle;
+            MortonTriangle mortonCode;
+
+            inline bool operator<(const TriangleMortonCodePair& other)
+            {
+                return this->mortonCode.key < other.mortonCode.key;
+            }
+        };
+
+        TriangleBatches triangleBatches(triCnt);
+        core::vector<TriangleMortonCodePair> triangles(triCnt); //#1
+
+        core::smart_refctd_ptr<MeshBufferType> mbTmp = core::smart_refctd_ptr_static_cast<MeshBufferType>(meshBuffer->clone());
+        mbTmp->setIndexBufferBinding(std::move(idxBufferParams.idxBuffer));
+        mbTmp->setIndexType(idxBufferParams.idxType);
+        mbTmp->getPipeline()->getPrimitiveAssemblyParams().primitiveType = EPT_TRIANGLE_LIST;
+
+        //triangle reordering
+        {
+            const core::aabbox3df aabb = IMeshManipulator::calculateBoundingBox(mbTmp.get());
+
+            uint32_t ix = 0u;
+            float maxTriangleArea = 0.0f;
+            for (auto it = triangles.begin(); it != triangles.end(); it++)
+            {
+                auto triangleIndices = IMeshManipulator::getTriangleIndices(mbTmp.get(), ix++);
+                //have to copy here
+                std::copy(triangleIndices.begin(), triangleIndices.end(), it->triangle.oldIndices);
+
+                core::vectorSIMDf trianglePos[3];
+                trianglePos[0] = mbTmp->getPosition(it->triangle.oldIndices[0]);
+                trianglePos[1] = mbTmp->getPosition(it->triangle.oldIndices[1]);
+                trianglePos[2] = mbTmp->getPosition(it->triangle.oldIndices[2]);
+
+                const core::vectorSIMDf centroid = ((trianglePos[0] + trianglePos[1] + trianglePos[2]) / 3.0f) - core::vectorSIMDf(aabb.MinEdge.X, aabb.MinEdge.Y, aabb.MinEdge.Z);
+                uint16_t fixedPointPos[3];
+                fixedPointPos[0] = uint16_t(centroid.x * 65535.5f / aabb.getExtent().X);
+                fixedPointPos[1] = uint16_t(centroid.y * 65535.5f / aabb.getExtent().Y);
+                fixedPointPos[2] = uint16_t(centroid.z * 65535.5f / aabb.getExtent().Z);
+
+                float area = core::cross(trianglePos[1] - trianglePos[0], trianglePos[2] - trianglePos[0]).x;
+                it->mortonCode = MortonTriangle(fixedPointPos, area);
+
+                if (area > maxTriangleArea)
+                    maxTriangleArea = area;
+            }
+
+            //complete morton code
+            for (auto it = triangles.begin(); it != triangles.end(); it++)
+                it->mortonCode.complete(maxTriangleArea);
+
+            std::sort(triangles.begin(), triangles.end());
+        }
+
+        //copying, after radix_sort this will be removed
+        //TODO during radix_sort integration:
+        //since there will be distinct arrays for triangles and their morton codes use `triangleBatches.triangles` instead of #1
+        for (uint32_t i = 0u; i < triCnt; i++)
+            triangleBatches.triangles[i] = triangles[i].triangle;
+
+        //set ranges
+        Triangle* triangleArrayBegin = triangleBatches.triangles.data();
+        Triangle* triangleArrayEnd = triangleArrayBegin + triangleBatches.triangles.size();
+        const uint32_t triangleCnt = triangleBatches.triangles.size();
+
+        //aabb batch division
+        {
+            triangleBatches.ranges.push_back(triangleArrayBegin);
+            for (auto nextTriangle = triangleArrayBegin; nextTriangle < triangleArrayEnd; )
+            {
+                const Triangle* batchBegin = *(triangleBatches.ranges.end() - 1u);
+                const Triangle* batchEnd = batchBegin + m_minTriangleCountPerMDIData;
+
+                //find min and max edge
+                core::vector3df_SIMD min(std::numeric_limits<float>::max());
+                core::vector3df_SIMD max(-std::numeric_limits<float>::max());
+
+                auto extendAABB = [&min, &max, &meshBuffer](auto triangleIt) -> void
+                {
+                    for (uint32_t i = 0u; i < 3u; i++)
+                    {
+                        auto vxPos = meshBuffer->getPosition(triangleIt->oldIndices[i]);
+                        min = core::min(vxPos, min);
+                        max = core::max(vxPos, max);
+                    }
+                };
+
+                for (uint32_t i = 0u; i < m_minTriangleCountPerMDIData && nextTriangle != triangleArrayEnd; i++)
+                    extendAABB(nextTriangle++);
+
+                auto halfAreaAABB = [&min, &max]() -> float
+                {
+                    auto extent = max - min;
+                    return extent.x * extent.y + extent.x * extent.z + extent.y * extent.z;
+                };
+
+                constexpr float kGrowthLimit = 1.025f;
+                float batchArea = halfAreaAABB();
+                for (uint16_t i = m_minTriangleCountPerMDIData; nextTriangle != triangleArrayEnd && i < m_maxTriangleCountPerMDIData; i++)
+                {
+                    if (aabbs)
+                        *aabbs = core::aabbox3df(core::vector3df(min.x, min.y, min.z), core::vector3df(max.x, max.y, max.z));
+
+                    extendAABB(nextTriangle);
+                    float newBatchArea = halfAreaAABB();
+                    if (newBatchArea > kGrowthLimit*batchArea)
+                        break;
+                    nextTriangle++;
+                    batchArea = newBatchArea;
+                }
+
+                if (aabbs)
+                {
+                    if (nextTriangle == triangleArrayEnd || m_minTriangleCountPerMDIData == m_maxTriangleCountPerMDIData)
+                        *aabbs = core::aabbox3df(core::vector3df(min.x, min.y, min.z), core::vector3df(max.x, max.y, max.z));
+                    aabbs++;
+                }
+
+                triangleBatches.ranges.push_back(nextTriangle);
+            }
+
+        }
+
+        return triangleBatches;
+    }
+
+    static core::unordered_map<uint32_t, uint16_t> constructNewIndicesFromTriangleBatchAndUpdateUnifiedIndexBuffer(TriangleBatches& batches, uint32_t batchIdx, uint16_t*& indexBuffPtr)
+    {
+        core::unordered_map<uint32_t, uint16_t> usedVertices;
+        core::vector<Triangle> newIdxTris = batches.triangles;
+
+        auto batchBegin = batches.ranges[batchIdx];
+        auto batchEnd = batches.ranges[batchIdx + 1];
+
+        const uint32_t triangleInBatchCnt = std::distance(batchBegin, batchEnd);
+        const uint32_t idxInBatchCnt = 3u * triangleInBatchCnt;
+
+        uint32_t newIdx = 0u;
+        for (uint32_t i = 0u; i < triangleInBatchCnt; i++)
+        {
+            const Triangle* const triangle = batchBegin + i;
+            for (int32_t j = 0; j < 3; j++)
+            {
+                const uint32_t oldIndex = triangle->oldIndices[j];
+                auto result = usedVertices.insert(std::make_pair(oldIndex, newIdx));
+
+                newIdxTris[i].oldIndices[j] = result.second ? newIdx++ : result.first->second;
+            }
+        }
+
+        //TODO: cache optimization
+        //copy indices into the unified index buffer
+        for (size_t i = 0; i < triangleInBatchCnt; i++)
+        {
+            for (int j = 0; j < 3; j++)
+            {
+                *indexBuffPtr = newIdxTris[i].oldIndices[j];
+                indexBuffPtr++;
+            }
+        }
+
+        return usedVertices;
+    }
+
+    static void deinterleaveAndCopyAttribute(MeshBufferType* meshBuffer, uint16_t attrLocation, const core::unordered_map<uint32_t, uint16_t>& usedVertices, uint8_t* dstAttrPtr)
+    {
+        const uint8_t* const srcAttrPtr = meshBuffer->getAttribPointer(attrLocation);
+        SVertexInputParams& mbVtxInputParams = meshBuffer->getPipeline()->getVertexInputParams();
+        SVertexInputAttribParams MBAttrib = mbVtxInputParams.attributes[attrLocation];
+        SVertexInputBindingParams attribBinding = mbVtxInputParams.bindings[MBAttrib.binding];
+        const size_t attrSize = asset::getTexelOrBlockBytesize(static_cast<E_FORMAT>(MBAttrib.format));
+        const size_t stride = (attribBinding.stride) == 0 ? attrSize : attribBinding.stride;
+
+        for (auto index : usedVertices)
+        {
+            const uint8_t* attrSrc = srcAttrPtr + (index.first * stride);
+            uint8_t* attrDest = dstAttrPtr + (index.second * attrSize);
+            memcpy(attrDest, attrSrc, attrSize);
+        }
+    }
+
+    static void deinterleaveAndCopyPerInstanceAttribute(MeshBufferType* meshBuffer, uint16_t attrLocation, uint8_t* dstAttrPtr)
+    {
+        const uint8_t* const srcAttrPtr = meshBuffer->getAttribPointer(attrLocation);
+        SVertexInputParams& mbVtxInputParams = meshBuffer->getPipeline()->getVertexInputParams();
+        SVertexInputAttribParams MBAttrib = mbVtxInputParams.attributes[attrLocation];
+        SVertexInputBindingParams attribBinding = mbVtxInputParams.bindings[MBAttrib.binding];
+        const size_t attrSize = asset::getTexelOrBlockBytesize(static_cast<E_FORMAT>(MBAttrib.format));
+        const size_t stride = (attribBinding.stride) == 0 ? attrSize : attribBinding.stride;
+
+        const uint32_t insCnt = meshBuffer->getInstanceCount();
+        for (uint32_t i = 0u; i < insCnt; i++)
+        {
+            const uint8_t* attrSrc = srcAttrPtr + (i * stride);
+            uint8_t* attrDest = dstAttrPtr + (i * attrSize);
+            memcpy(attrDest, attrSrc, attrSize);
+        }
+    }
+
+    inline uint32_t calcIdxCntAfterConversionToTriangleList(const MeshBufferType* meshBuffer)
+    {
+        const auto& params = meshBuffer->getPipeline()->getPrimitiveAssemblyParams();
+
+        switch (params.primitiveType)
+        {
+        case EPT_TRIANGLE_LIST:
+        case EPT_TRIANGLE_STRIP:
+        case EPT_TRIANGLE_FAN:
+            break;
+        case EPT_POINT_LIST:
+        case EPT_LINE_LIST:
+        case EPT_LINE_STRIP:
+        case EPT_LINE_LIST_WITH_ADJACENCY:
+        case EPT_LINE_STRIP_WITH_ADJACENCY:
+        case EPT_TRIANGLE_LIST_WITH_ADJACENCY:
+        case EPT_TRIANGLE_STRIP_WITH_ADJACENCY:
+        case EPT_PATCH_LIST:
+        default:
+            assert(false);
+            break;
+        }
+
+        uint32_t triCnt;
+        const bool success = IMeshManipulator::getPolyCount(triCnt, meshBuffer);
+        assert(success);
+
+        return triCnt * 3;
+    }
+    inline uint32_t calcIdxCntAfterConversionToTriangleList(const core::smart_refctd_ptr<MeshBufferType>& meshBuffer)
+    {
+        return calcIdxCntAfterConversionToTriangleList(meshBuffer.get());
+    }
+    inline uint32_t calcIdxCntAfterConversionToTriangleList(const core::smart_refctd_ptr<const MeshBufferType>& meshBuffer)
+    {
+        return calcIdxCntAfterConversionToTriangleList(meshBuffer.get());
+    }
+
+    std::pair<uint32_t, core::smart_refctd_ptr<ICPUBuffer>> convertIdxBufferToTriangles(MeshBufferType* meshBuffer)
+    {
+        const auto mbIdxBuffer = meshBuffer->getIndexBufferBinding().buffer;
+        E_INDEX_TYPE idxType = meshBuffer->getIndexType();
+        const uint32_t idxCount = meshBuffer->getIndexCount();
+        if (idxCount == 0)
+            return { 0u, nullptr };
+
+        const bool iota = idxType == EIT_UNKNOWN || !mbIdxBuffer;
+        core::smart_refctd_ptr<ICPUBuffer> idxBufferToProcess;
+        if (iota)
+        {
+            idxBufferToProcess = core::make_smart_refctd_ptr<ICPUBuffer>(sizeof(uint32_t) * idxCount);
+            auto ptr = reinterpret_cast<uint32_t*>(idxBufferToProcess->getPointer());
+            std::iota(ptr, ptr + idxCount, 0u);
+            idxType = EIT_32BIT;
+        }
+        else
+        {
+            idxBufferToProcess = mbIdxBuffer;
+        }
+
+        std::pair<uint32_t, core::smart_refctd_ptr<ICPUBuffer>> output;
+        output.first = meshBuffer->getIndexCount();
+
+        const auto& params = meshBuffer->getPipeline()->getPrimitiveAssemblyParams();
+        switch (params.primitiveType)
+        {
+        case EPT_TRIANGLE_STRIP:
+            output.second = IMeshManipulator::idxBufferFromTriangleStripsToTriangles(idxBufferToProcess->getPointer(), output.first, idxType, idxType);
+            return output;
+
+        case EPT_TRIANGLE_FAN:
+            output.second = IMeshManipulator::idxBufferFromTrianglesFanToTriangles(idxBufferToProcess->getPointer(), output.first, idxType, idxType);
+            return output;
+
+        //TODO: the packer should return when there is a mesh buffer with one of the following:
+        case EPT_TRIANGLE_LIST:
+        case EPT_POINT_LIST:
+        case EPT_LINE_LIST:
+        case EPT_LINE_STRIP:
+        case EPT_LINE_LIST_WITH_ADJACENCY:
+        case EPT_LINE_STRIP_WITH_ADJACENCY:
+        case EPT_TRIANGLE_LIST_WITH_ADJACENCY:
+        case EPT_TRIANGLE_STRIP_WITH_ADJACENCY:
+        case EPT_PATCH_LIST:
+        default:
+            assert(false);
+            return { 0u, nullptr };
+        }
+    }
+
+    IdxBufferParams createNewIdxBufferParamsForNonTriangleListTopologies(MeshBufferType* meshBuffer)
+    {
+        IdxBufferParams output;
+
+        const auto& mbPrimitiveType = meshBuffer->getPipeline()->getPrimitiveAssemblyParams().primitiveType;
+        if (mbPrimitiveType == EPT_TRIANGLE_LIST)
+        {
+            const auto& mbIdxBuff = meshBuffer->getIndexBufferBinding();
+            output.idxBuffer.offset = mbIdxBuff.offset;
+            output.idxBuffer.buffer = core::smart_refctd_ptr(mbIdxBuff.buffer);
+            output.idxType = meshBuffer->getIndexType();
+        }
+        else
+        {
+            auto newIdxBuffer = convertIdxBufferToTriangles(meshBuffer);
+            output.idxBuffer.offset = 0u;
+            output.idxBuffer.buffer = newIdxBuffer.second;
+            output.idxType = EIT_32BIT;
+        }
+
+        return output;
+    }
+
+protected:
+    template <typename BufferType>
+    struct PackerDataStoreCommon
+    {
+        static_assert(std::is_base_of<core::IBuffer, BufferType>::value);
+
+        core::smart_refctd_ptr<BufferType> MDIDataBuffer;
+
+        inline bool isValid()
+        {
+            return this->MDIDataBuffer->getPointer() != nullptr;
+        }
+    };
+
+};
+#endif
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
new file mode 100644
index 0000000000..55d1713619
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -0,0 +1,35 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+struct MeteringWindow
+{
+    using this_t = MeteringWindow;
+    float32_t2 meteringWindowScale;
+    float32_t2 meteringWindowOffset;
+
+    static this_t create(float32_t2 scale, float32_t2 offset) {
+        this_t retval;
+        retval.meteringWindowScale = scale;
+        retval.meteringWindowOffset = offset;
+        return retval;
+    }
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
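The `MeteringWindow` above is plain crop/offset state consumed by the meters in `luma_meter.hlsl` below. A minimal sketch of building one for a centered 80% crop (the free function is hypothetical, everything else comes from the header):

```hlsl
#include "nbl/builtin/hlsl/luma_meter/common.hlsl"

// Meter only the middle 80% of the screen: the scale shrinks the sampled UV
// range and the offset re-centers it so the crop stays symmetric.
nbl::hlsl::luma_meter::MeteringWindow makeCenteredWindow()
{
    return nbl::hlsl::luma_meter::MeteringWindow::create(
        nbl::hlsl::float32_t2(0.8f, 0.8f),  // meteringWindowScale
        nbl::hlsl::float32_t2(0.1f, 0.1f)); // meteringWindowOffset
}
```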
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
new file mode 100644
index 0000000000..20af804603
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -0,0 +1,287 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/luma_meter/common.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+struct geom_meter {
+    using float_t = typename SharedAccessor::type;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
+    {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
+        return retval;
+    }
+
+    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    {
+        return workgroup::reduction<plus<float_t>, GroupSize>::
+            template __call<SharedAccessor>(value, sdata);
+    }
+
+    float_t __computeLumaLog2(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    )
+    {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
+
+        return log2(luma);
+    }
+
+    void __uploadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
+        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+    }
+
+    float_t __downloadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        return luma / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(ValueAccessor) val,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    )
+    {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        uint32_t2 coord = {
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
+        };
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
+
+        if (tid == 0) {
+            __uploadFloat(
+                val,
+                lumaLog2Sum,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            );
+        }
+    }
+
+    float_t gatherLuma(
+        NBL_REF_ARG(ValueAccessor) val
+    )
+    {
+        uint32_t tid = glsl::gl_SubgroupInvocationID();
+        float_t luma = glsl::subgroupAdd(
+            __downloadFloat(
+                val,
+                tid,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            )
+        );
+
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
+    }
+
+    float_t sampleCount;
+    float_t2 lumaMinMax;
+};
+
+template<uint32_t GroupSize, uint32_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+struct median_meter {
+    using int_t = typename SharedAccessor::type;
+    using float_t = float32_t;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax) {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        return retval;
+    }
+
+    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+        return workgroup::inclusive_scan<plus<int_t>, GroupSize>::
+            template __call<SharedAccessor>(value, sdata);
+    }
+
+    float_t __computeLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    ) {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
+    }
+
+    int_t __float2Int(
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+    }
+
+    float_t __int2Float(
+        int_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        return val / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(vid, 0);
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t2 coord = {
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
+        };
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = __computeLuma(window, tex, shiftedCoord);
+
+        float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
+        uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
+
+        sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t histogram_value;
+        sdata.get(tid, histogram_value);
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t sum = __inclusive_scan(histogram_value, sdata);
+        histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        const bool is_last_wg_invocation = tid == (GroupSize - 1);
+        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
+
+        for (int i = 1; i < RoundedBinCount; i++) {
+            uint32_t keyBucketStart = GroupSize * i;
+            uint32_t vid = tid + keyBucketStart;
+
+            // no if statement about the last iteration needed
+            if (is_last_wg_invocation) {
+                float_t beforeSum;
+                sdata.get(keyBucketStart, beforeSum);
+                sdata.set(keyBucketStart, beforeSum + sum);
+            }
+
+            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
+            sdata.workgroupExecutionAndMemoryBarrier();
+
+            // no aliasing anymore
+            float_t atVid;
+            sdata.get(vid, atVid);
+            sum = __inclusive_scan(atVid, sdata);
+            if (vid < BinCount) {
+                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+            }
+        }
+    }
+
+    float_t gatherLuma(
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(SharedAccessor) sdata
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(
+                vid,
+                histo.get(vid & (BinCount - 1))
+            );
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t percentile40, percentile60;
+        sdata.get(BinCount * 0.4, percentile40);
+        sdata.get(BinCount * 0.6, percentile60);
+
+        return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+    }
+
+    float_t2 lumaMinMax;
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
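Both meters size their fixed-point accumulators from the dispatch: `32 - ceil(log2(workgroupCount)) + subgroupSizeLog2` bits remain usable, because the partial sums are spread across `2^subgroupSizeLog2` counters and each counter therefore only needs headroom for `workgroupCount / 2^subgroupSizeLog2` additions. A worked sketch of that arithmetic (standalone helper, the name is mine):

```hlsl
// For a 16x16x1 dispatch (256 workgroups) and a subgroup size of 32:
// 32 - ceil(log2(256)) + 5 = 32 - 8 + 5 = 29 bits stay usable per accumulator,
// since each of the 2^5 = 32 counters receives only 256/32 = 8 partial sums
// and thus needs just log2(8) = 3 bits of overflow headroom.
uint32_t fixedPointBitsLeft(nbl::hlsl::uint32_t3 workGroupCount, uint32_t subgroupSizeLog2)
{
    const uint32_t wgTotal = workGroupCount.x * workGroupCount.y * workGroupCount.z;
    return 32u - uint32_t(ceil(log2(float(wgTotal)))) + subgroupSizeLog2;
}
```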
diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
new file mode 100644
index 0000000000..c0769fc88b
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -0,0 +1,160 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+
+#ifdef __HLSL_VERSION
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#else
+#include <cstdint>
+#endif
+
+namespace nbl
+{
+namespace hlsl
+{
+
+namespace impl
+{
+
+template<typename T>
+NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n)
+{
+    const static uint64_t mask[5] =
+    {
+        0x5555555555555555ull,
+        0x3333333333333333ull,
+        0x0F0F0F0F0F0F0F0Full,
+        0x00FF00FF00FF00FFull,
+        0x0000FFFF0000FFFFull
+    };
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
+}
+
+template<typename T>
+NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n)
+{
+    const static uint64_t mask[5] =
+    {
+        0x1249249249249249ull,
+        0x10C30C30C30C30C3ull,
+        0x100F00F00F00F00Full,
+        0x001F0000FF0000FFull,
+        0x001F00000000FFFFull
+    };
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
+}
+template<typename T>
+NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n)
+{
+    const static uint64_t mask[4] =
+    {
+        0x1111111111111111ull,
+        0x0303030303030303ull,
+        0x000F000F000F000Full,
+        0x000000FF000000FFull
+    };
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
+}
+
+template<typename T, uint16_t bitDepth>
+inline T morton2d_decode(T x)
+{
+    x = x & morton2d_mask<T>(0);
+    x = (x | (x >> 1)) & morton2d_mask<T>(1);
+    x = (x | (x >> 2)) & morton2d_mask<T>(2);
+    if (bitDepth > 8u)
+    {
+        x = (x | (x >> 4)) & morton2d_mask<T>(3);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x >> 8)) & morton2d_mask<T>(4);
+    }
+    if (bitDepth > 32u)
+    {
+        x = (x | (x >> 16));
+    }
+    return x;
+}
+
+//! Puts bits on even positions filling gaps with 0s
+template<typename T, uint16_t bitDepth>
+inline T separate_bits_2d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 16)) & morton2d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 8)) & morton2d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 4)) & morton2d_mask<T>(2);
+    }
+    x = (x | (x << 2)) & morton2d_mask<T>(1);
+    x = (x | (x << 1)) & morton2d_mask<T>(0);
+
+    return x;
+}
+template<typename T, uint16_t bitDepth>
+inline T separate_bits_3d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 32)) & morton3d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 16)) & morton3d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 8)) & morton3d_mask<T>(2);
+    }
+    x = (x | (x << 4)) & morton3d_mask<T>(1);
+    x = (x | (x << 2)) & morton3d_mask<T>(0);
+
+    return x;
+}
+template<typename T, uint16_t bitDepth>
+inline T separate_bits_4d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 24)) & morton4d_mask<T>(3);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 12)) & morton4d_mask<T>(2);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 6)) & morton4d_mask<T>(1);
+    }
+    x = (x | (x << 3)) & morton4d_mask<T>(0);
+
+    return x;
+}
+}
+
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }
+
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
+template<typename T, uint16_t bitDepth = sizeof(T) * 8u>
+T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
+
+}
+}
+
+#endif
\ No newline at end of file
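A quick round-trip sketch of the API above (assuming the `bitDepth` template defaults as reconstructed, the helper function itself is hypothetical):

```hlsl
#include "nbl/builtin/hlsl/math/morton.hlsl"

// Interleave x into even and y into odd bits, then recover both coordinates.
// For x = 3 (0b011) and y = 5 (0b101) the code is 0b100111 = 39.
nbl::hlsl::uint32_t2 mortonRoundTrip(uint32_t x, uint32_t y)
{
    const uint32_t code = nbl::hlsl::morton2d_encode(x, y);
    return nbl::hlsl::uint32_t2(
        nbl::hlsl::morton2d_decode_x(code),  // -> x
        nbl::hlsl::morton2d_decode_y(code)); // -> y
}
```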
diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index a7614469dd..97077dffdd 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -140,37 +140,45 @@ enable_if_t<is_spirv_type_v<Ptr_U>/* && !is_same_v<Ptr_T,Ptr_U>*/,T> copyLogical(Ptr_U v);
 
 // Here's the thing with atomics, it's not only the data type that dictates whether you can do an atomic or not.
 // It's the storage class that has the most effect (shared vs storage vs image) and we can't check that easily
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
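The added annotations matter because DXC only emits `OpCapability` declarations that the intrinsics carry; an atomic on memory reached through buffer device address needs `PhysicalStorageBufferAddresses`, and the 64-bit overloads additionally need `Int64Atomics`. A hypothetical call site (the function is mine, and the scope/semantics constants are only illustrative values from the SPIR-V headers this file relies on):

```hlsl
#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"

// Bump a device-scope 64-bit counter; `counterRef` stands for any uint64_t
// l-value living in a storage buffer. Relies on both capabilities declared above.
uint64_t bumpCounter(inout uint64_t counterRef)
{
    return nbl::hlsl::spirv::atomicIAdd<uint64_t>(
        counterRef, spv::ScopeDevice, spv::MemorySemanticsMaskNone, uint64_t(1));
}
```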
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
new file mode 100644
index 0000000000..46d241c76c
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -0,0 +1,106 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace tonemapper
+{
+
+template<typename T>
+struct Reinhard
+{
+    using float_t = enable_if_t<is_floating_point<T>::value, T>;
+    using float_t3 = vector<float_t, 3>;
+    using this_t = Reinhard<T>;
+
+    static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
+    {
+        this_t retval;
+
+        const float_t unit = 1.0;
+        retval.keyAndManualLinearExposure = key * exp2(EV);
+        retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV);
+
+        return retval;
+    }
+
+    float_t3 operator()(float_t3 rawCIEXYZcolor) {
+        const float_t unit = 1.0;
+        float_t exposureFactors = keyAndManualLinearExposure;
+        float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
+        float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma));
+        return rawCIEXYZcolor * colorMultiplier;
+    }
+
+    float_t keyAndManualLinearExposure;
+    float_t rcpWhite2;
+};
+
+template<typename T>
+struct ACES
+{
+    using float_t = enable_if_t<is_floating_point<T>::value, T>;
+    using float_t3 = vector<float_t, 3>;
+    using float_t3x3 = matrix<float_t, 3, 3>;
+
+    using this_t = ACES<T>;
+    static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
+        this_t retval;
+        retval.gamma = Contrast;
+        const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
+        retval.exposure = EV + log2(key * reinhardMatchCorrection);
+        return retval;
+    }
+
+    float_t3 operator()(float_t3 rawCIEXYZcolor) {
+        const float_t unit = 1.0;
+        float_t3 tonemapped = rawCIEXYZcolor;
+        if (tonemapped.y > bit_cast<float_t>(numeric_limits<float_t>::min))
+            tonemapped *= exp2(log2(tonemapped.y) * (gamma - unit) + (exposure) * gamma);
+
+        // XYZ => RRT_SAT
+        // this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
+        const float_t3x3 XYZ_RRT_Input = float_t3x3(
+            float_t3(1.594168310, -0.262608051, -0.231993079),
+            float_t3(-0.6332771780, 1.5840380200, 0.0164147373),
+            float_t3(0.00892840419, 0.03648501260, 0.87711471300)
+        );
+
+        // this is obviously fitted to some particular simulated sensor/film and display
+        float_t3 v = mul(XYZ_RRT_Input, tonemapped);
+        float_t3 a = v * (v + promote<float_t3>(0.0245786)) - promote<float_t3>(0.000090537);
+        float_t3 b = v * (v * promote<float_t3>(0.983729) + promote<float_t3>(0.4329510)) + promote<float_t3>(0.238081);
+        v = a / b;
+
+        // ODT_SAT => XYZ
+        // this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB, BT2020 and ACEScc(t)
+        const float_t3x3 ODT_XYZ_Output = float_t3x3(
+            float_t3(0.624798000, 0.164064825, 0.161605373),
+            float_t3(0.268048108, 0.674283803, 0.057667464),
+            float_t3(0.0157514643, 0.0526682511, 1.0204007600)
+        );
+        return mul(ODT_XYZ_Output, v);
+    }
+
+    float_t gamma; // 1.0
+    float_t exposure; // actualExposure+midGrayLog2
+};
+
+// ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php
+// or get proper ACES RRT and ODTs
+// https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use-
+
+}
+}
+}
+
+#endif
\ No newline at end of file
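Both operators take and return CIE XYZ, so a caller tone-maps first and converts to the display colorspace afterwards. A minimal sketch at EV 0 with the default key and white point (the wrapper function is hypothetical):

```hlsl
#include "nbl/builtin/hlsl/tonemapper/operators.hlsl"

nbl::hlsl::float32_t3 tonemapXYZ(nbl::hlsl::float32_t3 rawCIEXYZ)
{
    using reinhard_t = nbl::hlsl::tonemapper::Reinhard<nbl::hlsl::float32_t>;
    reinhard_t op = reinhard_t::create(0.f); // key = 0.18, white point 16 EV above it
    return op(rawCIEXYZ); // still CIE XYZ, convert for display afterwards
}
```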
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index e8798499f9..74b91f05c6 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -27,6 +27,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/struct_declare.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
+# luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
+# tonemapper
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/legacy_bda_accessor.hlsl")
 # bump mapping
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl`
@@ -231,6 +236,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl")
 #extra math
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl")
+#morton
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/morton.hlsl")
 #acceleration structures
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl")
 #colorspace