Skip to content

Commit 4969227

Browse files
committed
Work on cooperative binary search
1 parent e5d5ae2 commit 4969227

File tree

9 files changed

+404
-0
lines changed

9 files changed

+404
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Load the `common` CMake module. RES receives the full path of the included
# file, or NOTFOUND when it cannot be located. OPTIONAL stops include() from
# raising its own error on a missing file, so the explicit check below is the
# one that reports the problem with a helpful message.
include(common OPTIONAL RESULT_VARIABLE RES)
if(NOT RES)
    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Creates the executable target for this example.
# NOTE(review): presumably this is what defines EXECUTABLE_NAME used below - confirm in common.cmake.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optionally embed everything found under app_resources/ into the executable.
if(NBL_EMBED_BUILTIN_RESOURCES)
    set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
    set(RESOURCE_DIR "app_resources")

    # Absolute locations handed to the builtin-resource helpers.
    get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
    get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
    get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

    # Collect every resource file (paths relative to the resource directory).
    # CONFIGURE_DEPENDS re-checks the glob at build time so added/removed files are noticed.
    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
        LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
    endforeach()

    # Generate the resource target (passing "nbl::this_example::builtin" as its namespace argument) and link it in.
    ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

    LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
2+
// This file is part of the "Nabla Engine".
3+
// For conditions of distribution and use, see copyright notice in nabla.h
4+
5+
#pragma wave shader_stage(compute)

#include "common.h"
using namespace nbl::hlsl;

// Push-constant block; its layout is the PushConstants struct from common.h.
[[vk::push_constant]] ConstantBuffer<PushConstants> Constants;

// Descriptor set 0: binding 0 is a read-only buffer, binding 1 a read-write buffer.
[[vk::binding(0, 0)]] StructuredBuffer<uint> Histogram;
[[vk::binding(1, 0)]] RWStructuredBuffer<uint> Output;

// Not referenced anywhere in this file yet.
static const uint32_t GroupsharedSize = 256;

// Compute entry point, 256x1x1 invocations per workgroup.
// The body is intentionally empty for now: the shader performs no reads or writes.
[numthreads(256, 1, 1)]
void main(const uint3 dispatchThreadID : SV_DispatchThreadID, const uint3 groupThreadID : SV_GroupThreadID, const uint3 groupID : SV_GroupID)
{
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#ifndef _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
#define _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_

#include <nbl/builtin/hlsl/cpp_compat/basic.h>
#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>

// This header deliberately contains no file-scope `using namespace`:
// a using-directive in a header leaks into every file that includes it.
// Includers that want unqualified names should add their own directive.
namespace nbl
{
namespace hlsl
{

// Push-constant payload for the cooperative binary search compute shader.
struct PushConstants
{
    // NOTE(review): presumably the number of entries to search over - confirm once the shader reads it.
    uint32_t EntityCount;
};

}
}

#endif // _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
2+
// This file is part of the "Nabla Engine".
3+
// For conditions of distribution and use, see copyright notice in nabla.h
4+
5+
#pragma wave shader_stage(fragment)

// The matching vertex shader comes from the FullScreenTriangle extension.
#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
using namespace nbl::hlsl;
using namespace ext::FullScreenTriangle;

// Texture and sampler share set 0, binding 0 as one combined image sampler.
[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;

// Samples the bound texture at the incoming UV and outputs its RGB with alpha forced to 1.
[[vk::location(0)]] float32_t4 main(SVertexAttributes vertexAttributes) : SV_Target0
{
    const float32_t3 rgb = texture.Sample(samplerState, vertexAttributes.uv).rgb;
    return float32_t4(rgb, 1.0f);
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"enableParallelBuild": true,
3+
"threadsPerBuildProcess" : 2,
4+
"isExecuted": false,
5+
"scriptPath": "",
6+
"cmake": {
7+
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
8+
"buildModes": [],
9+
"requiredOptions": []
10+
},
11+
"profiles": [
12+
{
13+
"backend": "vulkan",
14+
"platform": "windows",
15+
"buildModes": [],
16+
"runConfiguration": "Release",
17+
"gpuArchitectures": []
18+
}
19+
],
20+
"dependencies": [],
21+
"data": [
22+
{
23+
"dependencies": [],
24+
"command": [""],
25+
"outputs": []
26+
}
27+
]
28+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
2+
#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
3+
4+
#include "nbl/examples/examples.hpp"
5+
6+
// example's own headers
7+
#include "nbl/ui/ICursorControl.h" // TODO: why not in nabla.h ?
8+
#include "nbl/ext/ImGui/ImGui.h"
9+
#include "imgui/imgui_internal.h"
10+
11+
#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
2+
// This file is part of the "Nabla Engine".
3+
// For conditions of distribution and use, see copyright notice in nabla.h
4+
5+
#include "nbl/examples/examples.hpp"
6+
#include "nbl/system/IApplicationFramework.h"
7+
#include "app_resources/common.h"
8+
9+
#include <iostream>
10+
#include <cstdio>
11+
#include <assert.h>
12+
13+
14+
using namespace nbl;
15+
using namespace nbl::core;
16+
using namespace nbl::hlsl;
17+
using namespace nbl::system;
18+
using namespace nbl::asset;
19+
using namespace nbl::ui;
20+
using namespace nbl::video;
21+
using namespace nbl::examples;
22+
23+
//using namespace glm;
24+
25+
void cpu_tests();
26+
27+
/// Headless compute application for the (work-in-progress) cooperative binary search shader.
///
/// onAppInitialized() builds a compute pipeline with two storage-buffer bindings,
/// workLoopBody() records and submits a single dispatch, blocks until it has
/// finished, maps the second buffer and then asks the framework to stop.
class CooperativeBinarySearch final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
{
    using device_base_t = application_templates::MonoDeviceApplication;
    using asset_base_t = BuiltinResourcesApplication;

public:
    CooperativeBinarySearch(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
        IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}

    /// Creates every GPU object the example needs.
    /// @param system System object forwarded to both base classes.
    /// @return true on success; false (after logging via logFail) as soon as any step fails.
    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
    {
        // Remember to call the base class initialization!
        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
            return false;
        if (!asset_base_t::onAppInitialized(std::move(system)))
            return false;

        m_queue = m_device->getQueue(0, 0);
        if (!m_queue)
            return logFail("Failed to obtain a queue!\n");

        m_commandPool = m_device->createCommandPool(m_queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
        if (!m_commandPool)
            return logFail("Failed to create command pool!\n");

        m_commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &m_cmdbuf,1 }, smart_refctd_ptr(m_logger));
        if (!m_cmdbuf)
            return logFail("Failed to create command buffer!\n");

        smart_refctd_ptr<IShader> shader;
        {
            IAssetLoader::SAssetLoadParams lp = {};
            lp.logger = m_logger.get();
            lp.workingDirectory = ""; // virtual root
            auto assetBundle = m_assetMgr->getAsset("app_resources/binarySearch.comp.hlsl", lp);
            const auto assets = assetBundle.getContents();
            if (assets.empty())
                return logFail("Could not load shader!");

            // Checked at runtime rather than with assert so release builds do not continue with a null source.
            auto source = IAsset::castDown<IShader>(assets[0]);
            if (!source)
                return logFail("Loaded asset is not a shader!");

            // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
            shader = m_device->compileShader({ source.get() });
            if (!shader)
                return logFail("Creation of a GPU Shader from CPU Shader source failed!");
        }

        const uint32_t bindingCount = 2u;
        IGPUDescriptorSetLayout::SBinding bindings[bindingCount] = {};
        bindings[0].type = IDescriptor::E_TYPE::ET_STORAGE_BUFFER; // [[vk::binding(0)]] StructuredBuffer<uint> Histogram;
        bindings[1].type = IDescriptor::E_TYPE::ET_STORAGE_BUFFER; // [[vk::binding(1)]] RWStructuredBuffer<uint> Output;

        for (uint32_t i = 0; i < bindingCount; ++i)
        {
            bindings[i].stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE;
            bindings[i].count = 1;
            bindings[i].binding = i;
        }
        m_descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
        if (!m_descriptorSetLayout)
            return logFail("Failed to create descriptor set layout!\n");

        {
            SPushConstantRange pcRange = {};
            pcRange.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE;
            pcRange.offset = 0u;
            // NOTE(review): the shader-side PushConstants struct holds a single uint32_t, so this range
            // is twice as large as what the shader declares - confirm whether that is intended.
            pcRange.size = 2 * sizeof(uint32_t);
            auto layout = m_device->createPipelineLayout({ &pcRange,1 }, smart_refctd_ptr(m_descriptorSetLayout));
            if (!layout)
                return logFail("Failed to create pipeline layout!\n");

            IGPUComputePipeline::SCreationParams params = {};
            params.layout = layout.get();
            params.shader.shader = shader.get();
            params.shader.entryPoint = "main";
            if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
                return logFail("Failed to create compute pipeline!\n");
        }

        // One buffer per binding, backed by host-visible memory so it can be mapped later.
        for (uint32_t i = 0; i < bindingCount; ++i)
        {
            m_buffers[i] = m_device->createBuffer(IGPUBuffer::SCreationParams {
                {.size = 500000, .usage =
                    IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT |
                    IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_BUFFER_BIT,
                }
            });
            if (!m_buffers[i])
                return logFail("Failed to create GPU buffer!\n");

            auto reqs = m_buffers[i]->getMemoryReqs();
            reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits();
            auto allocation = m_device->allocate(reqs, m_buffers[i].get());
            if (!allocation.isValid())
                return logFail("Failed to allocate host-visible memory for GPU buffer!\n");
        }

        smart_refctd_ptr<IDescriptorPool> descriptorPool = nullptr;
        {
            IDescriptorPool::SCreateInfo createInfo = {};
            createInfo.maxSets = 1;
            // The set layout declares `bindingCount` storage-buffer descriptors (one per binding),
            // so the pool has to be able to hold all of them.
            createInfo.maxDescriptorCount[static_cast<uint32_t>(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = bindingCount;
            descriptorPool = m_device->createDescriptorPool(std::move(createInfo));
        }
        if (!descriptorPool)
            return logFail("Failed to create descriptor pool!\n");

        m_descriptorSet = descriptorPool->createDescriptorSet(smart_refctd_ptr(m_descriptorSetLayout));
        if (!m_descriptorSet)
            return logFail("Failed to create descriptor set!\n");

        IGPUDescriptorSet::SDescriptorInfo descriptorInfos[bindingCount] = {};
        IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSets[bindingCount] = {};

        for (uint32_t i = 0; i < bindingCount; ++i)
        {
            writeDescriptorSets[i].info = &descriptorInfos[i];
            writeDescriptorSets[i].dstSet = m_descriptorSet.get();
            writeDescriptorSets[i].binding = i;
            writeDescriptorSets[i].count = bindings[i].count;

            descriptorInfos[i].desc = m_buffers[i];
            descriptorInfos[i].info.buffer.size = ~0ull;
        }

        m_device->updateDescriptorSets(bindingCount, writeDescriptorSets, 0u, nullptr);

        return true;
    }

    /// Waits for the device to go idle before the application shuts down.
    void onAppTerminated_impl() override
    {
        m_device->waitIdle();
    }

    /// Records one dispatch, submits it, blocks until the GPU signals completion,
    /// maps the second buffer and then clears m_keepRunning so the loop runs only once.
    void workLoopBody() override
    {
        cpu_tests();

        constexpr auto StartedValue = 0;

        smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(StartedValue);
        if (!progress)
        {
            logFail("Failed to create the progress semaphore!\n");
            m_keepRunning = false;
            return;
        }

        m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
        m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);

        // Barrier over the whole of m_buffers[1]: host writes -> compute shader writes.
        IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t layoutBufferBarrier[1] = { {
            .barrier = {
                .dep = {
                    .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT,
                    .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT,
                    .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
                    .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
                }
            },
            .range = {.offset = 0,.size = m_buffers[1]->getCreationParams().size,.buffer = m_buffers[1]}
        } };

        // depInfo is built from the array above, so the later edit of layoutBufferBarrier[0]
        // is what the second pipelineBarrier call is meant to pick up.
        const IGPUCommandBuffer::SPipelineBarrierDependencyInfo depInfo = { .bufBarriers = layoutBufferBarrier };
        m_cmdbuf->pipelineBarrier(EDF_NONE, depInfo);

        const IGPUDescriptorSet* set = m_descriptorSet.get();
        m_cmdbuf->bindComputePipeline(m_pipeline.get());
        m_cmdbuf->bindDescriptorSets(EPBP_COMPUTE, m_pipeline->getLayout(), 0u, 1u, &set);
        m_cmdbuf->dispatch(240, 135, 1u);

        // NOTE(review): this chains the barrier to COPY/TRANSFER_READ, yet no copy is recorded and the
        // buffer is mapped on the host below - confirm whether HOST/HOST_READ was intended instead.
        layoutBufferBarrier[0].barrier.dep = layoutBufferBarrier[0].barrier.dep.nextBarrier(PIPELINE_STAGE_FLAGS::COPY_BIT,ACCESS_FLAGS::TRANSFER_READ_BIT);
        m_cmdbuf->pipelineBarrier(EDF_NONE,depInfo);

        m_cmdbuf->end();

        {
            constexpr auto FinishedValue = 69;
            IQueue::SSubmitInfo submitInfos[1] = {};
            const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} };
            submitInfos[0].commandBuffers = cmdbufs;
            const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = progress.get(),.value = FinishedValue,.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} };
            submitInfos[0].signalSemaphores = signals;

            // The submit is bracketed by the API connection's capture calls.
            m_api->startCapture();
            m_queue->submit(submitInfos);
            m_api->endCapture();

            // Block the host until the submission has signalled FinishedValue.
            const ISemaphore::SWaitInfo waitInfos[] = { {
                .semaphore = progress.get(),
                .value = FinishedValue
            } };
            m_device->blockForSemaphores(waitInfos);
        }

        auto mem = m_buffers[1]->getBoundMemory();
        assert(mem.memory->isMappable());
        auto* ptr = mem.memory->map({ .offset = 0, .length = mem.memory->getAllocationSize() });
        printf("readback ptr %p\n", ptr);
        // Nothing reads through the mapping yet, so release it straight away instead of leaving it mapped.
        if (ptr)
            mem.memory->unmap();

        m_keepRunning = false;
    }

    /// @return false once workLoopBody() has completed its single pass.
    bool keepRunning() override
    {
        return m_keepRunning;
    }

private:
    smart_refctd_ptr<IGPUComputePipeline> m_pipeline = nullptr;
    smart_refctd_ptr<IGPUDescriptorSetLayout> m_descriptorSetLayout;
    smart_refctd_ptr<IGPUDescriptorSet> m_descriptorSet;

    // m_buffers[0] is bound at binding 0, m_buffers[1] at binding 1.
    smart_refctd_ptr<IGPUBuffer> m_buffers[2];
    smart_refctd_ptr<IGPUCommandBuffer> m_cmdbuf = nullptr;
    IQueue* m_queue = nullptr;
    smart_refctd_ptr<IGPUCommandPool> m_commandPool;

    // Neither of these two is read or written by any method of this class yet.
    uint64_t m_iteration = 0;
    constexpr static inline uint64_t MaxIterations = 200;

    bool m_keepRunning = true;
};
227+
228+
NBL_MAIN_FUNC(CooperativeBinarySearch)
229+
230+
// Intentionally empty for now; invoked once at the start of workLoopBody().
void cpu_tests() {}

0 commit comments

Comments
 (0)