added gpu-side timings

Hugobros3 · Hugobros3 · commit b327896a922a · 2024-04-16T17:32:12.000+02:00
diff --git a/include/shady/runtime.h b/include/shady/runtime.h
@@ -30,7 +30,11 @@ typedef struct Module_ Module;
 
 Program* new_program_from_module(Runtime*, const CompilerConfig*, Module*);
 
-Command* launch_kernel(Program*, Device*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args);
+typedef struct { // optional per-launch settings; launch_kernel callers may pass NULL instead of a pointer to this struct
+    uint64_t* profiled_gpu_time; // optional out-pointer: when non-NULL, the backend stores the measured GPU time of the launch here once the command is waited on (the CUDA backend stores nanoseconds)
+} ExtraKernelOptions;
+
+Command* launch_kernel(Program*, Device*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions*);
 bool wait_completion(Command*);
 
 Buffer* allocate_buffer_device(Device*, size_t);
diff --git a/samples/aobench/ao_main.c b/samples/aobench/ao_main.c
@@ -132,15 +132,19 @@ void render_device(Args* args, TEXEL_T *img, int w, int h, int nsubsamples, Stri
     Program* program = new_program_from_module(runtime, &args->compiler_config, m);
 
     // run it twice to compile everything and benefit from caches
-    wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }));
+    wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }, NULL));
     struct timespec ts;
     timespec_get(&ts, TIME_UTC);
     uint64_t tsn = timespec_to_nano(ts);
-    wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }));
+    uint64_t profiled_gpu_time = 0;
+    ExtraKernelOptions extra_kernel_options = {
+        .profiled_gpu_time = &profiled_gpu_time
+    };
+    wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }, &extra_kernel_options));
     struct timespec tp;
     timespec_get(&tp, TIME_UTC);
     uint64_t tpn = timespec_to_nano(tp);
-    info_print("device rendering took %d us\n", (tpn - tsn) / 1000);
+    info_print("device rendering took %lluus (gpu time: %lluus)\n", (unsigned long long) ((tpn - tsn) / 1000), (unsigned long long) (profiled_gpu_time / 1000)); // both values are uint64_t nanoseconds; %d would mismatch the argument width
 
     debug_print("data %d\n", (int) img[0]);
 
diff --git a/samples/checkerboard/checkerboard.c b/samples/checkerboard/checkerboard.c
@@ -73,7 +73,7 @@ int main(int argc, char **argv)
         error("Failed to load checkerboard module");
     Program* program = new_program_from_module(runtime, &compiler_config, m);
 
-    wait_completion(launch_kernel(program, device, "checkerboard", 16, 16, 1, 1, (void*[]) { &buf_addr }));
+    wait_completion(launch_kernel(program, device, "checkerboard", 16, 16, 1, 1, (void*[]) { &buf_addr }, NULL));
 
     copy_from_buffer(buf, 0, img, buf_size);
     info_print("data %d\n", (int) img[0]);
diff --git a/src/runtime/cuda/cuda_runtime.c b/src/runtime/cuda/cuda_runtime.c
@@ -24,10 +24,16 @@ static void cuda_device_cleanup(CudaDevice* device) {
 
 bool cuda_command_wait(CudaCommand* command) {
     CHECK_CUDA(cuCtxSynchronize(), return false);
+    if (command->profiled_gpu_time) { // profiling was requested at launch: turn the start/stop event pair into a duration
+        float ms = 0.0f; // defined value even if the event queries below fail
+        if (cudaEventSynchronize(command->stop) != cudaSuccess || cudaEventElapsedTime(&ms, command->start, command->stop) != cudaSuccess)
+            return false; // report the failure instead of publishing a bogus timing
+        *command->profiled_gpu_time = (uint64_t) ((double) ms * 1000000); // cudaEventElapsedTime yields milliseconds; store nanoseconds
+    }
     return true;
 }
 
-CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {
+CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* options) {
     CudaKernel* kernel = shd_cuda_get_specialized_program(device, p, entry_point);
 
     CudaCommand* cmd = calloc(sizeof(CudaCommand), 1);
@@ -36,11 +42,20 @@ CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry
             .wait_for_completion = (bool(*)(Command*)) cuda_command_wait
         }
     };
+
+    if (options && options->profiled_gpu_time) {
+        // Enable profiling only once both events exist and the start marker is queued on stream 0,
+        // so the wait path never queries event handles that were not created.
+        if (cudaEventCreate(&cmd->start) == cudaSuccess && cudaEventCreate(&cmd->stop) == cudaSuccess && cudaEventRecord(cmd->start, 0) == cudaSuccess)
+            cmd->profiled_gpu_time = options->profiled_gpu_time;
+    }
+
     ArenaConfig final_config = get_arena_config(get_module_arena(kernel->final_module));
     unsigned int gx = final_config.specializations.workgroup_size[0];
     unsigned int gy = final_config.specializations.workgroup_size[1];
     unsigned int gz = final_config.specializations.workgroup_size[2];
     CHECK_CUDA(cuLaunchKernel(kernel->entry_point_function, dimx, dimy, dimz, gx, gy, gz, 0, 0, args, NULL), return NULL);
+    if (cmd->profiled_gpu_time) cudaEventRecord(cmd->stop, 0); // cmd->stop is only a created event when profiling was set up above
     return cmd;
 }
 
@@ -63,7 +78,7 @@ static CudaDevice* create_cuda_device(CudaBackend* b, int ordinal) {
             .allocate_buffer = (Buffer* (*)(Device*, size_t)) shd_cuda_allocate_buffer,
             .can_import_host_memory = (bool (*)(Device*)) shd_cuda_can_import_host_memory,
             .import_host_memory_as_buffer = (Buffer* (*)(Device*, void*, size_t)) shd_cuda_import_host_memory,
-            .launch_kernel = (Command*(*)(Device*, Program*, String, int, int, int, int, void**)) shd_cuda_launch_kernel,
+            .launch_kernel = (Command*(*)(Device*, Program*, String, int, int, int, int, void**, ExtraKernelOptions*)) shd_cuda_launch_kernel,
         },
         .handle = handle,
         .specialized_programs = new_dict(SpecProgramKey, CudaKernel*, (HashFn) hash_spec_program_key, (CmpFn) cmp_spec_program_keys),
diff --git a/src/runtime/cuda/cuda_runtime_private.h b/src/runtime/cuda/cuda_runtime_private.h
@@ -4,6 +4,7 @@
 #include "../runtime_private.h"
 
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <nvrtc.h>
 
 #define CHECK_NVRTC(x, failure_handler) { nvrtcResult the_result_ = x; if (the_result_ != NVRTC_SUCCESS) { const char* msg = nvrtcGetErrorString(the_result_); error_print(#x " failed (%s)\n", msg); failure_handler; } }
@@ -40,6 +41,9 @@ typedef struct {
 
 typedef struct {
     Command base;
+
+    uint64_t* profiled_gpu_time; // destination for the measured GPU time, written when the command is waited on; the wait path skips timing when this is NULL
+    cudaEvent_t start, stop; // events recorded before and after the kernel launch when profiling is requested
 } CudaCommand;
 
 typedef struct {
diff --git a/src/runtime/runtime.c b/src/runtime/runtime.c
@@ -74,8 +74,8 @@ Device* get_an_device(Runtime* r) {
 
 const char* get_device_name(Device* d) { return d->get_name(d); }
 
-Command* launch_kernel(Program* p, Device* d, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {
-    return d->launch_kernel(d, p, entry_point, dimx, dimy, dimz, args_count, args);
+Command* launch_kernel(Program* p, Device* d, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* extra_options) { // extra_options may be NULL when no extra behaviour is wanted
+    return d->launch_kernel(d, p, entry_point, dimx, dimy, dimz, args_count, args, extra_options); // dispatch through the device's backend function pointer, forwarding the options unchanged
 }
 
 bool wait_completion(Command* cmd) { return cmd->wait_for_completion(cmd); }
diff --git a/src/runtime/runtime_private.h b/src/runtime/runtime_private.h
@@ -30,7 +30,7 @@ struct Device_ {
     void (*cleanup)(Device*);
     String (*get_name)(Device*);
 
-    Command* (*launch_kernel)(Device*, Program*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args);
+    Command* (*launch_kernel)(Device*, Program*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions*);
     Buffer* (*allocate_buffer)(Device*, size_t bytes);
     Buffer* (*import_host_memory_as_buffer)(Device*, void* base, size_t bytes);
     bool (*can_import_host_memory)(Device*);
diff --git a/src/runtime/runtime_test.c b/src/runtime/runtime_test.c
@@ -70,7 +70,7 @@ int main(int argc, char* argv[]) {
 
     int32_t a0 = 42;
     uint64_t a1 = get_buffer_device_pointer(buffer);
-    wait_completion(launch_kernel(program, device, args.driver_config.config.specialization.entry_point ? args.driver_config.config.specialization.entry_point : "my_kernel", 1, 1, 1, 2, (void*[]) { &a0, &a1 }));
+    wait_completion(launch_kernel(program, device, args.driver_config.config.specialization.entry_point ? args.driver_config.config.specialization.entry_point : "my_kernel", 1, 1, 1, 2, (void*[]) { &a0, &a1 }, NULL));
 
     destroy_buffer(buffer);
 
diff --git a/src/runtime/vulkan/vk_runtime_dispatch.c b/src/runtime/vulkan/vk_runtime_dispatch.c
@@ -57,7 +57,7 @@ static Command make_command_base() {
     };
 }
 
-VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {
+VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* options) {
     assert(program && device);
 
     VkrSpecProgram* prog = get_specialized_program(program, entry_point, device);
@@ -82,8 +82,26 @@ VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_
 
     vkCmdBindPipeline(cmd->cmd_buf, VK_PIPELINE_BIND_POINT_COMPUTE, prog->pipeline);
     bind_program_resources(cmd, prog);
+
+    if (options && options->profiled_gpu_time) { // GPU timing requested: bracket the dispatch with two timestamp queries
+        VkQueryPoolCreateInfo qpci = {
+            .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+            .pNext = NULL,
+            .queryType = VK_QUERY_TYPE_TIMESTAMP,
+            .queryCount = 2, // query 0 = before the dispatch, query 1 = after it
+        };
+        CHECK_VK(vkCreateQueryPool(device->device, &qpci, NULL, &cmd->query_pool), {});
+        cmd->profiled_gpu_time = options->profiled_gpu_time;
+        vkCmdResetQueryPool(cmd->cmd_buf, cmd->query_pool, 0, qpci.queryCount); // every query must be reset before it is written; resetting only one left query 1 unreset
+        vkCmdWriteTimestamp(cmd->cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, cmd->query_pool, 0);
+    }
+
     vkCmdDispatch(cmd->cmd_buf, dimx, dimy, dimz);
 
+    if (options && options->profiled_gpu_time) {
+        vkCmdWriteTimestamp(cmd->cmd_buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, cmd->query_pool, 1);
+    }
+
     if (!vkr_submit_command(cmd))
         goto err_post_commands_create;
 
@@ -153,13 +171,20 @@ bool vkr_submit_command(VkrCommand* cmd) {
 bool vkr_wait_completion(VkrCommand* cmd) {
     assert(cmd->submitted && "Command must be submitted before they can be waited on");
     CHECK_VK(vkWaitForFences(cmd->device->device, 1, (VkFence[]) { cmd->done_fence }, true, UINT32_MAX), return false);
+    if (cmd->profiled_gpu_time) { // GPU timing was requested at launch: read back both timestamp queries
+        uint64_t ts[2] = { 0, 0 }; // defined values, so a failed readback reports 0 instead of stack garbage
+        CHECK_VK(vkGetQueryPoolResults(cmd->device->device, cmd->query_pool, 0, 2, sizeof(uint64_t) * 2, ts, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT), {});
+        *cmd->profiled_gpu_time = ts[1] - ts[0]; // NOTE(review): this is a raw timestamp-tick delta; scaling by the device's timestampPeriod is presumably needed to get nanoseconds - confirm
+    }
     vkr_destroy_command(cmd);
     return true;
 }
 
 void vkr_destroy_command(VkrCommand* cmd) {
     if (cmd->submitted)
         vkDestroyFence(cmd->device->device, cmd->done_fence, NULL);
+    if (cmd->query_pool) // the launch path only creates a query pool when GPU timing was requested
+        vkDestroyQueryPool(cmd->device->device, cmd->query_pool, NULL); // release the timestamp query pool together with the command
     vkFreeCommandBuffers(cmd->device->device, cmd->device->cmd_pool, 1, &cmd->cmd_buf);
     free(cmd);
 }
diff --git a/src/runtime/vulkan/vk_runtime_private.h b/src/runtime/vulkan/vk_runtime_private.h
@@ -181,14 +181,17 @@ struct VkrCommand_ {
     VkCommandBuffer cmd_buf;
     VkFence done_fence;
     bool submitted;
+
+    uint64_t* profiled_gpu_time;
+    VkQueryPool query_pool;
 };
 
 VkrCommand* vkr_begin_command(VkrDevice* device);
 bool vkr_submit_command(VkrCommand* commands);
 void vkr_destroy_command(VkrCommand* commands);
 bool vkr_wait_completion(VkrCommand* cmd);
 
-VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args);
+VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions*);
 
 typedef struct ProgramResourceInfo_ ProgramResourceInfo;
 struct ProgramResourceInfo_ {

Original file line number	Diff line number	Diff line change
`@@ -74,8 +74,8 @@ Device* get_an_device(Runtime* r) {`
`74`	`74`
`75`	`75`	`const char* get_device_name(Device* d) { return d->get_name(d); }`
`76`	`76`
`77`		`-Command* launch_kernel(Program* p, Device* d, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {`
`78`		`- return d->launch_kernel(d, p, entry_point, dimx, dimy, dimz, args_count, args);`
	`77`	`+Command* launch_kernel(Program* p, Device* d, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* extra_options) {`
	`78`	`+ return d->launch_kernel(d, p, entry_point, dimx, dimy, dimz, args_count, args, extra_options);`
`79`	`79`	`}`
`80`	`80`
`81`	`81`	`bool wait_completion(Command* cmd) { return cmd->wait_for_completion(cmd); }`