@@ -24,10 +24,16 @@ static void cuda_device_cleanup(CudaDevice* device) {
2424
2525bool cuda_command_wait (CudaCommand * command ) {
2626 CHECK_CUDA (cuCtxSynchronize (), return false );
27+ if (command -> profiled_gpu_time ) {
28+ cudaEventSynchronize (command -> stop );
29+ float ms ;
30+ cudaEventElapsedTime (& ms , command -> start , command -> stop );
31+ * command -> profiled_gpu_time = (uint64_t ) ((double ) ms * 1000000 );
32+ }
2733 return true;
2834}
2935
30- CudaCommand * shd_cuda_launch_kernel (CudaDevice * device , Program * p , String entry_point , int dimx , int dimy , int dimz , int args_count , void * * args ) {
36+ CudaCommand * shd_cuda_launch_kernel (CudaDevice * device , Program * p , String entry_point , int dimx , int dimy , int dimz , int args_count , void * * args , ExtraKernelOptions * options ) {
3137 CudaKernel * kernel = shd_cuda_get_specialized_program (device , p , entry_point );
3238
3339 CudaCommand * cmd = calloc (sizeof (CudaCommand ), 1 );
@@ -36,11 +42,20 @@ CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry
3642 .wait_for_completion = (bool (* )(Command * )) cuda_command_wait
3743 }
3844 };
45+
46+ if (options && options -> profiled_gpu_time ) {
47+ cmd -> profiled_gpu_time = options -> profiled_gpu_time ;
48+ cudaEventCreate (& cmd -> start );
49+ cudaEventCreate (& cmd -> stop );
50+ cudaEventRecord (cmd -> start , 0 );
51+ }
52+
3953 ArenaConfig final_config = get_arena_config (get_module_arena (kernel -> final_module ));
4054 unsigned int gx = final_config .specializations .workgroup_size [0 ];
4155 unsigned int gy = final_config .specializations .workgroup_size [1 ];
4256 unsigned int gz = final_config .specializations .workgroup_size [2 ];
4357 CHECK_CUDA (cuLaunchKernel (kernel -> entry_point_function , dimx , dimy , dimz , gx , gy , gz , 0 , 0 , args , NULL ), return NULL );
58+ cudaEventRecord (cmd -> stop , 0 );
4459 return cmd ;
4560}
4661
@@ -63,7 +78,7 @@ static CudaDevice* create_cuda_device(CudaBackend* b, int ordinal) {
6378 .allocate_buffer = (Buffer * (* )(Device * , size_t )) shd_cuda_allocate_buffer ,
6479 .can_import_host_memory = (bool (* )(Device * )) shd_cuda_can_import_host_memory ,
6580 .import_host_memory_as_buffer = (Buffer * (* )(Device * , void * , size_t )) shd_cuda_import_host_memory ,
66- .launch_kernel = (Command * (* )(Device * , Program * , String , int , int , int , int , void * * )) shd_cuda_launch_kernel ,
81+ .launch_kernel = (Command * (* )(Device * , Program * , String , int , int , int , int , void * * , ExtraKernelOptions * )) shd_cuda_launch_kernel ,
6782 },
6883 .handle = handle ,
6984 .specialized_programs = new_dict (SpecProgramKey , CudaKernel * , (HashFn ) hash_spec_program_key , (CmpFn ) cmp_spec_program_keys ),
0 commit comments