diff --git a/GPU/Common/GPUCommonDefAPI.h b/GPU/Common/GPUCommonDefAPI.h index f7efbf7e976d4..b029038a3b521 100644 --- a/GPU/Common/GPUCommonDefAPI.h +++ b/GPU/Common/GPUCommonDefAPI.h @@ -43,7 +43,7 @@ #define GPUhd() // Host and device function, inlined during GPU compilation to avoid symbol clashes in host code #define GPUhdi() inline // Host and device function, to-be-inlined on host and device #define GPUhdni() // Host and device function, not to-be-inlined automatically - #define GPUg() INVALID_TRIGGER_ERROR_NO_HOST_CODE // GPU kernel + #define GPUg() INVALID_TRIGGER_ERROR_NO_GPU_CODE // GPU kernel #define GPUshared() // shared memory variable declaration #define GPUglobal() // global memory variable declaration (only used for kernel input pointers) #define GPUconstant() // constant memory variable declaraion diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 9b569d3e88f3c..39507beda8a55 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -16,7 +16,7 @@ #include "GPUReconstructionIncludes.h" #include "GPUReconstructionThreading.h" #include "GPUChain.h" -#include "GPUDefParameters.h" +#include "GPUDefParametersRuntime.h" #include "GPUTPCClusterData.h" #include "GPUTPCSectorOutCluster.h" #include "GPUTPCGMMergedTrack.h" diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index e17f1fcd7091e..3655eaf66055e 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -17,7 +17,7 @@ if(DEFINED CUDA_COMPUTETARGET) endif() message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}") -set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu) +set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDARTCCalls.cu) set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesSystem.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index f475929d49d50..47a9b675d27f6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -111,6 +111,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() constexpr int32_t reqVerMin = 0; #endif if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) { + mWarpSize = GPUCA_WARP_SIZE; genAndLoadRTC(); exit(0); } @@ -244,16 +245,12 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment); GPUInfo(" "); } - if (deviceProp.warpSize != GPUCA_WARP_SIZE) { + if (deviceProp.warpSize != GPUCA_WARP_SIZE && !mProcessingSettings.rtc.enable) { throw std::runtime_error("Invalid warp size on GPU"); } + mWarpSize = deviceProp.warpSize; mBlockCount = deviceProp.multiProcessorCount; mMaxBackendThreads = std::max(mMaxBackendThreads, deviceProp.maxThreadsPerBlock * mBlockCount); -#ifndef __HIPCC__ // CUDA - mWarpSize = 32; -#else // HIP - mWarpSize = 64; -#endif mDeviceName = deviceProp.name; mDeviceName += " (CUDA GPU)"; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index ac5920f769f25..3441c6b9a4fd6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -45,8 +45,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase template void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - void getRTCKernelCalls(std::vector& kernels); - template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); GPUReconstructionCUDAInternals* mInternals; @@ -91,6 +89,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels& kernels); void genAndLoadRTC(); void loadKernelModules(bool perKernel); const char *mRtcSrcExtension = ".src", *mRtcBinExtension = ".o"; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 5f481d2cb9058..abcd47ca01c90 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -73,7 +73,8 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } fclose(fp); } - const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true); + const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) + + "#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n"; if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) { GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str()); } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h index 1cb3679fc30dc..3f072059a9ad7 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h @@ -12,8 +12,8 @@ /// \file GPUReconstructionCUDAIncludesSystem.h /// \author David Rohr -#ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H -#define O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H +#ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDESSYSTEM_H +#define O2_GPU_GPURECONSTRUCTIONCUDAINCLUDESSYSTEM_H #include #include @@ -32,4 +32,4 @@ #include #include -#endif +#endif // O2_GPU_GPURECONSTRUCTIONCUDAINCLUDESSYSTEM_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index f8efd8428f035..cf08785e6b3d5 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -118,14 +118,3 @@ static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstS return retVal; }); #endif - -void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector& kernels) -{ -#undef GPUCA_KRNL_LB -#undef __launch_bounds__ -#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__))); -#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_RTC_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__) -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL -#undef GPUCA_KRNL_LB -} diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu new file mode 100644 index 0000000000000..571428dc39e21 --- /dev/null +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu @@ -0,0 +1,32 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionCUDARTCCalls.cu +/// \author David Rohr + +#define GPUCA_GPUCODE_HOSTONLY +#define GPUCA_GPUCODE_NO_LAUNCH_BOUNDS + +#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args)) + +#include "GPUReconstructionCUDAIncludesSystem.h" +#include "GPUReconstructionCUDADef.h" +#include "GPUReconstructionCUDA.h" + +using namespace o2::gpu; + +void GPUReconstructionCUDA::getRTCKernelCalls(std::vector& kernels) +{ +#undef GPUCA_KRNL +#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__))); +#undef __launch_bounds__ +#include "GPUReconstructionKernelList.h" +} diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 9a9b1e36a167c..3a03a054d4a7e 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -24,20 +24,30 @@ message(STATUS "Building GPUTracking with HIP support ${TMP_TARGET}") if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) - set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) + set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu GPUReconstructionCUDARTCCalls.cu) set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesSystem.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) get_filename_component(ABS_CUDA_SORUCE ../cuda/${file} ABSOLUTE) get_filename_component(CUDA_SOURCE ${file} NAME) + get_filename_component(CUDA_SOURCE_EXT ${file} EXT) string(REPLACE ".cu" ".hip" HIP_SOURCE1 ${CUDA_SOURCE}) string(REPLACE "CUDA" "HIP" HIP_SOURCE ${HIP_SOURCE1}) - add_custom_command( - OUTPUT ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE} - COMMAND ${hip_HIPIFY_PERL_EXECUTABLE} --quiet-warnings ${ABS_CUDA_SORUCE} | sed -e 's/CUDA/HIP/g' -e 's/cuda/hip/g' > ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE} - DEPENDS ${ABS_CUDA_SORUCE} - COMMENT "Hippifying ${HIP_SOURCE}" - ) + if(CUDA_SOURCE_EXT STREQUAL ".cu" OR CUDA_SOURCE_EXT STREQUAL ".h") + add_custom_command( + OUTPUT ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE} + COMMAND ${hip_HIPIFY_PERL_EXECUTABLE} --quiet-warnings ${ABS_CUDA_SORUCE} | sed -e 's/CUDA/HIP/g' -e 's/cuda/hip/g' > ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE} + DEPENDS ${ABS_CUDA_SORUCE} + COMMENT "Hippifying ${HIP_SOURCE}" + ) + else() + add_custom_command( + OUTPUT ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE} + COMMAND sed -e 's/CUDA/HIP/g' -e 's/cuda/hip/g' ${ABS_CUDA_SORUCE} > ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE} + DEPENDS ${ABS_CUDA_SORUCE} + COMMENT "Generating HIP source ${HIP_SOURCE}" + ) + endif() list(APPEND HIP_SOURCES "${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}") endforeach() foreach(file ${GPUCA_HIP_LOCAL_FILE_LIST}) @@ -61,7 +71,7 @@ else() get_filename_component(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} ABSOLUTE) endif() -set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip) +set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPRTCCalls.hip) set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx) set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesSystem.h) diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h index cfe1121ef1089..1a3a1ff0108af 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h @@ -12,8 +12,8 @@ /// \file GPUReconstructionHIPIncludesSystem.h /// \author David Rohr -#ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDES_H -#define O2_GPU_RECONSTRUCTIONHIPINCLUDES_H +#ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDESSYSTEM_H +#define O2_GPU_RECONSTRUCTIONHIPINCLUDESSYSTEM_H #include #include @@ -25,4 +25,4 @@ #include #pragma GCC diagnostic pop -#endif +#endif // O2_GPU_RECONSTRUCTIONHIPINCLUDESSYSTEM_H diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 44a630fe19f48..f428d982394e0 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -129,7 +129,9 @@ set(HDRS_INSTALL DataTypes/GPUTriggerOutputs.h Debug/GPUROOTDump.h Definitions/GPUDefConstantsAndSettings.h - Definitions/GPUDefParametersDefault.h + Definitions/GPUDefParametersWrapper.h + Definitions/GPUDefParametersConstants.h + Definitions/GPUDefParametersDefaults.h Definitions/GPUDef.h Definitions/GPUDefMacros.h Definitions/GPULogging.h @@ -234,7 +236,7 @@ set(TEMPLATE_HEADER_LIST Base/GPUReconstructionKernelList.template.h Base/GPUReconstructionKernelIncludes.template.h Base/GPUReconstructionIncludesDeviceAll.template.h cmake/GPUNoFastMathKernels.template.h - Definitions/GPUDefParameters.template.h + Definitions/GPUDefParametersRuntime.template.h Definitions/GPUDefParametersLoad.template.inc) set(GENERATED_HEADERS_LIST "") @@ -258,7 +260,7 @@ add_custom_command( ) list(APPEND GENERATED_HEADERS_LIST ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParametersLoadPrepare.h) -set(HDRS_INSTALL ${HDRS_INSTALL} ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUReconstructionKernelList.h ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParameters.h ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParametersLoad.inc ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParametersLoadPrepare.h) +set(HDRS_INSTALL ${HDRS_INSTALL} ${GENERATED_HEADERS_LIST}) include(kernels.cmake) # Optional sources depending on optional dependencies diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx index 445c03113cd39..5dbbf63ca8264 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx @@ -258,6 +258,9 @@ GPUdii() void GPUTPCCompressionKernels::Thread(clusters->clusters[iSector][iRow])); } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) { diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h index b0bb8a6c12ecc..81817abf1e6d6 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h @@ -72,15 +72,19 @@ class GPUTPCCompressionGatherKernels : public GPUKernelTemplate using Vec64 = uint64_t; using Vec128 = uint4; - struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_buffered32)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_buffered128)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_multiBlock)); + struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { union { - uint32_t warpOffset[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)]; - Vec32 buf32[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE]; - Vec64 buf64[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE]; - Vec128 buf128[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE]; + uint32_t warpOffset[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)]; + Vec32 buf32[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE]; + Vec64 buf64[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE]; + Vec128 buf128[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE]; struct { - uint32_t sizes[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE]; - uint32_t srcOffsets[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE]; + uint32_t sizes[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE]; + uint32_t srcOffsets[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE]; } unbuffered; }; diff --git a/GPU/GPUTracking/Definitions/GPUDef.h b/GPU/GPUTracking/Definitions/GPUDef.h index 404f35f971c94..c77b9ce159306 100644 --- a/GPU/GPUTracking/Definitions/GPUDef.h +++ b/GPU/GPUTracking/Definitions/GPUDef.h @@ -18,7 +18,7 @@ #include "GPUCommonDef.h" #include "GPUDefConstantsAndSettings.h" -#include "GPUDefParametersDefault.h" +#include "GPUDefParametersWrapper.h" #include "GPUCommonRtypes.h" // Macros for masking ptrs in OpenCL kernel calls as uint64_t (The API only allows us to pass buffer objects) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h new file mode 100644 index 0000000000000..3a16d02ecf7c6 --- /dev/null +++ b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h @@ -0,0 +1,87 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUDefParametersConstants.h +/// \author David Rohr + +// This file contains compile-time constants, independent from the backend + +#ifndef GPUDEFPARAMETERSCONSTANTS_H +#define GPUDEFPARAMETERSCONSTANTS_H +// clang-format off + +#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix! + +#if defined(__CUDACC__) || defined(__HIPCC__) + #define GPUCA_SPECIALIZE_THRUST_SORTS +#endif + +#define GPUCA_MAX_THREADS 1024 +#define GPUCA_MAX_STREAMS 36 + +#if defined(GPUCA_GPUCODE) + #define GPUCA_SORT_STARTHITS // Sort the start hits when running on GPU +#endif + +#define GPUCA_ROWALIGNMENT 16 // Align of Row Hits and Grid +#define GPUCA_BUFFER_ALIGNMENT 64 // Alignment of buffers obtained from SetPointers +#define GPUCA_MEMALIGN (64 * 1024) // Alignment of allocated memory blocks + +// Default maximum numbers +#define GPUCA_MAX_CLUSTERS ((size_t) 1024 * 1024 * 1024) // Maximum number of TPC clusters +#define GPUCA_MAX_TRD_TRACKLETS ((size_t) 128 * 1024) // Maximum number of TRD tracklets +#define GPUCA_MAX_ITS_FIT_TRACKS ((size_t) 96 * 1024) // Max number of tracks for ITS track fit +#define GPUCA_MEMORY_SIZE ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Device +#define GPUCA_HOST_MEMORY_SIZE ((size_t) 1 * 1024 * 1024 * 1024) // Size of memory allocated on Host +#define GPUCA_GPU_STACK_SIZE ((size_t) 8 * 1024) // Stack size per GPU thread +#define GPUCA_GPU_HEAP_SIZE ((size_t) 16 * 1025 * 1024) // Stack size per GPU thread + +#ifdef GPUCA_GPUCODE + #ifndef GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6 + #endif + #ifndef GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12 + #endif + #ifndef GPUCA_ALTERNATE_BORDER_SORT + #define GPUCA_ALTERNATE_BORDER_SORT 0 + #endif + #ifndef GPUCA_SORT_BEFORE_FIT + #define GPUCA_SORT_BEFORE_FIT 0 + #endif + #ifndef GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0 + #endif + #ifndef GPUCA_COMP_GATHER_KERNEL + #define GPUCA_COMP_GATHER_KERNEL 0 + #endif + #ifndef GPUCA_COMP_GATHER_MODE + #define GPUCA_COMP_GATHER_MODE 2 + #endif +#else + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0 + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 0 + #define GPUCA_ALTERNATE_BORDER_SORT 0 + #define GPUCA_SORT_BEFORE_FIT 0 + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0 + #define GPUCA_THREAD_COUNT_FINDER 1 + #define GPUCA_COMP_GATHER_KERNEL 0 + #define GPUCA_COMP_GATHER_MODE 0 +#endif +#ifndef GPUCA_DEDX_STORAGE_TYPE + #define GPUCA_DEDX_STORAGE_TYPE float +#endif +#ifndef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE + #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float +#endif + +// clang-format on +#endif // GPUDEFPARAMETERSCONSTANTS_H diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h similarity index 75% rename from GPU/GPUTracking/Definitions/GPUDefParametersDefault.h rename to GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 44f3eb299d4c1..ce703e2ceba4a 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -9,28 +9,22 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file GPUDefParametersDefault.h +/// \file GPUDefParametersDefaults.h /// \author David Rohr -// This files contains compile-time constants affecting the GPU performance. -// Many of these constants are GPU-architecture specific. -// This file also contains all constants describing memory limitations, essentially limiting the total number of tracks, etc. -// Compile-time constants affecting the tracking algorithms / results are located in GPUDefConstantsAndSettings.h +// This file contains compile-time constants affecting the GPU performance. -#ifndef GPUDEFPARAMETERSDEFAULT_H -#define GPUDEFPARAMETERSDEFAULT_H +#if !defined(GPUDEFPARAMETERSDEFAULTS_H) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) // Avoid including for RTC generation besides normal include protection. +#define GPUDEFPARAMETERSDEFAULTS_H // clang-format off -#include "GPUCommonDef.h" -#include "GPUDefMacros.h" - // Launch bound definition, 3 optional parameters: maxThreads per block, minBlocks per multiprocessor, force number of blocks (not passed to compiler as launch bounds) // GPU Run Configuration #ifdef GPUCA_GPUCODE #if defined(GPUCA_GPUTYPE_MI2xx) #define GPUCA_WARP_SIZE 64 - #define GPUCA_THREAD_COUNT 256 + #define GPUCA_THREAD_COUNT_DEFAULT 256 #define GPUCA_LB_GPUTPCCreateTrackingData 256 #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1 #define GPUCA_LB_GPUTPCStartHitsFinder 1024 @@ -81,19 +75,9 @@ #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 - #define GPUCA_ALTERNATE_BORDER_SORT 1 - #define GPUCA_SORT_BEFORE_FIT 1 - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 - #define GPUCA_NO_ATOMIC_PRECHECK 1 - #define GPUCA_DEDX_STORAGE_TYPE uint16_t - #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half - #define GPUCA_COMP_GATHER_KERNEL 4 - #define GPUCA_COMP_GATHER_MODE 3 #elif defined(GPUCA_GPUTYPE_VEGA) #define GPUCA_WARP_SIZE 64 - #define GPUCA_THREAD_COUNT 256 + #define GPUCA_THREAD_COUNT_DEFAULT 256 #define GPUCA_LB_GPUTPCCreateTrackingData 128 #define GPUCA_LB_GPUTPCStartHitsSorter 1024, 2 #define GPUCA_LB_GPUTPCStartHitsFinder 1024 @@ -144,19 +128,9 @@ #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 512 #define GPUCA_LB_COMPRESSION_GATHER 1024 - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 - #define GPUCA_ALTERNATE_BORDER_SORT 1 - #define GPUCA_SORT_BEFORE_FIT 1 - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 - #define GPUCA_NO_ATOMIC_PRECHECK 1 - #define GPUCA_DEDX_STORAGE_TYPE uint16_t - #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half - #define GPUCA_COMP_GATHER_KERNEL 4 - #define GPUCA_COMP_GATHER_MODE 3 #elif defined(GPUCA_GPUTYPE_AMPERE) #define GPUCA_WARP_SIZE 32 - #define GPUCA_THREAD_COUNT 512 + #define GPUCA_THREAD_COUNT_DEFAULT 512 #define GPUCA_LB_GPUTPCCreateTrackingData 384 #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1 #define GPUCA_LB_GPUTPCStartHitsFinder 512 @@ -207,19 +181,9 @@ #define GPUCA_LB_GPUTPCCFDeconvolution 384 #define GPUCA_LB_GPUTPCCFClusterizer 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 - #define GPUCA_ALTERNATE_BORDER_SORT 1 - #define GPUCA_SORT_BEFORE_FIT 1 - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 - #define GPUCA_NO_ATOMIC_PRECHECK 1 - #define GPUCA_DEDX_STORAGE_TYPE uint16_t - #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half - #define GPUCA_COMP_GATHER_KERNEL 4 - #define GPUCA_COMP_GATHER_MODE 3 #elif defined(GPUCA_GPUTYPE_TURING) #define GPUCA_WARP_SIZE 32 - #define GPUCA_THREAD_COUNT 512 + #define GPUCA_THREAD_COUNT_DEFAULT 512 #define GPUCA_LB_GPUTPCCreateTrackingData 256 #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1 #define GPUCA_LB_GPUTPCStartHitsFinder 512 @@ -262,16 +226,6 @@ #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 32, 1 #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 32, 1 #define GPUCA_LB_COMPRESSION_GATHER 1024 - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 - #define GPUCA_ALTERNATE_BORDER_SORT 1 - #define GPUCA_SORT_BEFORE_FIT 1 - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 - #define GPUCA_NO_ATOMIC_PRECHECK 1 - #define GPUCA_COMP_GATHER_KERNEL 4 - #define GPUCA_COMP_GATHER_MODE 3 - #define GPUCA_DEDX_STORAGE_TYPE uint16_t - #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half // #define GPUCA_USE_TEXTURES #elif defined(GPUCA_GPUTYPE_OPENCL) #else @@ -281,8 +235,11 @@ #ifdef GPUCA_GPUCODE // Default settings for GPU, if not already set for selected GPU type - #ifndef GPUCA_THREAD_COUNT - #define GPUCA_THREAD_COUNT 256 + #ifndef GPUCA_WARP_SIZE + #define GPUCA_WARP_SIZE 32 + #endif + #ifndef GPUCA_THREAD_COUNT_DEFAULT + #define GPUCA_THREAD_COUNT_DEFAULT 256 #endif #ifndef GPUCA_LB_GPUTPCCreateTrackingData #define GPUCA_LB_GPUTPCCreateTrackingData 256 @@ -486,25 +443,21 @@ #define GPUCA_LB_GPUTrackingRefitKernel_mode1asTrackParCov 256 #endif #ifndef GPUCA_LB_GPUMemClean16 - #define GPUCA_LB_GPUMemClean16 GPUCA_THREAD_COUNT, 1 + #define GPUCA_LB_GPUMemClean16 GPUCA_THREAD_COUNT_DEFAULT, 1 #endif #ifndef GPUCA_LB_GPUitoa - #define GPUCA_LB_GPUitoa GPUCA_THREAD_COUNT, 1 + #define GPUCA_LB_GPUitoa GPUCA_THREAD_COUNT_DEFAULT, 1 #endif - #define GPUCA_GET_THREAD_COUNT(...) GPUCA_M_FIRST(__VA_ARGS__) - // These kernel launch-bounds are derrived from one of the constants set above #define GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression GPUCA_LB_GPUTPCCFNoiseSuppression #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression - #ifdef GPUCA_HAS_ONNX #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels - #endif #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_THREAD_COUNT_SCAN @@ -516,105 +469,7 @@ #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64 GPUCA_LB_COMPRESSION_GATHER #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered128 GPUCA_LB_COMPRESSION_GATHER #define GPUCA_LB_GPUTPCCompressionGatherKernels_multiBlock GPUCA_LB_COMPRESSION_GATHER -#else - #define GPUCA_GET_THREAD_COUNT(...) 1 // On the host, a thread is a block, and we run 1 "device thread" per block. -#endif - -#define GPUCA_GET_WARP_COUNT(...) (GPUCA_GET_THREAD_COUNT(__VA_ARGS__) / GPUCA_WARP_SIZE) - -#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix! - -#if defined(__CUDACC__) || defined(__HIPCC__) - #define GPUCA_SPECIALIZE_THRUST_SORTS -#endif - -#ifndef GPUCA_NEIGHBORSFINDER_REGS - #define GPUCA_NEIGHBORSFINDER_REGS NONE, 0 -#endif -#ifdef GPUCA_GPUCODE - #ifndef GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6 - #endif - #ifndef GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12 - #endif - #ifndef GPUCA_ALTERNATE_BORDER_SORT - #define GPUCA_ALTERNATE_BORDER_SORT 0 - #endif - #ifndef GPUCA_SORT_BEFORE_FIT - #define GPUCA_SORT_BEFORE_FIT 0 - #endif - #ifndef GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0 - #endif - #ifndef GPUCA_COMP_GATHER_KERNEL - #define GPUCA_COMP_GATHER_KERNEL 0 - #endif - #ifndef GPUCA_COMP_GATHER_MODE - #define GPUCA_COMP_GATHER_MODE 2 - #endif -#else - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0 - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 0 - #define GPUCA_ALTERNATE_BORDER_SORT 0 - #define GPUCA_SORT_BEFORE_FIT 0 - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0 - #define GPUCA_THREAD_COUNT_FINDER 1 - #define GPUCA_COMP_GATHER_KERNEL 0 - #define GPUCA_COMP_GATHER_MODE 0 -#endif -#ifndef GPUCA_DEDX_STORAGE_TYPE - #define GPUCA_DEDX_STORAGE_TYPE float -#endif -#ifndef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE - #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float -#endif -#define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_MERGER_INTERPOLATION_ERROR_TYPE) -#define GPUCA_DEDX_STORAGE_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_DEDX_STORAGE_TYPE) - -#ifndef GPUCA_WARP_SIZE - #ifdef GPUCA_GPUCODE - #define GPUCA_WARP_SIZE 32 - #else - #define GPUCA_WARP_SIZE 1 - #endif -#endif - -#define GPUCA_MAX_THREADS 1024 -#define GPUCA_MAX_STREAMS 36 - -#define GPUCA_SORT_STARTHITS_GPU // Sort the start hits when running on GPU -#define GPUCA_ROWALIGNMENT 16 // Align of Row Hits and Grid -#define GPUCA_BUFFER_ALIGNMENT 64 // Alignment of buffers obtained from SetPointers -#define GPUCA_MEMALIGN (64 * 1024) // Alignment of allocated memory blocks - -// #define GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE // Output Profiling Data for Tracklet Constructor Tracklet Scheduling - -// Default maximum numbers -#define GPUCA_MAX_CLUSTERS ((size_t) 1024 * 1024 * 1024) // Maximum number of TPC clusters -#define GPUCA_MAX_TRD_TRACKLETS ((size_t) 128 * 1024) // Maximum number of TRD tracklets -#define GPUCA_MAX_ITS_FIT_TRACKS ((size_t) 96 * 1024) // Max number of tracks for ITS track fit -#define GPUCA_MEMORY_SIZE ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Device -#define GPUCA_HOST_MEMORY_SIZE ((size_t) 1 * 1024 * 1024 * 1024) // Size of memory allocated on Host -#define GPUCA_GPU_STACK_SIZE ((size_t) 8 * 1024) // Stack size per GPU thread -#define GPUCA_GPU_HEAP_SIZE ((size_t) 16 * 1025 * 1024) // Stack size per GPU thread - -// #define GPUCA_KERNEL_DEBUGGER_OUTPUT - -// Some assertions to make sure the parameters are not invalid -#if defined(GPUCA_GPUCODE) - static_assert(GPUCA_MAXN >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP, "Invalid GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP"); - static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE, "Invalid GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE"); - static_assert(GPUCA_M_FIRST(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE, "Invalid GPUCA_TPC_COMP_CHUNK_SIZE"); -#endif - -// Derived parameters -#ifdef GPUCA_USE_TEXTURES - #define GPUCA_TEXTURE_FETCH_CONSTRUCTOR // Fetch data through texture cache -#endif -#if defined(GPUCA_SORT_STARTHITS_GPU) && defined(GPUCA_GPUCODE) - #define GPUCA_SORT_STARTHITS #endif // clang-format on -#endif // GPUDEFPARAMETERSDEFAULT_H +#endif // GPUDEFPARAMETERSDEFAULTS_H diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc index c17244572ee0c..938cedbdacc93 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc +++ b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc @@ -18,7 +18,7 @@ #define GPUCA_M_LB_EMPTY_1(...) __VA_ARGS__ #define GPUCA_M_LB_EMPTY0(...) GPUCA_M_CAT(GPUCA_M_LB_EMPTY_, __VA_OPT__(1))(__VA_ARGS__) -#include "GPUDefParameters.h" +#include "GPUDefParametersRuntime.h" #include "GPUDefMacros.h" #include #include @@ -37,17 +37,17 @@ static GPUDefParameters GPUDefParametersLoad() }; } -#define GPUCA_EXPORT_KERNEL(name) \ - if (par.par_LB_maxThreads[i] > 0) { \ - o << "#define GPUCA_" << (forRTC ? "RTC_" : "") << "LB_" << GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \ - if (par.par_LB_minBlocks[i] > 0) { \ - o << ", " << par.par_LB_minBlocks[i]; \ - } \ - if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \ - o << ", " << par.par_LB_forceBlocks[i]; \ - } \ - o << "\n"; \ - } \ +#define GPUCA_EXPORT_KERNEL(name) \ + if (par.par_LB_maxThreads[i] > 0) { \ + o << "#define GPUCA_LB_" << GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \ + if (par.par_LB_minBlocks[i] > 0) { \ + o << ", " << par.par_LB_minBlocks[i]; \ + } \ + if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \ + o << ", " << par.par_LB_forceBlocks[i]; \ + } \ + o << "\n"; \ + } \ i++; static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC) diff --git a/GPU/GPUTracking/Definitions/GPUDefParameters.template.h b/GPU/GPUTracking/Definitions/GPUDefParametersRuntime.template.h similarity index 87% rename from GPU/GPUTracking/Definitions/GPUDefParameters.template.h rename to GPU/GPUTracking/Definitions/GPUDefParametersRuntime.template.h index 731cb76b89193..f3537c058a824 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParameters.template.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersRuntime.template.h @@ -9,11 +9,11 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file GPUDefParameters.h +/// \file GPUDefParametersRuntime.h /// \author David Rohr -#ifndef GPUDEFPARAMETERS_H -#define GPUDEFPARAMETERS_H +#ifndef GPUDEFPARAMETERSRUNTIME_H +#define GPUDEFPARAMETERSRUNTIME_H namespace o2::gpu { @@ -24,4 +24,4 @@ struct GPUDefParameters { // clang-format off }; // clang-format on } // namespace o2::gpu -#endif +#endif // GPUDEFPARAMETERSRUNTIME_H diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersWrapper.h b/GPU/GPUTracking/Definitions/GPUDefParametersWrapper.h new file mode 100644 index 0000000000000..8d8815d8a8044 --- /dev/null +++ b/GPU/GPUTracking/Definitions/GPUDefParametersWrapper.h @@ -0,0 +1,104 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUDefParametersWrapper.h +/// \author David Rohr + +// Wrapper file to load all compile-time parameters (architecture / rtc - dependent ones, and constant ones) +// Compile-time constants affecting the tracking algorithms / results are located in GPUDefConstantsAndSettings.h + +#ifndef GPUDEFPARAMETERSWRAPPER_H +#define GPUDEFPARAMETERSWRAPPER_H +// clang-format off + +#include "GPUCommonDef.h" +#include "GPUDefMacros.h" + +#ifdef GPUCA_GPUCODE +#if defined(GPUCA_GPUTYPE_MI2xx) + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 + #define GPUCA_ALTERNATE_BORDER_SORT 1 + #define GPUCA_SORT_BEFORE_FIT 1 + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 + #define GPUCA_NO_ATOMIC_PRECHECK 1 + #define GPUCA_DEDX_STORAGE_TYPE uint16_t + #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half + #define GPUCA_COMP_GATHER_KERNEL 4 + #define GPUCA_COMP_GATHER_MODE 3 +#elif defined(GPUCA_GPUTYPE_VEGA) + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 + #define GPUCA_ALTERNATE_BORDER_SORT 1 + #define GPUCA_SORT_BEFORE_FIT 1 + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 + #define GPUCA_NO_ATOMIC_PRECHECK 1 + #define GPUCA_DEDX_STORAGE_TYPE uint16_t + #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half + #define GPUCA_COMP_GATHER_KERNEL 4 + #define GPUCA_COMP_GATHER_MODE 3 +#elif defined(GPUCA_GPUTYPE_AMPERE) + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 + #define GPUCA_ALTERNATE_BORDER_SORT 1 + #define GPUCA_SORT_BEFORE_FIT 1 + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 + #define GPUCA_NO_ATOMIC_PRECHECK 1 + #define GPUCA_DEDX_STORAGE_TYPE uint16_t + #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half + #define GPUCA_COMP_GATHER_KERNEL 4 + #define GPUCA_COMP_GATHER_MODE 3 +#elif defined(GPUCA_GPUTYPE_TURING) + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 + #define GPUCA_ALTERNATE_BORDER_SORT 1 + #define GPUCA_SORT_BEFORE_FIT 1 + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1 + #define GPUCA_NO_ATOMIC_PRECHECK 1 + #define GPUCA_COMP_GATHER_KERNEL 4 + #define GPUCA_COMP_GATHER_MODE 3 + #define GPUCA_DEDX_STORAGE_TYPE uint16_t + #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half +#endif +#endif + +#ifdef GPUCA_GPUCODE +#include "GPUDefParametersDefaults.h" +#endif +#include "GPUDefParametersConstants.h" + +namespace o2::gpu +{ +#if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) + GPUhdi() static constexpr uint32_t GPUCA_GET_THREAD_COUNT(uint32_t val, ...) { return val; } + GPUhdi() static constexpr uint32_t GPUCA_GET_WARP_COUNT(uint32_t val, ...) { return val / GPUCA_WARP_SIZE; } +#else + static constexpr uint32_t GPUCA_WARP_SIZE = 1; // On the host, a thread is a block is a warp, and we run 1 "device thread" per block. + #define GPUCA_GET_THREAD_COUNT(...) 1 // This must be a define not a constexpr function + #define GPUCA_GET_WARP_COUNT(...) 1 // since launch bound constants are not defined in host-code, and must evaluate to 1! +#endif + +#define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_MERGER_INTERPOLATION_ERROR_TYPE) +#define GPUCA_DEDX_STORAGE_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_DEDX_STORAGE_TYPE) + +// #define GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE // Output Profiling Data for Tracklet Constructor Tracklet Scheduling + +// #define GPUCA_KERNEL_DEBUGGER_OUTPUT + +// Derived parameters +#ifdef GPUCA_USE_TEXTURES + #define GPUCA_TEXTURE_FETCH_CONSTRUCTOR // Fetch data through texture cache +#endif + +} // namespace o2::gpu + +// clang-format on +#endif // GPUDEFPARAMETERSWRAPPER_H diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h index 41b5eb8a4ffb8..1bf5000cfbe5c 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h +++ b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h @@ -41,6 +41,7 @@ class GPUTPCNeighboursFinder : public GPUKernelTemplate int32_t mIRowUp; // next row number int32_t mIRowDn; // previous row number #if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0 + static_assert(GPUCA_MAXN >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP); float mA1[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)]; float mA2[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)]; calink mB[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)]; diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h index 5009c672b030e..f487931bdaf4b 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h @@ -37,6 +37,7 @@ class GPUTPCTrackletSelector : public GPUKernelTemplate int32_t mNTracklets; // n of tracklets int32_t mReserved; // for alignment reasons #if GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0 + static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE); GPUTPCHitId mHits[GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCTrackletSelector)]; #endif // GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0 }; diff --git a/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C b/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C index 4a72b0cef31a3..785c049816252 100644 --- a/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C +++ b/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C @@ -16,14 +16,15 @@ // ROOT_INCLUDE_PATH="`pwd`/include" root -l -q -b src/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C'()' // Logic for testing to load the default parameters -/*#define GPUCA_GPUCODE +/* #define GPUCA_GPUCODE #define GPUCA_GPUTYPE_AMPERE #define GPUCA_MAXN 40 #define GPUCA_ROW_COUNT 152 #define GPUCA_TPC_COMP_CHUNK_SIZE 1024 -#include "GPUDefParametersDefault.h"*/ +#include "GPUDefParametersConstants.h" +#include "GPUDefParametersDefaults.h" */ -// Load file that sets GPUDefParameters +// Alternatively, logic to load file that sets GPUDefParameters #include "testParam.h" #include "GPUDefParametersLoad.inc" diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h index f5d8f533df651..71236bc317443 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h @@ -34,7 +34,9 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate noiseSuppression = 0, updatePeaks = 1, }; - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression); + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks)); + struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_NOISE_N]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h index 25d3588be6d17..a72907fe55e89 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h @@ -37,6 +37,13 @@ class GPUTPCCFStreamCompaction : public GPUKernelTemplate struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { }; +#if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) + static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanStart)); + static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanUp)); + static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanTop)); + static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanDown)); + static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits)); +#endif typedef GPUTPCClusterFinder processorType; GPUhdi() static processorType* Processor(GPUConstantMem& processors) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index e6c1dc508d6e4..a1d641fdb0b93 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -37,7 +37,13 @@ class MCLabelAccumulator; class GPUTPCNNClusterizerKernels : public GPUKernelTemplate { public: - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels); + // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); + static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); struct GPUSharedMemory { // Regular cluster finder ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE];