diff --git a/GPU/Common/GPUCommonDef.h b/GPU/Common/GPUCommonDef.h index 78da104a0c029..d7e99f53d4ce8 100644 --- a/GPU/Common/GPUCommonDef.h +++ b/GPU/Common/GPUCommonDef.h @@ -58,7 +58,7 @@ #if (defined(__CUDACC__) && defined(GPUCA_CUDA_NO_CONSTANT_MEMORY)) || (defined(__HIPCC__) && defined(GPUCA_HIP_NO_CONSTANT_MEMORY)) || (defined(__OPENCL__) && defined(GPUCA_OPENCL_NO_CONSTANT_MEMORY)) #define GPUCA_NO_CONSTANT_MEMORY -#elif defined(__CUDACC__) || defined(__HIPCC__) +#elif (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(GPUCA_GPUCODE_HOSTONLY) #define GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM #endif diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index e0b06f0a3ea55..532c270431d99 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -95,7 +95,7 @@ union GPUConstantMemCopyable { static constexpr size_t gGPUConstantMemBufferSize = (sizeof(GPUConstantMem) + sizeof(uint4) - 1); #endif } // namespace o2::gpu -#if defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) && !defined(GPUCA_GPUCODE_HOSTONLY) +#if defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) GPUconstant() o2::gpu::GPUConstantMemCopyable gGPUConstantMemBuffer; // TODO: This should go into o2::gpu namespace, but then CUDA or HIP would not find the symbol #endif // GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM namespace o2::gpu @@ -104,7 +104,7 @@ namespace o2::gpu // Must be placed here, to avoid circular header dependency GPUdi() GPUconstantref() const GPUConstantMem* GPUProcessor::GetConstantMem() const { -#if defined(GPUCA_GPUCODE_DEVICE) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) && !defined(GPUCA_GPUCODE_HOSTONLY) +#if defined(GPUCA_GPUCODE_DEVICE) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) return &GPUCA_CONSMEM; #else return mConstantMem; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index a8a83fdbd9203..d714c6833d18d 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -16,7 +16,7 @@ #include "GPUReconstructionIncludes.h" #include "GPUReconstructionThreading.h" #include "GPUChain.h" - +#include "GPUDefParameters.h" #include "GPUTPCClusterData.h" #include "GPUTPCSectorOutCluster.h" #include "GPUTPCGMMergedTrack.h" @@ -120,15 +120,27 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs -krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend() +template +gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) { - return krnlProperties{1, 1}; + if (gpu == -1) { + gpu = IsGPU(); + } + const auto num = GetKernelNum(); + const auto* p = gpu ? mParDevice : mParCPU; + gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]}; + if (ret.nThreads == 0) { + ret.nThreads = gpu ? mThreadCount : 1u; + } + if (ret.minBlocks == 0) { + ret.minBlocks = 1; + } + return ret; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \ +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ template void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \ - template krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend(); + template krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index fd999ec2304e1..163b00c804d7f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -38,8 +38,6 @@ class GPUReconstructionCPUBackend : public GPUReconstructionProcessing void runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args); template void runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args); - template - gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); }; class GPUReconstructionCPU : public GPUReconstructionKernels @@ -55,10 +53,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels void runKernel(krnlSetup&& setup, Args&&... args); template - const gpu_reconstruction_kernels::krnlProperties getKernelProperties() - { - return getKernelPropertiesImpl(gpu_reconstruction_kernels::classArgument()); - } + gpu_reconstruction_kernels::krnlProperties getKernelProperties(int gpu = -1); virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false); int32_t GPUStuck() { return mGPUStuck; } @@ -77,13 +72,15 @@ class GPUReconstructionCPU : public GPUReconstructionKernels, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \ { \ + krnlSetupArgs args(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward)); \ + const uint32_t num = GetKernelNum(); \ if (cpuFallback) { \ - GPUReconstructionCPU::runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \ + GPUReconstructionCPU::runKernelImpl(num, &args); \ } else { \ - runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \ + runKernelImpl(num, &args); \ } \ } #include "GPUReconstructionKernelList.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h index 0b1a501ebc094..b3f6c6ec817fd 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h @@ -38,21 +38,18 @@ #ifndef GPUCA_KRNL_CUSTOM #define GPUCA_KRNL_CUSTOM(...) #endif -#define GPUCA_KRNL_REG_EXTRREG(...) GPUCA_M_STRIP(__VA_ARGS__) -#define GPUCA_KRNL_CUSTOM_EXTRREG(MODE, ...) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__) -#define GPUCA_KRNL_NONE_EXTRREG(MODE, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__) -#define GPUCA_ATTRRES_REG(MODE, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, MODE))(num) GPUCA_ATTRRES_XREG (MODE, __VA_ARGS__) -#define GPUCA_ATTRRES_CUSTOM(MODE, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, MODE))(args) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__) -#define GPUCA_ATTRRES_NONE(MODE, none, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__) -#define GPUCA_ATTRRES_(MODE, ...) -#define GPUCA_ATTRRES_XNONE(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__) -#define GPUCA_ATTRRES_XCUSTOM(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__) -#define GPUCA_ATTRRES_XREG(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__) -#define GPUCA_ATTRRES(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__) +#define GPUCA_ATTRRES_REG(reg, num, ...) GPUCA_M_EXPAND(GPUCA_KRNL_REG)(num) GPUCA_ATTRRES_XREG (__VA_ARGS__) +#define GPUCA_ATTRRES_CUSTOM(custom, args, ...) GPUCA_M_EXPAND(GPUCA_KRNL_CUSTOM)(args) GPUCA_ATTRRES_XCUSTOM(__VA_ARGS__) +#define GPUCA_ATTRRES_NONE(none, ...) GPUCA_ATTRRES_XNONE(__VA_ARGS__) +#define GPUCA_ATTRRES_(...) +#define GPUCA_ATTRRES_XNONE(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__) +#define GPUCA_ATTRRES_XCUSTOM(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__) +#define GPUCA_ATTRRES_XREG(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__) +#define GPUCA_ATTRRES(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__) // GPU Kernel entry point #define GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, ...) \ - GPUg() void GPUCA_ATTRRES(, GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments)) + GPUg() void GPUCA_ATTRRES(GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments)) #ifdef GPUCA_KRNL_DEFONLY #define GPUCA_KRNLGPU(...) GPUCA_KRNLGPU_DEF(__VA_ARGS__); @@ -79,12 +76,6 @@ } \ }; -#define GPUCA_KRNL_PROP(x_class, x_attributes) \ - template <> gpu_reconstruction_kernels::krnlProperties GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::getKernelPropertiesBackend() { \ - gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_EXTRREG, GPUCA_M_STRIP(x_attributes))}; \ - return ret.nThreads > 0 ? ret : gpu_reconstruction_kernels::krnlProperties{(int32_t)mThreadCount}; \ - } - #endif // GPUCA_GPUCODE #define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__) diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernels.h b/GPU/GPUTracking/Base/GPUReconstructionKernels.h index b8f3e3746c743..7f500d471de1f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernels.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernels.h @@ -95,17 +95,19 @@ class GPUReconstructionKernels : public T template using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs; -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \ - virtual void runKernelImpl(const krnlSetupArgs& args) \ - { \ - T::template runKernelBackend(args); \ - } \ - virtual gpu_reconstruction_kernels::krnlProperties getKernelPropertiesImpl(gpu_reconstruction_kernels::classArgument) \ - { \ - return T::template getKernelPropertiesBackend(); \ - } + virtual void runKernelImpl(const int num, const void* args) + { + switch (num) { // clang-format off +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ + case x_num: { \ + const auto& args2 = *(const krnlSetupArgs*)args; \ + T::template runKernelBackend(args2); \ + break; \ + } #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL + } // clang-format on + } }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx index bae95ac8d3f38..95a47dec946e6 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx @@ -12,11 +12,35 @@ /// \file GPUReconstructionProcessing.cxx /// \author David Rohr +#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS +#include "GPUDefParametersDefault.h" +#include "GPUDefParametersLoad.inc" + #include "GPUReconstructionProcessing.h" #include "GPUReconstructionThreading.h" using namespace o2::gpu; +GPUReconstructionProcessing::GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) +{ + if (mMaster == nullptr) { + mParCPU = new GPUDefParameters(o2::gpu::internal::GPUDefParametersLoad()); + mParDevice = new GPUDefParameters(); + } else { + GPUReconstructionProcessing* master = dynamic_cast(mMaster); + mParCPU = master->mParCPU; + mParDevice = master->mParDevice; + } +} + +GPUReconstructionProcessing::~GPUReconstructionProcessing() +{ + if (mMaster == nullptr) { + delete mParCPU; + delete mParDevice; + } +} + int32_t GPUReconstructionProcessing::getNKernelHostThreads(bool splitCores) { int32_t nThreads = 0; @@ -119,38 +143,22 @@ std::unique_ptr GPUReconstructionProc gpu_reconstruction_kernels::threadContext::threadContext() = default; gpu_reconstruction_kernels::threadContext::~threadContext() = default; -template -uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k) -{ - static int32_t num = k; - if (num < 0) { - throw std::runtime_error("Internal Error - Kernel Number not Set"); - } - return num; -} - -namespace o2::gpu::internal -{ -static std::vector initKernelNames() -{ - std::vector retVal; -#define GPUCA_KRNL(x_class, ...) \ - GPUReconstructionProcessing::GetKernelNum(retVal.size()); \ - retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class))); +const std::vector GPUReconstructionProcessing::mKernelNames = { +#define GPUCA_KRNL(x_class, ...) GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)), #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL - return retVal; -} -} // namespace o2::gpu::internal - -const std::vector GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames(); - -#define GPUCA_KRNL(x_class, ...) \ - template uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t); \ - template <> \ - const char* GPUReconstructionProcessing::GetKernelName() \ - { \ - return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \ +}; + +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ + template <> \ + uint32_t GPUReconstructionProcessing::GetKernelNum() \ + { \ + return x_num; \ + } \ + template <> \ + const char* GPUReconstructionProcessing::GetKernelName() \ + { \ + return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \ } #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index b0466efceac24..2428027118c0a 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -25,6 +25,8 @@ namespace o2::gpu { +struct GPUDefParameters; + namespace gpu_reconstruction_kernels { struct deviceEvent { @@ -63,7 +65,7 @@ class threadContext class GPUReconstructionProcessing : public GPUReconstruction { public: - ~GPUReconstructionProcessing() override = default; + ~GPUReconstructionProcessing() override; // Threading int32_t getNKernelHostThreads(bool splitCores); @@ -78,7 +80,7 @@ class GPUReconstructionProcessing : public GPUReconstruction static const char* GetKernelName(); const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; } template - static uint32_t GetKernelNum(int32_t k = -1); + static uint32_t GetKernelNum(); // Public queries for timers auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; } @@ -101,7 +103,7 @@ class GPUReconstructionProcessing : public GPUReconstruction }; protected: - GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {} + GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg); using deviceEvent = gpu_reconstruction_kernels::deviceEvent; static const std::vector mKernelNames; @@ -132,6 +134,9 @@ class GPUReconstructionProcessing : public GPUReconstruction template HighResTimer& getTimer(const char* name, int32_t num = -1); + GPUDefParameters* mParCPU = nullptr; + GPUDefParameters* mParDevice = nullptr; + private: uint32_t getNextTimerId(); timerMeta* getTimerById(uint32_t id, bool increment = true); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index f87d5c8189cdc..8790d7718f517 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -13,8 +13,13 @@ /// \author David Rohr #define GPUCA_GPUCODE_HOSTONLY -#include "GPUReconstructionCUDAIncludesHost.h" +#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS +#include "GPUReconstructionCUDADef.h" +#include "GPUDefParametersDefault.h" +#include "GPUDefParametersLoad.inc" + +#include "GPUReconstructionCUDAIncludesHost.h" #include #include "GPUReconstructionCUDA.h" @@ -51,11 +56,14 @@ GPUReconstructionCUDABackend::GPUReconstructionCUDABackend(const GPUSettingsDevi { if (mMaster == nullptr) { mInternals = new GPUReconstructionCUDAInternals; + *mParDevice = o2::gpu::internal::GPUDefParametersLoad(); } + mDeviceBackendSettings.deviceType = DeviceType::CUDA; } GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend() { + Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit CUDA if (mMaster == nullptr) { delete mInternals; } @@ -69,7 +77,6 @@ int32_t GPUReconstructionCUDABackend::GPUChkErrInternal(const int64_t error, con GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) { - mDeviceBackendSettings.deviceType = DeviceType::CUDA; #ifndef __HIPCC__ // CUDA mRtcSrcExtension = ".cu"; mRtcBinExtension = ".fatbin"; @@ -78,11 +85,7 @@ GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg mRtcBinExtension = ".o"; #endif } - -GPUReconstructionCUDA::~GPUReconstructionCUDA() -{ - Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit CUDA -} +GPUReconstructionCUDA::~GPUReconstructionCUDA() {} GPUReconstruction* GPUReconstruction_Create_CUDA(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionCUDA(cfg); } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index a98b14a873ca0..ac5920f769f25 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -44,8 +44,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase void runKernelBackend(const krnlSetupArgs& args); template void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - template - gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); void getRTCKernelCalls(std::vector& kernels); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 4b3f8a767226c..ac79dd7576e48 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -83,17 +83,14 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args); #else // ---------- COMPILE_MODE = onefile | rdc ---------- #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc #endif -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \ - GPUCA_KRNL_PROP(x_class, x_attributes) \ - GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types) \ +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ template void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args); #ifndef __HIPCC__ // CUDA version diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index e724f0f2cbfcd..7310b8b6041a9 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -12,7 +12,10 @@ /// \file GPUReconstructionOCL.cxx /// \author David Rohr +#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS #include "GPUReconstructionOCLIncludesHost.h" +#include "GPUDefParametersDefault.h" +#include "GPUDefParametersLoad.inc" #include @@ -36,6 +39,7 @@ GPUReconstructionOCLBackend::GPUReconstructionOCLBackend(const GPUSettingsDevice { if (mMaster == nullptr) { mInternals = new GPUReconstructionOCLInternals; + *mParDevice = o2::gpu::internal::GPUDefParametersLoad(); } mDeviceBackendSettings.deviceType = DeviceType::OCL; } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index 29951cd43f167..16ef9b5e87fe8 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -58,8 +58,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase int32_t AddKernel(); template void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - template - gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); GPUReconstructionOCLInternals* mInternals; float mOclVersion; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx index fff69038c056f..cca634fba65fc 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -91,8 +91,6 @@ int32_t GPUReconstructionOCLBackend::AddKernels() return 0; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \ - GPUCA_KRNL_PROP(x_class, x_attributes) \ - template void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 39218e9f94527..33715909e810c 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -129,7 +129,7 @@ set(HDRS_INSTALL DataTypes/GPUTriggerOutputs.h Debug/GPUROOTDump.h Definitions/GPUDefConstantsAndSettings.h - Definitions/GPUDefGPUParameters.h + Definitions/GPUDefParametersDefault.h Definitions/GPUDef.h Definitions/GPUDefMacros.h Definitions/GPULogging.h @@ -247,10 +247,18 @@ file(GENERATE OUTPUT include_gpu_onthefly/GPUNoFastMathKernels.h INPUT cmake/GPUNoFastMathKernels.template.h ) +file(GENERATE + OUTPUT include_gpu_onthefly/GPUDefParameters.h + INPUT Definitions/GPUDefParameters.template.h +) +file(GENERATE + OUTPUT include_gpu_onthefly/GPUDefParametersLoad.inc + INPUT Definitions/GPUDefParametersLoad.template.inc +) if(NOT ALIGPU_BUILD_TYPE STREQUAL "O2") include_directories(${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly) endif() -set(HDRS_INSTALL ${HDRS_INSTALL} ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUReconstructionKernelList.h) +set(HDRS_INSTALL ${HDRS_INSTALL} ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUReconstructionKernelList.h ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParameters.h) include(kernels.cmake) # Optional sources depending on optional dependencies diff --git a/GPU/GPUTracking/Definitions/GPUDef.h b/GPU/GPUTracking/Definitions/GPUDef.h index f01e3e6d38332..404f35f971c94 100644 --- a/GPU/GPUTracking/Definitions/GPUDef.h +++ b/GPU/GPUTracking/Definitions/GPUDef.h @@ -18,7 +18,7 @@ #include "GPUCommonDef.h" #include "GPUDefConstantsAndSettings.h" -#include "GPUDefGPUParameters.h" +#include "GPUDefParametersDefault.h" #include "GPUCommonRtypes.h" // Macros for masking ptrs in OpenCL kernel calls as uint64_t (The API only allows us to pass buffer objects) diff --git a/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h b/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h index f18390629f2bc..2d7aca8d71b92 100644 --- a/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h +++ b/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h @@ -13,7 +13,7 @@ /// \author David Rohr // This files contains compile-time constants affecting the GPU algorithms / reconstruction results. -// Architecture-dependant compile-time constants affecting the performance without changing the results are stored in GPUDefGPUParameters.h +// Architecture-dependant compile-time constants affecting the performance without changing the results are stored in GPUDefParameters.h #ifndef GPUDEFCONSTANTSANDSETTINGS_H #define GPUDEFCONSTANTSANDSETTINGS_H @@ -66,7 +66,7 @@ #endif #endif -//#define GPUCA_MERGER_BY_MC_LABEL // Use MC labels for TPC track merging - for performance studies +//#define GPUCA_MERGER_BY_MC_LABEL // Use MC labels for TPC track merging - for performance studies // TODO: Cleanup unneeded options //#define GPUCA_FULL_CLUSTERDATA // Store all cluster information in the cluster data, also those not needed for tracking. //#define GPUCA_TPC_RAW_PROPAGATE_PAD_ROW_TIME // Propagate Pad, Row, Time cluster information to GM //#define GPUCA_GM_USE_FULL_FIELD // Use offline magnetic field during GMPropagator prolongation diff --git a/GPU/GPUTracking/Definitions/GPUDefParameters.template.h b/GPU/GPUTracking/Definitions/GPUDefParameters.template.h new file mode 100644 index 0000000000000..731cb76b89193 --- /dev/null +++ b/GPU/GPUTracking/Definitions/GPUDefParameters.template.h @@ -0,0 +1,27 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUDefParameters.h +/// \author David Rohr + +#ifndef GPUDEFPARAMETERS_H +#define GPUDEFPARAMETERS_H + +namespace o2::gpu +{ +struct GPUDefParameters { // clang-format off + int32_t par_LB_maxThreads[$>] = {}; + int32_t par_LB_minBlocks[$>] = {}; + int32_t par_LB_forceBlocks[$>] = {}; +}; // clang-format on +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h similarity index 69% rename from GPU/GPUTracking/Definitions/GPUDefGPUParameters.h rename to GPU/GPUTracking/Definitions/GPUDefParametersDefault.h index 910907368e891..1193731acd9cf 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file GPUDefGPUParameters.h +/// \file GPUDefParametersDefault.h /// \author David Rohr // This files contains compile-time constants affecting the GPU performance. @@ -17,14 +17,10 @@ // This file also contains all constants describing memory limitations, essentially limiting the total number of tracks, etc. // Compile-time constants affecting the tracking algorithms / results are located in GPUDefConstantsAndSettings.h -#ifndef GPUDEFGPUPARAMETERS_H -#define GPUDEFGPUPARAMETERS_H +#ifndef GPUDEFPARAMETERSDEFAULT_H +#define GPUDEFPARAMETERSDEFAULT_H // clang-format off -#ifndef GPUDEF_H -#error Please include GPUDef.h -#endif - #include "GPUCommonDef.h" #include "GPUDefMacros.h" @@ -282,7 +278,7 @@ #endif // GPUCA_GPUCODE #ifdef GPUCA_GPUCODE - // Default settings, if not already set for selected GPU type + // Default settings for GPU, if not already set for selected GPU type #ifndef GPUCA_THREAD_COUNT #define GPUCA_THREAD_COUNT 256 #endif @@ -334,10 +330,10 @@ #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow #define GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow 256 #endif - #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters + #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters #define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 256 #endif - #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters + #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters #define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 256 #endif #ifndef GPUCA_LB_GPUTPCCFDecodeZS @@ -487,10 +483,15 @@ #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode1asTrackParCov #define GPUCA_LB_GPUTrackingRefitKernel_mode1asTrackParCov 256 #endif + #ifndef GPUCA_LB_GPUMemClean16 + #define GPUCA_LB_GPUMemClean16 GPUCA_THREAD_COUNT, 1 + #endif + #ifndef GPUCA_LB_GPUitoa + #define GPUCA_LB_GPUitoa GPUCA_THREAD_COUNT, 1 + #endif #define GPUCA_GET_THREAD_COUNT(...) GPUCA_M_FIRST(__VA_ARGS__) #else - // The following defaults are needed to compile the host code - #define GPUCA_GET_THREAD_COUNT(...) 1 + #define GPUCA_GET_THREAD_COUNT(...) 1 // On the host, a thread is a block, and we run 1 "device thread" per block. #endif #define GPUCA_GET_WARP_COUNT(...) (GPUCA_GET_THREAD_COUNT(__VA_ARGS__) / GPUCA_WARP_SIZE) @@ -514,8 +515,6 @@ #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanDown GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits GPUCA_THREAD_COUNT_SCAN -#define GPUCA_LB_GPUTPCTrackletConstructor_singleSector GPUCA_LB_GPUTPCTrackletConstructor -#define GPUCA_LB_GPUTPCTrackletConstructor_allSectors GPUCA_LB_GPUTPCTrackletConstructor #define GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered GPUCA_LB_COMPRESSION_GATHER #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered32 GPUCA_LB_COMPRESSION_GATHER #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64 GPUCA_LB_COMPRESSION_GATHER @@ -523,33 +522,33 @@ #define GPUCA_LB_GPUTPCCompressionGatherKernels_multiBlock GPUCA_LB_COMPRESSION_GATHER #if defined(__CUDACC__) || defined(__HIPCC__) -#define GPUCA_SPECIALIZE_THRUST_SORTS + #define GPUCA_SPECIALIZE_THRUST_SORTS #endif #ifndef GPUCA_NEIGHBORSFINDER_REGS -#define GPUCA_NEIGHBORSFINDER_REGS NONE, 0 + #define GPUCA_NEIGHBORSFINDER_REGS NONE, 0 #endif #ifdef GPUCA_GPUCODE #ifndef GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP - #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6 + #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6 #endif #ifndef GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE - #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12 + #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12 #endif #ifndef GPUCA_ALTERNATE_BORDER_SORT - #define GPUCA_ALTERNATE_BORDER_SORT 0 + #define GPUCA_ALTERNATE_BORDER_SORT 0 #endif #ifndef GPUCA_SORT_BEFORE_FIT - #define GPUCA_SORT_BEFORE_FIT 0 + #define GPUCA_SORT_BEFORE_FIT 0 #endif #ifndef GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION - #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0 + #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0 #endif #ifndef GPUCA_COMP_GATHER_KERNEL - #define GPUCA_COMP_GATHER_KERNEL 0 + #define GPUCA_COMP_GATHER_KERNEL 0 #endif #ifndef GPUCA_COMP_GATHER_MODE - #define GPUCA_COMP_GATHER_MODE 2 + #define GPUCA_COMP_GATHER_MODE 2 #endif #else #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0 @@ -562,20 +561,20 @@ #define GPUCA_COMP_GATHER_MODE 0 #endif #ifndef GPUCA_DEDX_STORAGE_TYPE -#define GPUCA_DEDX_STORAGE_TYPE float + #define GPUCA_DEDX_STORAGE_TYPE float #endif #ifndef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE -#define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float + #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float #endif #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_MERGER_INTERPOLATION_ERROR_TYPE) #define GPUCA_DEDX_STORAGE_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_DEDX_STORAGE_TYPE) #ifndef GPUCA_WARP_SIZE -#ifdef GPUCA_GPUCODE -#define GPUCA_WARP_SIZE 32 -#else -#define GPUCA_WARP_SIZE 1 -#endif + #ifdef GPUCA_GPUCODE + #define GPUCA_WARP_SIZE 32 + #else + #define GPUCA_WARP_SIZE 1 + #endif #endif #define GPUCA_MAX_THREADS 1024 @@ -602,10 +601,10 @@ // #define GPUCA_KERNEL_DEBUGGER_OUTPUT -// Some assertions to make sure out parameters are not invalid -static_assert(GPUCA_MAXN >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP, "Invalid GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP"); -static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE, "Invalid GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE"); -#ifdef GPUCA_GPUCODE +// Some assertions to make sure the parameters are not invalid +#if defined(GPUCA_GPUCODE) + static_assert(GPUCA_MAXN >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP, "Invalid GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP"); + static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE, "Invalid GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE"); static_assert(GPUCA_M_FIRST(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE, "Invalid GPUCA_TPC_COMP_CHUNK_SIZE"); #endif @@ -621,5 +620,270 @@ static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE, "Invalid #define GPUCA_NEW_ALIGNMENT (std::align_val_t{GPUCA_BUFFER_ALIGNMENT}) #define GPUCA_OPERATOR_NEW_ALIGNMENT ,GPUCA_NEW_ALIGNMENT +#ifdef GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS + // Invalid default values, must not be used, but needed for now to make the GPUDefParametersLoad() happy // TOCO: cleanup + #ifndef GPUCA_LB_GPUTPCCreateTrackingData + #define GPUCA_LB_GPUTPCCreateTrackingData 0 + #endif + #ifndef GPUCA_LB_GPUTPCTrackletConstructor + #define GPUCA_LB_GPUTPCTrackletConstructor 0 + #endif + #ifndef GPUCA_LB_GPUTPCTrackletSelector + #define GPUCA_LB_GPUTPCTrackletSelector 0 + #endif + #ifndef GPUCA_LB_GPUTPCNeighboursFinder + #define GPUCA_LB_GPUTPCNeighboursFinder 0 + #endif + #ifndef GPUCA_LB_GPUTPCNeighboursCleaner + #define GPUCA_LB_GPUTPCNeighboursCleaner 0 + #endif + #ifndef GPUCA_LB_GPUTPCExtrapolationTracking + #define GPUCA_LB_GPUTPCExtrapolationTracking 0 + #endif + #ifndef GPUCA_LB_GPUTRDTrackerKernels_gpuVersion + #define GPUCA_LB_GPUTRDTrackerKernels_gpuVersion 0 + #endif + #ifndef GPUCA_LB_GPUTPCCreateOccupancyMap_fill + #define GPUCA_LB_GPUTPCCreateOccupancyMap_fill 0 + #endif + #ifndef GPUCA_LB_GPUTPCCreateOccupancyMap_fold + #define GPUCA_LB_GPUTPCCreateOccupancyMap_fold 0 + #endif + #ifndef GPUCA_LB_GPUTRDTrackerKernels_o2Version + #define GPUCA_LB_GPUTRDTrackerKernels_o2Version 0 + #endif + #ifndef GPUCA_LB_GPUTPCConvertKernel + #define GPUCA_LB_GPUTPCConvertKernel 0 + #endif + #ifndef GPUCA_LB_GPUTPCCompressionKernels_step0attached + #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 0 + #endif + #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached + #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 0 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 0 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step1unattached + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 0 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow + #define GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow 0 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters + #define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 0 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters + #define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFDecodeZS + #define GPUCA_LB_GPUTPCCFDecodeZS 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFDecodeZSLink + #define GPUCA_LB_GPUTPCCFDecodeZSLink 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFDecodeZSDenseLink + #define GPUCA_LB_GPUTPCCFDecodeZSDenseLink 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFGather + #define GPUCA_LB_GPUTPCCFGather 0 + #endif + #ifndef GPUCA_LB_COMPRESSION_GATHER + #define GPUCA_LB_COMPRESSION_GATHER 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerTrackFit + #define GPUCA_LB_GPUTPCGMMergerTrackFit 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerFollowLoopers + #define GPUCA_LB_GPUTPCGMMergerFollowLoopers 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerSectorRefit + #define GPUCA_LB_GPUTPCGMMergerSectorRefit 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerUnpackResetIds + #define GPUCA_LB_GPUTPCGMMergerUnpackResetIds 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerUnpackGlobal + #define GPUCA_LB_GPUTPCGMMergerUnpackGlobal 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerResolve_step0 + #define GPUCA_LB_GPUTPCGMMergerResolve_step0 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerResolve_step1 + #define GPUCA_LB_GPUTPCGMMergerResolve_step1 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerResolve_step2 + #define GPUCA_LB_GPUTPCGMMergerResolve_step2 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerResolve_step3 + #define GPUCA_LB_GPUTPCGMMergerResolve_step3 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerResolve_step4 + #define GPUCA_LB_GPUTPCGMMergerResolve_step4 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerClearLinks + #define GPUCA_LB_GPUTPCGMMergerClearLinks 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeWithinPrepare + #define GPUCA_LB_GPUTPCGMMergerMergeWithinPrepare 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeSectorsPrepare + #define GPUCA_LB_GPUTPCGMMergerMergeSectorsPrepare 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeBorders_step0 + #define GPUCA_LB_GPUTPCGMMergerMergeBorders_step0 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeBorders_step2 + #define GPUCA_LB_GPUTPCGMMergerMergeBorders_step2 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeCE + #define GPUCA_LB_GPUTPCGMMergerMergeCE 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerLinkExtrapolatedTracks + #define GPUCA_LB_GPUTPCGMMergerLinkExtrapolatedTracks 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerCollect + #define GPUCA_LB_GPUTPCGMMergerCollect 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerSortTracksPrepare + #define GPUCA_LB_GPUTPCGMMergerSortTracksPrepare 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerPrepareClusters_step0 + #define GPUCA_LB_GPUTPCGMMergerPrepareClusters_step0 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerPrepareClusters_step1 + #define GPUCA_LB_GPUTPCGMMergerPrepareClusters_step1 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerPrepareClusters_step2 + #define GPUCA_LB_GPUTPCGMMergerPrepareClusters_step2 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerFinalize_step0 + #define GPUCA_LB_GPUTPCGMMergerFinalize_step0 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerFinalize_step1 + #define GPUCA_LB_GPUTPCGMMergerFinalize_step1 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerFinalize_step2 + #define GPUCA_LB_GPUTPCGMMergerFinalize_step2 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeLoopers_step0 + #define GPUCA_LB_GPUTPCGMMergerMergeLoopers_step0 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeLoopers_step1 + #define GPUCA_LB_GPUTPCGMMergerMergeLoopers_step1 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeLoopers_step2 + #define GPUCA_LB_GPUTPCGMMergerMergeLoopers_step2 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMO2Output_prepare + #define GPUCA_LB_GPUTPCGMO2Output_prepare 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMO2Output_output + #define GPUCA_LB_GPUTPCGMO2Output_output 0 + #endif + #ifndef GPUCA_LB_GPUITSFitterKernels + #define GPUCA_LB_GPUITSFitterKernels 0 + #endif + #ifndef GPUCA_LB_GPUTPCStartHitsFinder + #define GPUCA_LB_GPUTPCStartHitsFinder 0 + #endif + #ifndef GPUCA_LB_GPUTPCStartHitsSorter + #define GPUCA_LB_GPUTPCStartHitsSorter 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFCheckPadBaseline + #define GPUCA_LB_GPUTPCCFCheckPadBaseline 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap + #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits + #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart + #define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFPeakFinder + #define GPUCA_LB_GPUTPCCFPeakFinder 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFNoiseSuppression + #define GPUCA_LB_GPUTPCCFNoiseSuppression 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFDeconvolution + #define GPUCA_LB_GPUTPCCFDeconvolution 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFClusterizer + #define GPUCA_LB_GPUTPCCFClusterizer 0 + #endif + #ifndef GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels 0 + #endif + #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU + #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 0 + #endif + #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode1asTrackParCov + #define GPUCA_LB_GPUTrackingRefitKernel_mode1asTrackParCov 0 + #endif + #ifndef GPUCA_LB_GPUMemClean16 + #define GPUCA_LB_GPUMemClean16 0 + #endif + #ifndef GPUCA_LB_GPUitoa + #define GPUCA_LB_GPUitoa 0 + #endif + #ifndef GPUCA_LB_GPUTPCExtrapolationTrackingCopyNumbers + #define GPUCA_LB_GPUTPCExtrapolationTrackingCopyNumbers 0 + #endif + #ifndef GPUCA_LB_GPUTPCSectorDebugSortKernels_hitData + #define GPUCA_LB_GPUTPCSectorDebugSortKernels_hitData 0 + #endif + #ifndef GPUCA_LB_GPUTPCSectorDebugSortKernels_startHits + #define GPUCA_LB_GPUTPCSectorDebugSortKernels_startHits 0 + #endif + #ifndef GPUCA_LB_GPUTPCSectorDebugSortKernels_sectorTracks + #define GPUCA_LB_GPUTPCSectorDebugSortKernels_sectorTracks 0 + #endif + #ifndef GPUCA_LB_GPUTPCGlobalDebugSortKernels_clearIds + #define GPUCA_LB_GPUTPCGlobalDebugSortKernels_clearIds 0 + #endif + #ifndef GPUCA_LB_GPUTPCGlobalDebugSortKernels_sectorTracks + #define GPUCA_LB_GPUTPCGlobalDebugSortKernels_sectorTracks 0 + #endif + #ifndef GPUCA_LB_GPUTPCGlobalDebugSortKernels_extrapolatedTracks1 + #define GPUCA_LB_GPUTPCGlobalDebugSortKernels_extrapolatedTracks1 0 + #endif + #ifndef GPUCA_LB_GPUTPCGlobalDebugSortKernels_extrapolatedTracks2 + #define GPUCA_LB_GPUTPCGlobalDebugSortKernels_extrapolatedTracks2 0 + #endif + #ifndef GPUCA_LB_GPUTPCGlobalDebugSortKernels_borderTracks + #define GPUCA_LB_GPUTPCGlobalDebugSortKernels_borderTracks 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerUnpackSaveNumber + #define GPUCA_LB_GPUTPCGMMergerUnpackSaveNumber 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeBorders_step1 + #define GPUCA_LB_GPUTPCGMMergerMergeBorders_step1 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerMergeBorders_variant + #define GPUCA_LB_GPUTPCGMMergerMergeBorders_variant 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerSortTracks + #define GPUCA_LB_GPUTPCGMMergerSortTracks 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMMergerSortTracksQPt + #define GPUCA_LB_GPUTPCGMMergerSortTracksQPt 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMO2Output_sort + #define GPUCA_LB_GPUTPCGMO2Output_sort 0 + #endif + #ifndef GPUCA_LB_GPUTPCGMO2Output_mc + #define GPUCA_LB_GPUTPCGMO2Output_mc 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFMCLabelFlattener_setRowOffsets + #define GPUCA_LB_GPUTPCCFMCLabelFlattener_setRowOffsets 0 + #endif + #ifndef GPUCA_LB_GPUTPCCFMCLabelFlattener_flatten + #define GPUCA_LB_GPUTPCCFMCLabelFlattener_flatten 0 + #endif +#endif // GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS + // clang-format on -#endif +#endif // GPUDEFPARAMETERSDEFAULT_H diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc new file mode 100644 index 0000000000000..953750b6f925b --- /dev/null +++ b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc @@ -0,0 +1,56 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUDefParametersLoad.inc +/// \author David Rohr + +#include "GPUDefParameters.h" +#include "GPUDefMacros.h" +#include +#include + +namespace o2::gpu::internal +{ + +static GPUDefParameters GPUDefParametersLoad() +{ + return GPUDefParameters{ + // clang-format off + {$,REPLACE,[^A-Za-z0-9]+,_>,PREPEND,GPUCA_M_FIRST(GPUCA_LB_>,APPEND,)>,$>}, + {$,REPLACE,[^A-Za-z0-9]+,_>,PREPEND,GPUCA_M_FIRST(GPUCA_M_SHIFT(GPUCA_LB_>,APPEND,$0))>,$>}, + {$,REPLACE,[^A-Za-z0-9]+,_>,PREPEND,GPUCA_M_FIRST(GPUCA_M_SHIFT(GPUCA_M_SHIFT(GPUCA_LB_>,APPEND,$0$0)))>,$>} + // clang-format on + }; +} + +#define GPUCA_EXPORT_KERNEL(name) \ + if (par.par_LB_maxThreads[i] > 0) { \ + o << "#define " GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \ + if (par.par_LB_minBlocks[i] > 0) { \ + o << ", " << par.par_LB_minBlocks[i]; \ + } \ + if (par.par_LB_forceBlocks[i] > 0) { \ + o << ", " << par.par_LB_forceBlocks[i]; \ + } \ + o << "\n"; \ + } \ + i++; + +static std::string GPUDefParametersExport(const GPUDefParameters& par) +{ + std::stringstream o; // clang-format off + int32_t i = 0; + $,REPLACE,[^A-Za-z0-9]+,_>,PREPEND,GPUCA_EXPORT_KERNEL(>,APPEND,)>, + > + return o.str(); // clang-format on +} + +} // namespace o2::gpu::internal diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index e161f74a31032..3e7447892307a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -105,7 +105,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { processorsShadow()->tpcTrackers[iSector].GPUParametersConst()->gpumem = (char*)mRec->DeviceMemoryBase(); // Initialize Startup Constants - processors()->tpcTrackers[iSector].GPUParameters()->nextStartHit = (((getKernelProperties().minBlocks * BlockCount()) + NSECTORS - 1 - iSector) / NSECTORS) * getKernelProperties().nThreads; + processors()->tpcTrackers[iSector].GPUParameters()->nextStartHit = (((getKernelProperties().minBlocks * BlockCount()) + NSECTORS - 1 - iSector) / NSECTORS) * getKernelProperties().nThreads; processorsShadow()->tpcTrackers[iSector].SetGPUTextureBase(mRec->DeviceMemoryBase()); } diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.cxx index 5a7df0ba8b874..71df683eee1dc 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.cxx +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.cxx @@ -476,7 +476,7 @@ GPUdic(2, 1) void GPUTPCTrackletConstructor::DoTracklet(GPUconstantref() GPUTPCT } template <> -GPUdii() void GPUTPCTrackletConstructor::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& sMem, processorType& GPUrestrict() tracker) +GPUdii() void GPUTPCTrackletConstructor::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& sMem, processorType& GPUrestrict() tracker) { if (get_local_id(0) == 0) { sMem.mNStartHits = *tracker.NStartHits(); @@ -491,79 +491,6 @@ GPUdii() void GPUTPCTrackletConstructor::Thread -GPUdii() void GPUTPCTrackletConstructor::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& sMem, processorType& GPUrestrict() tracker0) -{ - GPUconstantref() GPUTPCTracker* GPUrestrict() pTracker = &tracker0; -#ifdef GPUCA_GPUCODE - int32_t mySector = get_group_id(0) % GPUCA_NSECTORS; - int32_t currentSector = -1; - - if (get_local_id(0) == 0) { - sMem.mNextStartHitFirstRun = 1; - } - GPUCA_UNROLL(, U()) - for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { - GPUconstantref() GPUTPCTracker& GPUrestrict() tracker = pTracker[mySector]; - - GPUTPCThreadMemory rMem; - - while ((rMem.mISH = FetchTracklet(tracker, sMem)) != -2) { - if (rMem.mISH >= 0 && get_local_id(0) < GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCTrackletConstructor)) { - rMem.mISH += get_local_id(0); - } else { - rMem.mISH = -1; - } - - if (mySector != currentSector) { - if (get_local_id(0) == 0) { - sMem.mNStartHits = *tracker.NStartHits(); - } - CA_SHARED_CACHE(&sMem.mRows[0], tracker.TrackingDataRows(), GPUCA_ROW_COUNT * sizeof(GPUTPCRow)); - GPUbarrier(); - currentSector = mySector; - } - - if (rMem.mISH >= 0 && rMem.mISH < sMem.mNStartHits) { - rMem.mGo = true; - DoTracklet(tracker, sMem, rMem); - } - } - if (++mySector >= GPUCA_NSECTORS) { - mySector = 0; - } - } -#else - for (int32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { - Thread(nBlocks, nThreads, iBlock, iThread, sMem, pTracker[iSector]); - } -#endif -} - -#ifdef GPUCA_GPUCODE - -GPUd() int32_t GPUTPCTrackletConstructor::FetchTracklet(GPUconstantref() GPUTPCTracker& GPUrestrict() tracker, GPUsharedref() GPUSharedMemory& sMem) -{ - const uint32_t nStartHit = *tracker.NStartHits(); - GPUbarrier(); - if (get_local_id(0) == 0) { - int32_t firstStartHit = -2; - if (sMem.mNextStartHitFirstRun == 1) { - firstStartHit = (get_group_id(0) - tracker.ISector()) / GPUCA_NSECTORS * GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCTrackletConstructor); - sMem.mNextStartHitFirstRun = 0; - } else { - if (tracker.GPUParameters()->nextStartHit < nStartHit) { - firstStartHit = CAMath::AtomicAdd(&tracker.GPUParameters()->nextStartHit, GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCTrackletConstructor)); - } - } - sMem.mNextStartHitFirst = firstStartHit < (int32_t)nStartHit ? firstStartHit : -2; - } - GPUbarrier(); - return (sMem.mNextStartHitFirst); -} - -#endif // GPUCA_GPUCODE - template <> // FIXME: GPUgeneric() needed to make the clang spirv output link correctly GPUd() int32_t GPUTPCTrackletConstructor::GPUTPCTrackletConstructorExtrapolationTracking(GPUconstantref() GPUTPCTracker& GPUrestrict() tracker, GPUsharedref() GPUTPCExtrapolationTracking::GPUSharedMemory& sMem, GPUTPCTrackParam& GPUrestrict() tParam, int32_t row, int32_t increment, int32_t iTracklet, calink* rowHits) { diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.h b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.h index 0f8314ee0fad4..af87d0276f1c7 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.h +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletConstructor.h @@ -28,14 +28,9 @@ namespace o2::gpu */ class GPUTPCTracker; -class GPUTPCTrackletConstructor +class GPUTPCTrackletConstructor : public GPUKernelTemplate { public: - enum K { - singleSector = 0, - allSectors = 1 - }; - class GPUTPCThreadMemory { friend class GPUTPCTrackletConstructor; //! friend class @@ -89,10 +84,6 @@ class GPUTPCTrackletConstructor GPUd() static void DoTracklet(GPUconstantref() GPUTPCTracker& tracker, GPUsharedref() GPUTPCTrackletConstructor::GPUSharedMemory& sMem, GPUTPCThreadMemory& rMem); -#ifdef GPUCA_GPUCODE - GPUd() static int32_t FetchTracklet(GPUconstantref() GPUTPCTracker& tracker, GPUsharedref() GPUSharedMemory& sMem); -#endif // GPUCA_GPUCODE - template GPUd() static int32_t GPUTPCTrackletConstructorExtrapolationTracking(GPUconstantref() GPUTPCTracker& tracker, GPUsharedref() T& sMem, GPUTPCTrackParam& tParam, int32_t startrow, int32_t increment, int32_t iTracklet, calink* rowHits); diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index e9cb7c5179c59..2e3e4725bd6aa 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -53,9 +53,6 @@ #include "GPUTPCGMMergedTrack.h" #include "GPUSettings.h" #include -#if not(defined(__ARM_NEON) or defined(__aarch64__)) // ARM doesn't have SSE -#include -#endif #include "GPUO2DataTypes.h" #include "GPUChainITS.h" @@ -85,23 +82,6 @@ std::atomic nIteration, nIterationEnd; std::vector ioPtrEvents; std::vector ioMemEvents; -void SetCPUAndOSSettings() -{ -#if not(defined(__ARM_NEON) or defined(__aarch64__)) // ARM doesn't have SSE -#ifdef FE_DFL_DISABLE_SSE_DENORMS_ENV // Flush and load denormals to zero in any case - fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV); -#else -#ifndef _MM_FLUSH_ZERO_ON -#define _MM_FLUSH_ZERO_ON 0x8000 -#endif -#ifndef _MM_DENORMALS_ZERO_ON -#define _MM_DENORMALS_ZERO_ON 0x0040 -#endif - _mm_setcsr(_mm_getcsr() | (_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON)); -#endif -#endif // ARM -} - int32_t ReadConfiguration(int argc, char** argv) { int32_t qcRet = qConfigParse(argc, (const char**)argv); @@ -740,8 +720,6 @@ int32_t main(int argc, char** argv) { std::unique_ptr recUnique, recUniqueAsync, recUniquePipeline; - SetCPUAndOSSettings(); - if (ReadConfiguration(argc, argv)) { return 1; } diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index ed4fc5c9f7e2d..dfc8e8db3bc7a 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -62,8 +62,11 @@ if (GPUCA_BUILD_DEBUG_SANITIZE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -shared-libasan") endif() endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings -Wno-vla-cxx-extension") -set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic -Wl,--no-undefined") +string(APPEND CMAKE_CXX_FLAGS " -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings") +string(APPEND CMAKE_SHARED_LINKER_FLAGS " -rdynamic -Wl,--no-undefined") +if(CMAKE_CXX_COMPILER MATCHES "clang\\+\\+") + string(APPEND CMAKE_CXX_FLAGS " -Wno-vla-cxx-extension") +endif() # Find mandatory packages find_package(TBB REQUIRED) diff --git a/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h b/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h index dac93277d5ec9..499672bf00b50 100644 --- a/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h +++ b/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h @@ -12,6 +12,9 @@ /// \file GPUNoFastMathKernels.h /// \author David Rohr +#ifndef GPUNOFASTMATHKERNELS_H +#define GPUNOFASTMATHKERNELS_H + #include #include @@ -21,3 +24,5 @@ namespace o2::gpu::internal static const std::unordered_set noFastMathKernels = {$>,APPEND,">,PREPEND,">,$ >}; // clang-format on } // namespace o2::gpu::internal + +#endif diff --git a/GPU/GPUTracking/cmake/kernel_helpers.cmake b/GPU/GPUTracking/cmake/kernel_helpers.cmake index 99699cc72e940..3c1ad9658566b 100644 --- a/GPU/GPUTracking/cmake/kernel_helpers.cmake +++ b/GPU/GPUTracking/cmake/kernel_helpers.cmake @@ -64,7 +64,9 @@ function(o2_gpu_add_kernel kernel_name kernel_files) endif() set(TMP_PRE "") set(TMP_POST "") - set(TMP_KERNEL "GPUCA_KRNL${TMP_BOUNDS}((${kernel_name}), (${kernel_extra}), (${OPT1}), (${OPT2}), (${OPT3}))\n") + get_property(LIST_KERNELS TARGET O2_GPU_KERNELS PROPERTY O2_GPU_KERNELS) + list(LENGTH LIST_KERNELS KERNEL_COUNT) + set(TMP_KERNEL "GPUCA_KRNL${TMP_BOUNDS}((${kernel_name}), (${kernel_extra}), (${OPT1}), (${OPT2}), (${OPT3}), ${KERNEL_COUNT})\n") separate_arguments(kernel_files NATIVE_COMMAND ${kernel_files}) list(GET kernel_files 0 TMP_KERNEL_CLASS_FILE) if (TMP_KERNEL_CLASS_FILE STREQUAL "=") diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index ad348a84264f0..994f10a516b10 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -34,11 +34,10 @@ o2_gpu_add_kernel("GPUTPCNeighboursFinder" "= TPCTRAC o2_gpu_add_kernel("GPUTPCNeighboursCleaner" "= TPCTRACKER" LB) o2_gpu_add_kernel("GPUTPCStartHitsFinder" "= TPCTRACKER" LB) o2_gpu_add_kernel("GPUTPCStartHitsSorter" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCTrackletConstructor, singleSector" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCTrackletConstructor, allSectors" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUTPCTrackletConstructor" "= TPCTRACKER" LB) o2_gpu_add_kernel("GPUTPCTrackletSelector" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUMemClean16" "GPUGeneralKernels" "NO_REG, (GPUCA_THREAD_COUNT, 1)" void* ptr "uint64_t" size) -o2_gpu_add_kernel("GPUitoa" "GPUGeneralKernels" "NO_REG, (GPUCA_THREAD_COUNT, 1)" int32_t* ptr "uint64_t" size) +o2_gpu_add_kernel("GPUMemClean16" "GPUGeneralKernels" NO void* ptr "uint64_t" size) +o2_gpu_add_kernel("GPUitoa" "GPUGeneralKernels" NO int32_t* ptr "uint64_t" size) o2_gpu_add_kernel("GPUTPCExtrapolationTrackingCopyNumbers" "GPUTPCExtrapolationTracking TPCTRACKER" NO int32_t n) o2_gpu_add_kernel("GPUTPCExtrapolationTracking" "= TPCTRACKER TPCTRACKLETCONS" LB) o2_gpu_add_kernel("GPUTPCCreateTrackingData" "= TPCTRACKER TPCSECTORDATA" LB) diff --git a/GPU/GPUTracking/utils/qmaths_helpers.h b/GPU/GPUTracking/utils/qmaths_helpers.h index 9c5f704180aaa..5eb3ce4fb2483 100644 --- a/GPU/GPUTracking/utils/qmaths_helpers.h +++ b/GPU/GPUTracking/utils/qmaths_helpers.h @@ -15,24 +15,33 @@ #ifndef QMATH_HELPERS_H #define QMATH_HELPERS_H -#if defined __has_include -#if __has_include() && __has_include() -#include -#include -#if defined(_MM_FLUSH_ZERO_OFF) && defined(_MM_DENORMALS_ZERO_ON) +#if !(defined(__ARM_NEON) || defined(__aarch64__)) && __has_include() // clang-format off + #include + #if __has_include() + #include + #endif +#elif __has_include() + #include +#endif + static void disable_denormals() { - _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#if !(defined(__ARM_NEON) || defined(__aarch64__)) && __has_include() // clang-format off + #if defined(_MM_FLUSH_ZERO_OFF) && defined(_MM_DENORMALS_ZERO_ON) + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + #else + #ifndef _MM_FLUSH_ZERO_ON + #define _MM_FLUSH_ZERO_ON 0x8000 + #endif + #ifndef _MM_DENORMALS_ZERO_ON + #define _MM_DENORMALS_ZERO_ON 0x0040 + #endif + _mm_setcsr(_mm_getcsr() | (_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON)); + #endif +#elif __has_include() && defined(FE_DFL_DISABLE_SSE_DENORMS_ENV) + fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV); +#endif // clang-format on } -#define XMM_HAS_DENORMAL_DEACTIVATE -#endif -#endif -#endif -#ifdef XMM_HAS_DENORMAL_DEACTIVATE -#undef XMM_HAS_DENORMAL_DEACTIVATE -#else -static void disable_denormals() {} -#endif #endif diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index bbbb420354fae..95db55041184f 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -82,7 +82,6 @@ elseif(NOT GPUCA_DETERMINISTIC_MODE MATCHES "^[0-9]+$") message(FATAL_ERROR "Invalid setting ${GPUCA_DETERMINISTIC_MODE} for GPUCA_DETERMINISTIC_MODE") endif() set(GPUCA_DETERMINISTIC_MODE ${GPUCA_DETERMINISTIC_MODE_MAP_${GPUCA_DETERMINISTIC_MODE}}) - message(STATUS "Set to ${GPUCA_DETERMINISTIC_MODE}") endif() if (CMAKE_SYSTEM_NAME MATCHES Darwin OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set(GPUCA_CXX_DENORMALS_FLAGS "")