diff --git a/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx b/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx index 424657ac19426..375fa732007cc 100644 --- a/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx +++ b/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx @@ -51,6 +51,8 @@ #include "GPUTRDTrackletWord.h" #include "GPUTRDInterfaces.h" #include "GPUTRDGeometry.h" +#include "GPUConstantMem.h" +#include "GPUTRDTrackerKernels.h" #ifdef ENABLE_UPGRADES #include "ITS3Reconstruction/IOUtils.h" diff --git a/GPU/GPUTracking/Base/GPUProcessor.cxx b/GPU/GPUTracking/Base/GPUProcessor.cxx index 8a18f71d535e3..82627fb00723c 100644 --- a/GPU/GPUTracking/Base/GPUProcessor.cxx +++ b/GPU/GPUTracking/Base/GPUProcessor.cxx @@ -14,7 +14,7 @@ #include "GPUProcessor.h" #include "GPUReconstruction.h" -#include "GPUReconstructionDeviceBase.h" +#include "GPUSettings.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index acca74e57a80e..c79c743e96ce5 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -30,7 +30,9 @@ #include "GPUROOTDumpCore.h" #include "GPUConfigDump.h" #include "GPUChainTracking.h" +#include "GPUConstantMem.h" #include "GPUCommonHelpers.h" +#include "GPUSettings.h" #include "GPUMemoryResource.h" #include "GPUChain.h" @@ -75,10 +77,10 @@ constexpr GPUReconstruction::GeometryType GPUReconstruction::geometryType; static ptrdiff_t ptrDiff(void* a, void* b) { return (char*)a - (char*)b; } -GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mDeviceBackendSettings(cfg) +GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mGRPSettings(new GPUSettingsGRP), mDeviceBackendSettings(new GPUSettingsDeviceBackend(cfg)), mProcessingSettings(new GPUSettingsProcessing) { if (cfg.master) { - if 
(cfg.master->mDeviceBackendSettings.deviceType != cfg.deviceType) { + if (cfg.master->GetDeviceBackendSettings().deviceType != cfg.deviceType) { throw std::invalid_argument("device type of master and slave GPUReconstruction does not match"); } if (cfg.master->mMaster) { @@ -87,7 +89,7 @@ GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHos mMaster = cfg.master; cfg.master->mSlaves.emplace_back(this); } - param().SetDefaults(&mGRPSettings); + param().SetDefaults(mGRPSettings.get()); mMemoryScalers.reset(new GPUMemorySizeScalers); for (uint32_t i = 0; i < NSECTORS; i++) { processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place @@ -148,7 +150,7 @@ int32_t GPUReconstruction::Init() if (InitDevice()) { return 1; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mHostMemoryPoolEnd = (char*)mHostMemoryBase + mHostMemorySize; mDeviceMemoryPoolEnd = (char*)mDeviceMemoryBase + mDeviceMemorySize; } else { @@ -213,7 +215,7 @@ static uint32_t getDefaultNThreads() int32_t GPUReconstruction::InitPhaseBeforeDevice() { - if (mProcessingSettings.printSettings) { + if (GetProcessingSettings().printSettings) { if (mSlaves.size() || mMaster) { printf("\nConfig Dump %s\n", mMaster ? "Slave" : "Master"); } @@ -223,7 +225,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() break; } } - GPUConfigDump::dumpConfig(&param().rec, &mProcessingSettings, chTrk ? chTrk->GetQAConfig() : nullptr, chTrk ? chTrk->GetEventDisplayConfig() : nullptr, &mDeviceBackendSettings, &mRecoSteps); + GPUConfigDump::dumpConfig(&param().rec, mProcessingSettings.get(), chTrk ? 
chTrk->GetEventDisplayConfig() : nullptr, mDeviceBackendSettings.get(), &mRecoSteps); } mRecoSteps.stepsGPUMask &= mRecoSteps.steps; mRecoSteps.stepsGPUMask &= AvailableGPURecoSteps(); @@ -231,95 +233,95 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mRecoSteps.stepsGPUMask.set((uint8_t)0); } - if (mProcessingSettings.forceMemoryPoolSize >= 1024 || mProcessingSettings.forceHostMemoryPoolSize >= 1024) { - mProcessingSettings.memoryAllocationStrategy = GPUMemoryResource::ALLOCATION_GLOBAL; + if (GetProcessingSettings().forceMemoryPoolSize >= 1024 || GetProcessingSettings().forceHostMemoryPoolSize >= 1024) { + mProcessingSettings->memoryAllocationStrategy = GPUMemoryResource::ALLOCATION_GLOBAL; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) { - mProcessingSettings.memoryAllocationStrategy = IsGPU() ? GPUMemoryResource::ALLOCATION_GLOBAL : GPUMemoryResource::ALLOCATION_INDIVIDUAL; + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) { + mProcessingSettings->memoryAllocationStrategy = IsGPU() ? 
GPUMemoryResource::ALLOCATION_GLOBAL : GPUMemoryResource::ALLOCATION_INDIVIDUAL; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { - mProcessingSettings.forceMemoryPoolSize = mProcessingSettings.forceHostMemoryPoolSize = 0; + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + mProcessingSettings->forceMemoryPoolSize = mProcessingSettings->forceHostMemoryPoolSize = 0; } - if (mProcessingSettings.debugLevel >= 4) { - mProcessingSettings.keepAllMemory = true; + if (GetProcessingSettings().debugLevel >= 4) { + mProcessingSettings->keepAllMemory = true; } - if (mProcessingSettings.debugLevel >= 5 && mProcessingSettings.allocDebugLevel < 2) { - mProcessingSettings.allocDebugLevel = 2; + if (GetProcessingSettings().debugLevel >= 5 && GetProcessingSettings().allocDebugLevel < 2) { + mProcessingSettings->allocDebugLevel = 2; } - if (mProcessingSettings.eventDisplay || mProcessingSettings.keepAllMemory) { - mProcessingSettings.keepDisplayMemory = true; + if (GetProcessingSettings().eventDisplay || GetProcessingSettings().keepAllMemory) { + mProcessingSettings->keepDisplayMemory = true; } - if (mProcessingSettings.debugLevel < 6) { - mProcessingSettings.debugMask = 0; + if (GetProcessingSettings().debugLevel < 6) { + mProcessingSettings->debugMask = 0; } - if (mProcessingSettings.debugLevel < 1) { - mProcessingSettings.deviceTimers = false; + if (GetProcessingSettings().debugLevel < 1) { + mProcessingSettings->deviceTimers = false; } - if (mProcessingSettings.debugLevel > 0) { - mProcessingSettings.recoTaskTiming = true; + if (GetProcessingSettings().debugLevel > 0) { + mProcessingSettings->recoTaskTiming = true; } - if (mProcessingSettings.deterministicGPUReconstruction == -1) { - mProcessingSettings.deterministicGPUReconstruction = mProcessingSettings.debugLevel >= 6; + if (GetProcessingSettings().deterministicGPUReconstruction == -1) { + 
mProcessingSettings->deterministicGPUReconstruction = GetProcessingSettings().debugLevel >= 6; } - if (mProcessingSettings.deterministicGPUReconstruction) { + if (GetProcessingSettings().deterministicGPUReconstruction) { #ifndef GPUCA_DETERMINISTIC_MODE GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!"); #endif - mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU; + mProcessingSettings->overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU; param().rec.tpc.nWaysOuter = true; if (param().rec.tpc.looperInterpolationInExtraPass == -1) { param().rec.tpc.looperInterpolationInExtraPass = 0; } - if (mProcessingSettings.createO2Output > 1) { - mProcessingSettings.createO2Output = 1; + if (GetProcessingSettings().createO2Output > 1) { + mProcessingSettings->createO2Output = 1; } - mProcessingSettings.rtc.deterministic = 1; + mProcessingSettings->rtc.deterministic = 1; } else { #ifdef GPUCA_DETERMINISTIC_MODE GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!"); #endif } - if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) { - mProcessingSettings.nTPCClustererLanes = 1; + if (GetProcessingSettings().deterministicGPUReconstruction && GetProcessingSettings().debugLevel >= 6) { + mProcessingSettings->nTPCClustererLanes = 1; } - if (mProcessingSettings.createO2Output > 1 && mProcessingSettings.runQA && mProcessingSettings.qcRunFraction == 100.f) { - mProcessingSettings.createO2Output = 1; + if (GetProcessingSettings().createO2Output > 1 && GetProcessingSettings().runQA && GetProcessingSettings().qcRunFraction == 100.f) { + mProcessingSettings->createO2Output = 1; } - if (!mProcessingSettings.createO2Output || 
!IsGPU()) { - mProcessingSettings.clearO2OutputFromGPU = false; + if (!GetProcessingSettings().createO2Output || !IsGPU()) { + mProcessingSettings->clearO2OutputFromGPU = false; } if (!(mRecoSteps.stepsGPUMask & GPUDataTypes::RecoStep::TPCMerging)) { - mProcessingSettings.mergerSortTracks = false; + mProcessingSettings->mergerSortTracks = false; } - if (mProcessingSettings.debugLevel > 3 || !IsGPU() || mProcessingSettings.deterministicGPUReconstruction) { - mProcessingSettings.delayedOutput = false; + if (GetProcessingSettings().debugLevel > 3 || !IsGPU() || GetProcessingSettings().deterministicGPUReconstruction) { + mProcessingSettings->delayedOutput = false; } - if (!mProcessingSettings.rtc.enable) { - mProcessingSettings.rtc.optConstexpr = false; + if (!GetProcessingSettings().rtc.enable) { + mProcessingSettings->rtc.optConstexpr = false; } - mMemoryScalers->factor = mProcessingSettings.memoryScalingFactor; - mMemoryScalers->conservative = mProcessingSettings.conservativeMemoryEstimate; - mMemoryScalers->returnMaxVal = mProcessingSettings.forceMaxMemScalers != 0; - if (mProcessingSettings.forceMaxMemScalers > 1) { - mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers); + mMemoryScalers->factor = GetProcessingSettings().memoryScalingFactor; + mMemoryScalers->conservative = GetProcessingSettings().conservativeMemoryEstimate; + mMemoryScalers->returnMaxVal = GetProcessingSettings().forceMaxMemScalers != 0; + if (GetProcessingSettings().forceMaxMemScalers > 1) { + mMemoryScalers->rescaleMaxMem(GetProcessingSettings().forceMaxMemScalers); } - if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) { + if (GetProcessingSettings().nHostThreads != -1 && GetProcessingSettings().ompThreads != -1) { GPUFatal("Must not use both nHostThreads and ompThreads at the same time!"); - } else if (mProcessingSettings.ompThreads != -1) { - mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads; + } else if 
(GetProcessingSettings().ompThreads != -1) { + mProcessingSettings->nHostThreads = GetProcessingSettings().ompThreads; GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!"); } - if (mProcessingSettings.nHostThreads <= 0) { - mProcessingSettings.nHostThreads = internal::getDefaultNThreads(); + if (GetProcessingSettings().nHostThreads <= 0) { + mProcessingSettings->nHostThreads = internal::getDefaultNThreads(); } else { - mProcessingSettings.autoAdjustHostThreads = false; + mProcessingSettings->autoAdjustHostThreads = false; } - mMaxHostThreads = mProcessingSettings.nHostThreads; + mMaxHostThreads = GetProcessingSettings().nHostThreads; if (mMaster == nullptr) { mThreading = std::make_shared(); mThreading->control = std::make_unique(tbb::global_control::max_allowed_parallelism, mMaxHostThreads); @@ -330,26 +332,26 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() } mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads); if (IsGPU()) { - mNStreams = std::max(mProcessingSettings.nStreams, 3); + mNStreams = std::max(GetProcessingSettings().nStreams, 3); } - if (mProcessingSettings.nTPCClustererLanes == -1) { - mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSECTORS, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSECTORS : 4) : 1) : mMaxHostThreads)); + if (GetProcessingSettings().nTPCClustererLanes == -1) { + mProcessingSettings->nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSECTORS, GetProcessingSettings().inKernelParallel ? (mMaxHostThreads >= 4 ? std::min(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? 
GPUCA_NSECTORS : 4) : 1) : mMaxHostThreads)); } - if (mProcessingSettings.overrideClusterizerFragmentLen == -1) { - mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST; + if (GetProcessingSettings().overrideClusterizerFragmentLen == -1) { + mProcessingSettings->overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / GetProcessingSettings().nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST; } - if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSECTORS) { - GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes); - mProcessingSettings.nTPCClustererLanes = GPUCA_NSECTORS; + if (GetProcessingSettings().nTPCClustererLanes > GPUCA_NSECTORS) { + GPUError("Invalid value for nTPCClustererLanes: %d", GetProcessingSettings().nTPCClustererLanes); + mProcessingSettings->nTPCClustererLanes = GPUCA_NSECTORS; } - if (mProcessingSettings.doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || mProcessingSettings.memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) { + if (GetProcessingSettings().doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) { GPUError("Must use double pipeline mode only with exactly one chain that must support it"); return 1; } - if (mMaster == nullptr && mProcessingSettings.doublePipeline) { + if (mMaster == nullptr && GetProcessingSettings().doublePipeline) { mPipelineContext.reset(new GPUReconstructionPipelineContext); } @@ -367,16 +369,16 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mDeviceMemorySize += memPrimary; mHostMemorySize += memPageLocked; } - if 
(mProcessingSettings.forceMemoryPoolSize && mProcessingSettings.forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) { - mDeviceMemorySize = mProcessingSettings.forceMemoryPoolSize; - } else if (mProcessingSettings.forceMemoryPoolSize > 2) { - mDeviceMemorySize = mProcessingSettings.forceMemoryPoolSize; + if (GetProcessingSettings().forceMemoryPoolSize && GetProcessingSettings().forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) { + mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize; + } else if (GetProcessingSettings().forceMemoryPoolSize > 2) { + mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize; if (!IsGPU() || mOutputControl.useInternal()) { mHostMemorySize = mDeviceMemorySize; } } - if (mProcessingSettings.forceHostMemoryPoolSize) { - mHostMemorySize = mProcessingSettings.forceHostMemoryPoolSize; + if (GetProcessingSettings().forceHostMemoryPoolSize) { + mHostMemorySize = GetProcessingSettings().forceHostMemoryPoolSize; } for (uint32_t i = 0; i < mProcessors.size(); i++) { @@ -399,7 +401,7 @@ int32_t GPUReconstruction::InitPhasePermanentMemory() int32_t GPUReconstruction::InitPhaseAfterDevice() { - if (mProcessingSettings.forceMaxMemScalers <= 1 && mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().forceMaxMemScalers <= 1 && GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mMemoryScalers->rescaleMaxMem(IsGPU() ? 
mDeviceMemorySize : mHostMemorySize); } for (uint32_t i = 0; i < mChains.size(); i++) { @@ -446,7 +448,7 @@ int32_t GPUReconstruction::Exit() mChains.clear(); // Make sure we destroy a possible ITS GPU tracker before we call the destructors mHostConstantMem.reset(); // Reset these explicitly before the destruction of other members unloads the library - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { for (uint32_t i = 0; i < mMemoryResources.size(); i++) { if (mMemoryResources[i].mReuse >= 0) { continue; @@ -481,9 +483,38 @@ void GPUReconstruction::ComputeReuseMax(GPUProcessor* proc) } } +int16_t GPUReconstruction::RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re) +{ + if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) { + if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !GetProcessingSettings().keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory + type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? 
GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU); + } else { + type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU; + } + } + if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) { + type &= ~GPUMemoryResource::MEMORY_GPU; + } + mMemoryResources.emplace_back(proc, setPtr, (GPUMemoryResource::MemoryType)type, name); + if (mMemoryResources.size() >= 32768) { + throw std::bad_alloc(); + } + uint16_t retVal = mMemoryResources.size() - 1; + if (re.type != GPUMemoryReuse::NONE && !GetProcessingSettings().disableMemoryReuse) { + const auto& it = mMemoryReuse1to1.find(re.id); + if (it == mMemoryReuse1to1.end()) { + mMemoryReuse1to1[re.id] = {proc, retVal}; + } else { + mMemoryResources[retVal].mReuse = it->second.res[0]; + it->second.res.emplace_back(retVal); + } + } + return retVal; +} + size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool resetCustom) { - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Allocating memory %p", (void*)proc); } size_t total = 0; @@ -496,7 +527,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool rese } } } - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Allocating memory done"); } return total; @@ -504,7 +535,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool rese size_t GPUReconstruction::AllocateRegisteredPermanentMemory() { - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Allocating Permanent Memory"); } int32_t total = 0; @@ -515,7 +546,7 @@ size_t GPUReconstruction::AllocateRegisteredPermanentMemory() } mHostMemoryPermanent = mHostMemoryPool; mDeviceMemoryPermanent = mDeviceMemoryPool; - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Permanent Memory Done"); } return total; @@ -534,7 +565,7 @@ size_t 
GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, GPUError("Insufficient reuse memory %lu < %lu (%s) (%s)", mMemoryResources[res->mReuse].mSize, retVal, res->mName, device); throw std::bad_alloc(); } - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Reused (" << device << ") " << res->mName << ": " << retVal << "\n"; } return retVal; @@ -568,7 +599,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, std::cerr << "Memory pool size exceeded (" << device << ") (" << res->mName << ": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) << " > " << memorysize << "\n"; throw std::bad_alloc(); } - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (" << device << ") " << res->mName << ": " << retVal << " - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) << "\n"; } return retVal; @@ -576,7 +607,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool) { - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) { if (!(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { if (res->mPtrDevice && res->mReuse < 0) { operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); @@ -593,7 +624,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, } res->mPtr = GPUProcessor::alignPointer(res->mPtrDevice); 
res->SetPointers(res->mPtr); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n"; } if (res->mType & GPUMemoryResource::MEMORY_STACK) { @@ -612,13 +643,13 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, if (IsGPU() && res->mOverrideSize < GPUCA_BUFFER_ALIGNMENT) { res->mOverrideSize = GPUCA_BUFFER_ALIGNMENT; } - if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || mProcessingSettings.keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory + if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || GetProcessingSettings().keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory if (control && control->useExternal()) { if (control->allocator) { res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize); res->mPtr = control->allocator(CAMath::nextMultipleOf(res->mSize)); res->mSize = std::max(ptrDiff(res->SetPointers(res->mPtr), res->mPtr), res->mOverrideSize); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (from callback) " << res->mName << ": " << res->mSize << "\n"; } } else { @@ -676,7 +707,7 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type) if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) { throw std::runtime_error("Requested invalid memory typo for unmanaged allocation"); } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]); return 
GPUProcessor::alignPointer(mUnmanagedChunks.back().get()); } else { @@ -689,7 +720,7 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type) throw std::bad_alloc(); } UpdateMaxMemoryUsed(); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n"; } return retVal; @@ -711,7 +742,7 @@ void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size) throw std::bad_alloc(); } UpdateMaxMemoryUsed(); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n"; } @@ -773,10 +804,10 @@ void GPUReconstruction::FreeRegisteredMemory(int16_t ires) void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res) { - if (mProcessingSettings.allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) { + if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) { std::cout << "Freeing " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n"; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) { operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); } res->mPtr = nullptr; @@ -789,7 +820,7 @@ void GPUReconstruction::ReturnVolatileDeviceMemory() mDeviceMemoryPool = mVolatileMemoryStart; mVolatileMemoryStart = nullptr; } - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) 
<< "\n"; } } @@ -807,7 +838,7 @@ void GPUReconstruction::PushNonPersistentMemory(uint64_t tag) void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) { - if (mProcessingSettings.keepDisplayMemory || mProcessingSettings.disableMemoryReuse) { + if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) { return; } if (mNonPersistentMemoryStack.size() == 0) { @@ -816,7 +847,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) if (tag != 0 && std::get<3>(mNonPersistentMemoryStack.back()) != tag) { GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str()); } - if ((mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) && (IsGPU() || mProcessingSettings.forceHostMemoryPoolSize)) { + if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) { printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size()); PrintMemoryOverview(); printf("%76s", ""); @@ -872,7 +903,7 @@ void GPUReconstruction::ClearAllocatedMemory(bool clearOutputs) mNonPersistentMemoryStack.clear(); mNonPersistentIndividualAllocations.clear(); mVolatileMemoryStart = nullptr; - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mHostMemoryPool = GPUProcessor::alignPointer(mHostMemoryPermanent); mDeviceMemoryPool = GPUProcessor::alignPointer(mDeviceMemoryPermanent); mHostMemoryPoolEnd = mHostMemoryPoolBlocked ? 
mHostMemoryPoolBlocked : ((char*)mHostMemoryBase + mHostMemorySize); @@ -895,7 +926,7 @@ void GPUReconstruction::PrintMemoryMax() void GPUReconstruction::PrintMemoryOverview() { - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { printf("Memory Allocation: Host %'13zd / %'13zu (Permanent %'13zd, Data %'13zd, Scratch %'13zd), Device %'13zd / %'13zu, (Permanent %'13zd, Data %'13zd, Scratch %'13zd) %zu chunks\n", ptrDiff(mHostMemoryPool, mHostMemoryBase) + ptrDiff((char*)mHostMemoryBase + mHostMemorySize, mHostMemoryPoolEnd), mHostMemorySize, ptrDiff(mHostMemoryPermanent, mHostMemoryBase), ptrDiff(mHostMemoryPool, mHostMemoryPermanent), ptrDiff((char*)mHostMemoryBase + mHostMemorySize, mHostMemoryPoolEnd), ptrDiff(mDeviceMemoryPool, mDeviceMemoryBase) + ptrDiff((char*)mDeviceMemoryBase + mDeviceMemorySize, mDeviceMemoryPoolEnd), mDeviceMemorySize, ptrDiff(mDeviceMemoryPermanent, mDeviceMemoryBase), ptrDiff(mDeviceMemoryPool, mDeviceMemoryPermanent), ptrDiff((char*)mDeviceMemoryBase + mDeviceMemorySize, mDeviceMemoryPoolEnd), @@ -934,7 +965,7 @@ void GPUReconstruction::PrintMemoryStatistics() int32_t GPUReconstruction::registerMemoryForGPU(const void* ptr, size_t size) { - if (mProcessingSettings.noGPUMemoryRegistration) { + if (GetProcessingSettings().noGPUMemoryRegistration) { return 0; } int32_t retVal = registerMemoryForGPU_internal(ptr, size); @@ -946,7 +977,7 @@ int32_t GPUReconstruction::registerMemoryForGPU(const void* ptr, size_t size) int32_t GPUReconstruction::unregisterMemoryForGPU(const void* ptr) { - if (mProcessingSettings.noGPUMemoryRegistration) { + if (GetProcessingSettings().noGPUMemoryRegistration) { return 0; } const auto& pos = mRegisteredMemoryPtrs.find(ptr); @@ -982,10 +1013,10 @@ int32_t GPUReconstruction::getGeneralStepNum(GeneralStep step, bool validCheck) void GPUReconstruction::RunPipelineWorker() { - 
if (!mInitialized || !mProcessingSettings.doublePipeline || mMaster != nullptr || !mSlaves.size()) { + if (!mInitialized || !GetProcessingSettings().doublePipeline || mMaster != nullptr || !mSlaves.size()) { throw std::invalid_argument("Cannot start double pipeline mode"); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Pipeline worker started"); } bool terminate = false; @@ -1011,7 +1042,7 @@ void GPUReconstruction::RunPipelineWorker() } q->c.notify_one(); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Pipeline worker ended"); } } @@ -1107,7 +1138,7 @@ void GPUReconstruction::DumpSettings(const char* dir) std::string f; f = dir; f += "settings.dump"; - DumpStructToFile(&mGRPSettings, f.c_str()); + DumpStructToFile(mGRPSettings.get(), f.c_str()); for (uint32_t i = 0; i < mChains.size(); i++) { mChains[i]->DumpSettings(dir); } @@ -1121,11 +1152,11 @@ void GPUReconstruction::UpdateDynamicSettings(const GPUSettingsRecDynamic* d) void GPUReconstruction::UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p, const GPUSettingsRecDynamic* d) { if (g) { - mGRPSettings = *g; + *mGRPSettings = *g; } if (p) { - mProcessingSettings.debugLevel = p->debugLevel; - mProcessingSettings.resetTimers = p->resetTimers; + mProcessingSettings->debugLevel = p->debugLevel; + mProcessingSettings->resetTimers = p->resetTimers; } GPURecoStepConfiguration* w = nullptr; if (mRecoSteps.steps.isSet(GPUDataTypes::RecoStep::TPCdEdx)) { @@ -1142,11 +1173,11 @@ int32_t GPUReconstruction::ReadSettings(const char* dir) std::string f; f = dir; f += "settings.dump"; - new (&mGRPSettings) GPUSettingsGRP; - if (ReadStructFromFile(f.c_str(), &mGRPSettings)) { + new (mGRPSettings.get()) GPUSettingsGRP; + if (ReadStructFromFile(f.c_str(), mGRPSettings.get())) { return 1; } - param().UpdateSettings(&mGRPSettings); + param().UpdateSettings(mGRPSettings.get()); for (uint32_t i = 0; 
i < mChains.size(); i++) { mChains[i]->ReadSettings(dir); } @@ -1173,9 +1204,9 @@ void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettings GPUError("Cannot update settings while initialized"); throw std::runtime_error("Settings updated while initialized"); } - mGRPSettings = *grp; + *mGRPSettings = *grp; if (proc) { - mProcessingSettings = *proc; + *mProcessingSettings = *proc; } if (workflow) { mRecoSteps.steps = workflow->steps; @@ -1183,7 +1214,7 @@ void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettings mRecoSteps.inputs = workflow->inputs; mRecoSteps.outputs = workflow->outputs; } - param().SetDefaults(&mGRPSettings, rec, proc, workflow); + param().SetDefaults(mGRPSettings.get(), rec, proc, workflow); } void GPUReconstruction::SetOutputControl(void* ptr, size_t size) @@ -1193,10 +1224,14 @@ void GPUReconstruction::SetOutputControl(void* ptr, size_t size) SetOutputControl(outputControl); } -void GPUReconstruction::SetInputControl(void* ptr, size_t size) -{ - mInputControl.set(ptr, size); -} +void GPUReconstruction::SetInputControl(void* ptr, size_t size) { mInputControl.set(ptr, size); } +GPUReconstruction::DeviceType GPUReconstruction::GetDeviceType() const { return (DeviceType)GetDeviceBackendSettings().deviceType; } +const GPUParam& GPUReconstruction::GetParam() const { return mHostConstantMem->param; } +void GPUReconstruction::SetResetTimers(bool reset) { mProcessingSettings->resetTimers = reset; } +void GPUReconstruction::SetDebugLevelTmp(int32_t level) { mProcessingSettings->debugLevel = level; } +GPUParam& GPUReconstruction::param() { return mHostConstantMem->param; } +const GPUTrackingInOutPointers GPUReconstruction::GetIOPtrs() const { return mHostConstantMem->ioPtrs; } +const GPUCalibObjectsConst& GPUReconstruction::GetCalib() const { return processors()->calibObjects; } ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r) { diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h 
b/GPU/GPUTracking/Base/GPUReconstruction.h index 23fb6e4d9ff06..b6256f7f8ad82 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -25,13 +25,13 @@ #include #include -#include "GPUTRDDef.h" -#include "GPUParam.h" -#include "GPUSettings.h" -#include "GPUOutputControl.h" +#include "GPUDataTypes.h" #include "GPUMemoryResource.h" -#include "GPUConstantMem.h" -#include "GPULogging.h" +#include "GPUOutputControl.h" + +/*#include "GPUParam.h" +#include "GPUSettings.h" +#include "GPULogging.h"*/ namespace o2::its { @@ -49,6 +49,13 @@ struct GPUReconstructionThreading; class GPUROOTDumpCore; class ThrustVolatileAllocator; struct GPUDefParameters; +class GPUMemoryResource; +struct GPUSettingsDeviceBackend; +struct GPUSettingsGRP; +struct GPUSettingsProcessing; +struct GPUSettingsRec; +struct GPUSettingsRecDynamic; +struct GPUMemoryReuse; namespace gpu_reconstruction_kernels { @@ -186,18 +193,20 @@ class GPUReconstruction bool slavesExist() { return mSlaves.size() || mMaster; } // Getters / setters for parameters - DeviceType GetDeviceType() const { return (DeviceType)mDeviceBackendSettings.deviceType; } + DeviceType GetDeviceType() const; bool IsGPU() const { return GetDeviceType() != DeviceType::INVALID_DEVICE && GetDeviceType() != DeviceType::CPU; } - const GPUParam& GetParam() const { return mHostConstantMem->param; } + const GPUParam& GetParam() const; const GPUConstantMem& GetConstantMem() const { return *mHostConstantMem; } - const GPUSettingsGRP& GetGRPSettings() const { return mGRPSettings; } - const GPUSettingsDeviceBackend& GetDeviceBackendSettings() { return mDeviceBackendSettings; } - const GPUSettingsProcessing& GetProcessingSettings() const { return mProcessingSettings; } + const GPUTrackingInOutPointers GetIOPtrs() const; + const GPUSettingsGRP& GetGRPSettings() const { return *mGRPSettings; } + const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return *mDeviceBackendSettings; } + const 
GPUSettingsProcessing& GetProcessingSettings() const { return *mProcessingSettings; } + const GPUCalibObjectsConst& GetCalib() const; bool IsInitialized() const { return mInitialized; } void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow = nullptr); void SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec = nullptr, const GPUSettingsProcessing* proc = nullptr, const GPURecoStepConfiguration* workflow = nullptr); - void SetResetTimers(bool reset) { mProcessingSettings.resetTimers = reset; } // May update also after Init() - void SetDebugLevelTmp(int32_t level) { mProcessingSettings.debugLevel = level; } // Temporarily, before calling SetSettings() + void SetResetTimers(bool reset); // May update also after Init() + void SetDebugLevelTmp(int32_t level); // Temporarily, before calling SetSettings() void UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p = nullptr, const GPUSettingsRecDynamic* d = nullptr); void UpdateDynamicSettings(const GPUSettingsRecDynamic* d); void SetOutputControl(const GPUOutputControl& v) { mOutputControl = v; } @@ -272,6 +281,7 @@ class GPUReconstruction size_t ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr* mem, InOutPointerType type, T** nonConstPtrs = nullptr); template T* AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr& u); + int16_t RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re); // Private helper functions to dump / load flat objects template @@ -292,17 +302,17 @@ class GPUReconstruction // Pointers to tracker classes GPUConstantMem* processors() { return mHostConstantMem.get(); } const GPUConstantMem* processors() const { return mHostConstantMem.get(); } - GPUParam& param() { return mHostConstantMem->param; } + GPUParam& param(); std::unique_ptr mHostConstantMem; GPUConstantMem* mDeviceConstantMem = nullptr; // Settings - GPUSettingsGRP 
mGRPSettings; // Global Run Parameters - GPUSettingsDeviceBackend mDeviceBackendSettings; // Processing Parameters (at constructor level) - GPUSettingsProcessing mProcessingSettings; // Processing Parameters (at init level) - GPUOutputControl mOutputControl; // Controls the output of the individual components - GPUOutputControl mInputControl; // Prefefined input memory location for reading standalone dumps - std::unique_ptr mMemoryScalers; // Scalers how much memory will be needed + std::unique_ptr mGRPSettings; // Global Run Parameters + std::unique_ptr mDeviceBackendSettings; // Processing Parameters (at constructor level) + std::unique_ptr mProcessingSettings; // Processing Parameters (at init level) + GPUOutputControl mOutputControl; // Controls the output of the individual components + GPUOutputControl mInputControl; // Prefefined input memory location for reading standalone dumps + std::unique_ptr mMemoryScalers; // Scalers how much memory will be needed GPURecoStepConfiguration mRecoSteps; @@ -392,35 +402,6 @@ class GPUReconstruction static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg); }; -template -inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr& u) -{ - if (n == 0) { - u.reset(nullptr); - return nullptr; - } - T* retVal; - if (mInputControl.useExternal()) { - u.reset(nullptr); - mInputControl.checkCurrent(); - GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n); - if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) { - throw std::bad_alloc(); - } - } else { - u.reset(new T[n]); - retVal = u.get(); - if (mProcessingSettings.registerStandaloneInputMemory) { - if (registerMemoryForGPU(u.get(), n * sizeof(T))) { - GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T))); - throw std::bad_alloc(); - } - } - } - ptr = retVal; - return retVal; -} - template 
inline T* GPUReconstruction::AddChain(Args... args) { @@ -431,31 +412,7 @@ inline T* GPUReconstruction::AddChain(Args... args) template inline int16_t GPUReconstruction::RegisterMemoryAllocation(T* proc, void* (T::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re) { - if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) { - if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !mProcessingSettings.keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory - type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU); - } else { - type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU; - } - } - if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) { - type &= ~GPUMemoryResource::MEMORY_GPU; - } - mMemoryResources.emplace_back(proc, static_cast(setPtr), (GPUMemoryResource::MemoryType)type, name); - if (mMemoryResources.size() >= 32768) { - throw std::bad_alloc(); - } - uint16_t retVal = mMemoryResources.size() - 1; - if (re.type != GPUMemoryReuse::NONE && !mProcessingSettings.disableMemoryReuse) { - const auto& it = mMemoryReuse1to1.find(re.id); - if (it == mMemoryReuse1to1.end()) { - mMemoryReuse1to1[re.id] = {proc, retVal}; - } else { - mMemoryResources[retVal].mReuse = it->second.res[0]; - it->second.res.emplace_back(retVal); - } - } - return retVal; + return RegisterMemoryAllocationHelper(proc, static_cast(setPtr), type, name, re); } template @@ -471,7 +428,7 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate) { static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class"); if (allocate) { - proc->SetMaxData(mHostConstantMem->ioPtrs); + proc->SetMaxData(GetIOPtrs()); } if (proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && proc->mLinkedProcessor) { std::memcpy((void*)proc->mLinkedProcessor, (const void*)proc, sizeof(*proc)); diff --git 
a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 39507beda8a55..2453ce4a2328f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -30,19 +30,18 @@ #include "GPUTRDTrackletLabels.h" #include "GPUMemoryResource.h" #include "GPUConstantMem.h" +#include "GPULogging.h" #include "GPUMemorySizeScalers.h" +#include "GPUReconstructionProcessingKernels.inc" + #include #include -#define GPUCA_LOGGING_PRINTF -#include "GPULogging.h" - #ifndef _WIN32 #include #endif using namespace o2::gpu; -using namespace o2::gpu::gpu_reconstruction_kernels; constexpr GPUReconstructionCPU::krnlRunRange GPUReconstructionCPU::krnlRunRangeNone; constexpr GPUReconstructionCPU::krnlEvent GPUReconstructionCPU::krnlEventNone; @@ -55,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU() } template -inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args) { auto& x = _xyz.x; auto& y = _xyz.y; @@ -67,7 +66,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu } int32_t nThreads = getNKernelHostThreads(false); if (nThreads > 1) { - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { printf("Running %d Threads\n", mThreading->activeThreads->max_concurrency()); } tbb::this_task_arena::isolate([&] { @@ -89,7 +88,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu } template <> -inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { int32_t nThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true))); if (nThreads > 1) { @@ -110,7 +109,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal -void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args) { #pragma GCC diagnostic push #if defined(__clang__) @@ -121,14 +120,14 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs -gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) +GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) { if (gpu == -1) { gpu = IsGPU(); } const auto num = GetKernelNum(); const auto* p = gpu ? mParDevice : mParCPU; - gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]}; + GPUReconstructionProcessing::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]}; if (ret.nThreads == 0) { ret.nThreads = gpu ? 
mThreadCount : 1u; } @@ -138,9 +137,9 @@ gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProper return ret; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ - template void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \ - template krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + template void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args); \ + template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL @@ -169,7 +168,7 @@ size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, i if (!(res.mType & GPUMemoryResource::MEMORY_GPU) || (res.mType & GPUMemoryResource::MEMORY_CUSTOM_TRANSFER)) { continue; } - if (!mProcessingSettings.keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) { + if (!GetProcessingSettings().keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) { continue; } if (toGPU) { @@ -197,7 +196,7 @@ int32_t GPUReconstructionCPU::InitDevice() { mActiveHostKernelThreads = mMaxHostThreads; mThreading->activeThreads = std::make_unique(mActiveHostKernelThreads); - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { if (mMaster == nullptr) { if (mDeviceMemorySize > mHostMemorySize) { mHostMemorySize = mDeviceMemorySize; @@ -207,7 +206,7 @@ int32_t GPUReconstructionCPU::InitDevice() mHostMemoryPermanent = mHostMemoryBase; ClearAllocatedMemory(); } - if (mProcessingSettings.inKernelParallel) { + if (GetProcessingSettings().inKernelParallel) { mBlockCount = mMaxHostThreads; } mProcShadow.mProcessorsProc = processors(); @@ -216,7 +215,7 @@ int32_t 
GPUReconstructionCPU::InitDevice() int32_t GPUReconstructionCPU::ExitDevice() { - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { if (mMaster == nullptr) { operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); } @@ -232,13 +231,13 @@ int32_t GPUReconstructionCPU::RunChains() mStatNEvents++; mNEventsProcessed++; - if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) { + if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) { printf("Allocated memory when starting processing %34s", ""); PrintMemoryOverview(); } mTimerTotal.Start(); const std::clock_t cpuTimerStart = std::clock(); - if (mProcessingSettings.doublePipeline) { + if (GetProcessingSettings().doublePipeline) { int32_t retVal = EnqueuePipeline(); if (retVal) { return retVal; @@ -259,7 +258,7 @@ int32_t GPUReconstructionCPU::RunChains() } mTimerTotal.Stop(); mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC; - if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) { + if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) { printf("Allocated memory when ending processing %36s", ""); PrintMemoryOverview(); } @@ -281,7 +280,7 @@ int32_t GPUReconstructionCPU::RunChains() for (int32_t j = 0; j < mTimers[i]->num; j++) { HighResTimer& timer = mTimers[i]->timer[j]; time += timer.GetElapsedTime(); - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { timer.Reset(); } } @@ -297,7 +296,7 @@ int32_t GPUReconstructionCPU::RunChains() snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); } printf("Execution Time: Task (%c %8ux): %50s Time: 
%'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; } @@ -317,7 +316,7 @@ int32_t GPUReconstructionCPU::RunChains() printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); } - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; mTimersRecoSteps[i].timerToGPU.Reset(); mTimersRecoSteps[i].timerToHost.Reset(); @@ -340,7 +339,7 @@ int32_t GPUReconstructionCPU::RunChains() } else if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); } - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { mStatNEvents = 0; mStatCPUTime = 0; mTimerTotal.Reset(); @@ -366,7 +365,7 @@ void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, cons if (!((size_t)¶m().occupancyTotal - (size_t)¶m().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) { throw std::runtime_error("occupancy data not consecutive in GPUParam"); } - const auto threadContext = GetThreadContext(); + const auto holdContext = GetThreadContext(); size_t tmp[2] = {(size_t)mapGPU, 0}; memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal)); 
WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index b37bf2b75f01c..d0d8b05c4af0e 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -16,14 +16,9 @@ #define GPURECONSTRUCTIONICPU_H #include "GPUReconstructionProcessing.h" -#include "GPUConstantMem.h" #include #include -#include "GPUGeneralKernels.h" -#include "GPUReconstructionKernelIncludes.h" -#include "GPUReconstructionKernels.h" - namespace Ort { struct SessionOptions; @@ -32,20 +27,7 @@ struct SessionOptions; namespace o2::gpu { -class GPUReconstructionCPUBackend : public GPUReconstructionProcessing -{ - public: - ~GPUReconstructionCPUBackend() override = default; - - protected: - GPUReconstructionCPUBackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing(cfg) {} - template - void runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args); - template - void runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args); -}; - -class GPUReconstructionCPU : public GPUReconstructionKernels +class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface { friend GPUReconstruction* GPUReconstruction::GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg); friend class GPUChain; @@ -55,10 +37,10 @@ class GPUReconstructionCPU : public GPUReconstructionKernels - void runKernel(krnlSetup&& setup, Args&&... 
args); template - gpu_reconstruction_kernels::krnlProperties getKernelProperties(int gpu = -1); + krnlProperties getKernelProperties(int gpu = -1); + template + void runKernelBackend(const krnlSetupArgs& args); virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false); int32_t GPUStuck() { return mGPUStuck; } @@ -75,21 +57,10 @@ class GPUReconstructionCPU : public GPUReconstructionKernels, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \ - { \ - krnlSetupArgs args(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward)); \ - const uint32_t num = GetKernelNum(); \ - if (cpuFallback) { \ - GPUReconstructionCPU::runKernelImpl(num, &args); \ - } else { \ - runKernelImpl(num, &args); \ - } \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL + GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg) {} + + template + void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override { return 0; } int32_t unregisterMemoryForGPU_internal(const void* ptr) override { return 0; } @@ -132,72 +103,10 @@ class GPUReconstructionCPU : public GPUReconstructionKernels + void runKernelInterface(krnlSetup&& setup, Args const&... args); }; -template -inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args) -{ - HighResTimer* t = nullptr; - GPUDataTypes::RecoStep myStep = S::GetRecoStep() == GPUDataTypes::RecoStep::NoRecoStep ? setup.x.step : S::GetRecoStep(); - if (myStep == GPUDataTypes::RecoStep::NoRecoStep) { - throw std::runtime_error("Failure running general kernel without defining RecoStep"); - } - int32_t cpuFallback = IsGPU() ? (setup.x.device == krnlDeviceType::CPU ? 
2 : (mRecoSteps.stepsGPUMask & myStep) != myStep) : 0; - uint32_t& nThreads = setup.x.nThreads; - uint32_t& nBlocks = setup.x.nBlocks; - const uint32_t stream = setup.x.stream; - auto prop = getKernelProperties(); - const int32_t autoThreads = cpuFallback ? 1 : prop.nThreads; - const int32_t autoBlocks = cpuFallback ? 1 : (prop.forceBlocks ? prop.forceBlocks : (prop.minBlocks * mBlockCount)); - if (nBlocks == (uint32_t)-1) { - nBlocks = (nThreads + autoThreads - 1) / autoThreads; - nThreads = autoThreads; - } else if (nBlocks == (uint32_t)-2) { - nBlocks = nThreads; - nThreads = autoThreads; - } else if (nBlocks == (uint32_t)-3) { - nBlocks = autoBlocks; - nThreads = autoThreads; - } else if ((int32_t)nThreads < 0) { - nThreads = cpuFallback ? 1 : -nThreads; - } - if (nThreads > GPUCA_MAX_THREADS) { - throw std::runtime_error("GPUCA_MAX_THREADS exceeded"); - } - if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str()); - } - if (nThreads == 0 || nBlocks == 0) { - return; - } - if (mProcessingSettings.debugLevel >= 1) { - t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream); - if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) { - t->Start(); - } - } - double deviceTimerTime = 0.; - runKernelImplWrapper(gpu_reconstruction_kernels::classArgument(), cpuFallback, deviceTimerTime, std::forward(setup), std::forward(args)...); - if (GPUDebug(GetKernelName(), stream, mProcessingSettings.serializeGPU & 1)) { - throw std::runtime_error("kernel failure"); - } - if (mProcessingSettings.debugLevel >= 1) { - if (t) { - if (deviceTimerTime != 0.) 
{ - t->AddTime(deviceTimerTime); - if (t->IsRunning()) { - t->Abort(); - } - } else if (t->IsRunning()) { - t->Stop(); - } - } - if (CheckErrorCodes(cpuFallback) && !mProcessingSettings.ignoreNonFatalGPUErrors) { - throw std::runtime_error("kernel error code"); - } - } -} - } // namespace o2::gpu #endif diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPUKernels.h b/GPU/GPUTracking/Base/GPUReconstructionCPUKernels.h new file mode 100644 index 0000000000000..837516a93b6ae --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionCPUKernels.h @@ -0,0 +1,98 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionCPUKernels.h +/// \author David Rohr + +#ifndef GPURECONSTRUCTIONICPUKERNELS_H +#define GPURECONSTRUCTIONICPUKERNELS_H + +#include "GPUReconstructionCPU.h" +#include "GPUSettings.h" +#include "GPULogging.h" + +namespace o2::gpu +{ + +template +inline void GPUReconstructionCPU::runKernelInterface(krnlSetup&& setup, Args const&... args) +{ + HighResTimer* t = nullptr; + GPUDataTypes::RecoStep myStep = S::GetRecoStep() == GPUDataTypes::RecoStep::NoRecoStep ? setup.x.step : S::GetRecoStep(); + if (myStep == GPUDataTypes::RecoStep::NoRecoStep) { + throw std::runtime_error("Failure running general kernel without defining RecoStep"); + } + int32_t cpuFallback = IsGPU() ? (setup.x.device == krnlDeviceType::CPU ? 
2 : (mRecoSteps.stepsGPUMask & myStep) != myStep) : 0; + uint32_t& nThreads = setup.x.nThreads; + uint32_t& nBlocks = setup.x.nBlocks; + const uint32_t stream = setup.x.stream; + auto prop = getKernelProperties(); + const int32_t autoThreads = cpuFallback ? 1 : prop.nThreads; + const int32_t autoBlocks = cpuFallback ? 1 : (prop.forceBlocks ? prop.forceBlocks : (prop.minBlocks * mBlockCount)); + if (nBlocks == (uint32_t)-1) { + nBlocks = (nThreads + autoThreads - 1) / autoThreads; + nThreads = autoThreads; + } else if (nBlocks == (uint32_t)-2) { + nBlocks = nThreads; + nThreads = autoThreads; + } else if (nBlocks == (uint32_t)-3) { + nBlocks = autoBlocks; + nThreads = autoThreads; + } else if ((int32_t)nThreads < 0) { + nThreads = cpuFallback ? 1 : -nThreads; + } + if (nThreads > GPUCA_MAX_THREADS) { + throw std::runtime_error("GPUCA_MAX_THREADS exceeded"); + } + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : (cpuFallback ? "CPU (fallback)" : mDeviceName.c_str())); + } + if (nThreads == 0 || nBlocks == 0) { + return; + } + if (GetProcessingSettings().debugLevel >= 1) { + t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? 
getHostThreadIndex() : stream); + if ((!GetProcessingSettings().deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) { + t->Start(); + } + } + double deviceTimerTime = 0.; + krnlSetupArgs argPack{{}, {{setup.x, setup.y, setup.z}, deviceTimerTime}, {args...}}; + const uint32_t num = GetKernelNum(); + if (cpuFallback) { + GPUReconstructionCPU::runKernelVirtual(num, &argPack); + } else { + runKernelVirtual(num, &argPack); + } + + if (GPUDebug(GetKernelName(), stream, GetProcessingSettings().serializeGPU & 1)) { + throw std::runtime_error("kernel failure"); + } + if (GetProcessingSettings().debugLevel >= 1) { + if (t) { + if (deviceTimerTime != 0.) { + t->AddTime(deviceTimerTime); + if (t->IsRunning()) { + t->Abort(); + } + } else if (t->IsRunning()) { + t->Stop(); + } + } + if (CheckErrorCodes(cpuFallback) && !GetProcessingSettings().ignoreNonFatalGPUErrors) { + throw std::runtime_error("kernel error code"); + } + } +} + +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx index e12ca7ec601ad..2dec88393f632 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx @@ -25,7 +25,7 @@ #include "GPUO2DataTypes.h" #include "GPUDataTypes.h" #include "GPUTPCGeometry.h" -#include "AliHLTTPCRawCluster.h" +#include "AliHLTTPCRawCluster.h" // TODO: Is this still needed at all, or can it be removed? 
#include "GPUParam.h" #include "GPULogging.h" #include diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx index b389e99a0b2bb..9962bdf3922c1 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx @@ -14,6 +14,7 @@ #include "GPUReconstructionDeviceBase.h" #include "GPUReconstructionIncludes.h" +#include "GPUConstantMem.h" #include "GPUTPCTracker.h" @@ -93,21 +94,21 @@ int32_t GPUReconstructionDeviceBase::InitDevice() // CPU_SET(0, &mask); // sched_setaffinity(0, sizeof(mask), &mask); - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { GPUError("Individual memory allocation strategy unsupported for device\n"); return (1); } - if (mProcessingSettings.nStreams > GPUCA_MAX_STREAMS) { - GPUError("Too many straems requested %d > %d\n", mProcessingSettings.nStreams, GPUCA_MAX_STREAMS); + if (GetProcessingSettings().nStreams > GPUCA_MAX_STREAMS) { + GPUError("Too many straems requested %d > %d\n", GetProcessingSettings().nStreams, GPUCA_MAX_STREAMS); return (1); } void* semLock = nullptr; - if (mProcessingSettings.globalInitMutex && GetGlobalLock(semLock)) { + if (GetProcessingSettings().globalInitMutex && GetGlobalLock(semLock)) { return (1); } - if (mProcessingSettings.deviceTimers) { + if (GetProcessingSettings().deviceTimers) { AddGPUEvents(mDebugEvents); } @@ -117,7 +118,7 @@ int32_t GPUReconstructionDeviceBase::InitDevice() return (1); } - if (mProcessingSettings.globalInitMutex) { + if (GetProcessingSettings().globalInitMutex) { ReleaseGlobalLock(semLock); } @@ -129,7 +130,7 @@ int32_t GPUReconstructionDeviceBase::InitDevice() mProcShadow.mMemoryResProcessors = RegisterMemoryAllocation(&mProcShadow, &GPUProcessorProcessors::SetPointersDeviceProcessor, 
GPUMemoryResource::MEMORY_PERMANENT | GPUMemoryResource::MEMORY_HOST, "Processors"); AllocateRegisteredMemory(mProcShadow.mMemoryResProcessors); - if (mMaster == nullptr || mProcessingSettings.debugLevel >= 2) { + if (mMaster == nullptr || GetProcessingSettings().debugLevel >= 2) { GPUInfo("GPU Tracker initialization successfull"); // Verbosity reduced because GPU backend will print GPUImportant message! } @@ -186,13 +187,15 @@ void GPUReconstructionDeviceBase::runConstantRegistrators() size_t GPUReconstructionDeviceBase::TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst) { if (!(res->Type() & GPUMemoryResource::MEMORY_GPU)) { - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Skipped transfer of non-GPU memory resource: %s", res->Name()); } return 0; } - if (mProcessingSettings.debugLevel >= 3 && (strcmp(res->Name(), "ErrorCodes") || mProcessingSettings.debugLevel >= 4)) { + if (GetProcessingSettings().debugLevel >= 3 && (strcmp(res->Name(), "ErrorCodes") || GetProcessingSettings().debugLevel >= 4)) { GPUInfo("Copying to %s: %s - %ld bytes", toGPU ? 
"GPU" : "Host", res->Name(), (int64_t)res->Size()); } return GPUMemCpy(dst, src, res->Size(), stream, toGPU, ev, evList, nEvents); } + +const GPUParam* GPUReconstructionDeviceBase::DeviceParam() const { return &mDeviceConstantMem->param; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h index f0e19f588e0f1..c8288f978f6ae 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h @@ -22,16 +22,12 @@ namespace o2::gpu { -#if !(defined(__CLING__) || defined(__ROOTCLING__) || defined(G__ROOT)) -extern template class GPUReconstructionKernels; -#endif - class GPUReconstructionDeviceBase : public GPUReconstructionCPU { public: ~GPUReconstructionDeviceBase() override; - const GPUParam* DeviceParam() const { return &mDeviceConstantMem->param; } + const GPUParam* DeviceParam() const; struct deviceConstantMemRegistration { deviceConstantMemRegistration(void* (*reg)()) { @@ -51,8 +47,6 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU int32_t unregisterMemoryForGPU_internal(const void* ptr) override; void unregisterRemainingRegisteredMemory(); - virtual const GPUTPCTracker* CPUTracker(int32_t iSector) { return &processors()->tpcTrackers[iSector]; } - int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override = 0; size_t TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst) override; size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override = 0; diff --git a/GPU/GPUTracking/Base/GPUReconstructionIO.h b/GPU/GPUTracking/Base/GPUReconstructionIO.h index 2208c15846e09..810ebfffe1703 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionIO.h +++ 
b/GPU/GPUTracking/Base/GPUReconstructionIO.h @@ -16,10 +16,40 @@ #define GPURECONSTRUCTIONIO_H #include "GPUReconstruction.h" +#include "GPUSettings.h" namespace o2::gpu { +template +inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr& u) +{ + if (n == 0) { + u.reset(nullptr); + return nullptr; + } + T* retVal; + if (mInputControl.useExternal()) { + u.reset(nullptr); + mInputControl.checkCurrent(); + GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n); + if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) { + throw std::bad_alloc(); + } + } else { + u.reset(new T[n]); + retVal = u.get(); + if (GetProcessingSettings().registerStandaloneInputMemory) { + if (registerMemoryForGPU(u.get(), n * sizeof(T))) { + GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T))); + throw std::bad_alloc(); + } + } + } + ptr = retVal; + return retVal; +} + template inline uint32_t GPUReconstruction::DumpData(FILE* fp, const T* const* entries, const S* num, InOutPointerType type) { @@ -38,7 +68,7 @@ inline uint32_t GPUReconstruction::DumpData(FILE* fp, const T* const* entries, c fwrite(entries[i], sizeof(*entries[i]), num[i], fp); } } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Dumped %ld %s", (int64_t)numTotal, IOTYPENAMES[type]); } return numTotal; @@ -72,7 +102,7 @@ inline size_t GPUReconstruction::ReadData(FILE* fp, const T** entries, S* num, s numTotal += num[i]; } (void)r; - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld %s", (int64_t)numTotal, IOTYPENAMES[type]); } return numTotal; @@ -112,7 +142,7 @@ inline std::unique_ptr GPUReconstruction::ReadFlatObjectFromFile(const char* r = fread((void*)retVal.get(), 1, size[0], fp); r = fread(buf, 1, size[1], fp); fclose(fp); - if 
(mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld bytes from %s", (int64_t)r, file); } retVal->clearInternalBufferPtr(); @@ -151,7 +181,7 @@ inline std::unique_ptr GPUReconstruction::ReadStructFromFile(const char* file std::unique_ptr newObj(new T); r = fread(newObj.get(), 1, size, fp); fclose(fp); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld bytes from %s", (int64_t)r, file); } return newObj; @@ -172,7 +202,7 @@ inline int32_t GPUReconstruction::ReadStructFromFile(const char* file, T* obj) } r = fread(obj, 1, size, fp); fclose(fp); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld bytes from %s", (int64_t)r, file); } return 0; diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h index b3f6c6ec817fd..2b16dfb32fe14 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h @@ -65,7 +65,7 @@ // GPU Host wrappers for kernel #define GPUCA_KRNL_HOST(x_class, ...) \ GPUCA_KRNLGPU(x_class, __VA_ARGS__) \ - template <> class GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::backendInternal { \ + template <> class GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::backendInternal { \ public: \ template \ static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \ diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernels.h b/GPU/GPUTracking/Base/GPUReconstructionKernels.h deleted file mode 100644 index 7f500d471de1f..0000000000000 --- a/GPU/GPUTracking/Base/GPUReconstructionKernels.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. 
-// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file GPUReconstructionKernels.h -/// \author David Rohr - -#ifndef GPURECONSTRUCTIONKERNELS_H -#define GPURECONSTRUCTIONKERNELS_H - -#include "GPUReconstruction.h" - -namespace o2::gpu -{ - -namespace gpu_reconstruction_kernels -{ - -template -struct classArgument { - using t = T; - static constexpr int32_t i = I; -}; - -struct krnlExec { - constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto) : nBlocks(b), nThreads(t), stream(s), device(d), step(GPUDataTypes::RecoStep::NoRecoStep) {} - constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(GPUReconstruction::krnlDeviceType::Auto), step(st) {} - constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(d), step(st) {} - uint32_t nBlocks; - uint32_t nThreads; - int32_t stream; - GPUReconstruction::krnlDeviceType device; - GPUDataTypes::RecoStep step; -}; -struct krnlRunRange { - constexpr krnlRunRange() = default; - constexpr krnlRunRange(uint32_t v) : index(v) {} - uint32_t index = 0; -}; -struct krnlEvent { - constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {} - deviceEvent* ev; - deviceEvent* evList; - int32_t nEvents; -}; - -struct krnlProperties { - krnlProperties(int32_t t = 0, int32_t b = 1, int32_t b2 = 0) : nThreads(t), minBlocks(b), forceBlocks(b2) {} - uint32_t nThreads; - uint32_t minBlocks; - uint32_t forceBlocks; - uint32_t 
total() { return forceBlocks ? forceBlocks : (nThreads * minBlocks); } -}; - -struct krnlSetup { - krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {} - krnlExec x; - krnlRunRange y; - krnlEvent z; -}; - -struct krnlSetupTime : public krnlSetup { - double& t; -}; - -template -struct krnlSetupArgs : public gpu_reconstruction_kernels::classArgument { - krnlSetupArgs(const krnlExec& xx, const krnlRunRange& yy, const krnlEvent& zz, double& tt, const Args&... args) : s{{xx, yy, zz}, tt}, v(args...) {} - const krnlSetupTime s; - std::tuple sizeof(void*)), const Args&, const Args>::type...> v; -}; - -} // namespace gpu_reconstruction_kernels - -template -class GPUReconstructionKernels : public T -{ - public: - GPUReconstructionKernels(const GPUSettingsDeviceBackend& cfg) : T(cfg) {} - - protected: - using deviceEvent = gpu_reconstruction_kernels::deviceEvent; - using krnlExec = gpu_reconstruction_kernels::krnlExec; - using krnlRunRange = gpu_reconstruction_kernels::krnlRunRange; - using krnlEvent = gpu_reconstruction_kernels::krnlEvent; - using krnlSetup = gpu_reconstruction_kernels::krnlSetup; - using krnlSetupTime = gpu_reconstruction_kernels::krnlSetupTime; - template - using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs; - - virtual void runKernelImpl(const int num, const void* args) - { - switch (num) { // clang-format off -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ - case x_num: { \ - const auto& args2 = *(const krnlSetupArgs*)args; \ - T::template runKernelBackend(args2); \ - break; \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL - } // clang-format on - } -}; - -} // namespace o2::gpu - -#endif diff --git a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx index aa01d26446b56..89517c612403b 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx +++ 
b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx @@ -24,6 +24,7 @@ #include "GPUReconstruction.h" #include "GPUReconstructionAvailableBackends.h" +#include "GPUSettings.h" #include "utils/qlibload.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx index d02309f66c762..a511102a492ef 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx @@ -15,6 +15,9 @@ #include "GPUReconstructionProcessing.h" #include "GPUReconstructionThreading.h" #include "GPUDefParametersLoad.inc" +#include "GPUReconstructionKernelIncludes.h" +#include "GPUSettings.h" +#include "GPULogging.h" using namespace o2::gpu; @@ -41,7 +44,7 @@ GPUReconstructionProcessing::~GPUReconstructionProcessing() int32_t GPUReconstructionProcessing::getNKernelHostThreads(bool splitCores) { int32_t nThreads = 0; - if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) { + if (GetProcessingSettings().inKernelParallel == 2 && mNActiveThreadsOuterLoop) { if (splitCores) { nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop; nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop; @@ -50,7 +53,7 @@ int32_t GPUReconstructionProcessing::getNKernelHostThreads(bool splitCores) } nThreads = std::max(1, nThreads); } else { - nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1; + nThreads = GetProcessingSettings().inKernelParallel ? mMaxHostThreads : 1; } return nThreads; } @@ -59,7 +62,7 @@ void GPUReconstructionProcessing::SetNActiveThreads(int32_t n) { mActiveHostKernelThreads = std::max(1, n < 0 ? 
mMaxHostThreads : std::min(n, mMaxHostThreads)); mThreading->activeThreads = std::make_unique(mActiveHostKernelThreads); - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n); } } @@ -80,12 +83,12 @@ void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThr uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) { - if (condition && mProcessingSettings.inKernelParallel != 1) { - mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; + if (condition && GetProcessingSettings().inKernelParallel != 1) { + mNActiveThreadsOuterLoop = GetProcessingSettings().inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; } else { mNActiveThreadsOuterLoop = 1; } - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop); } return mNActiveThreadsOuterLoop; @@ -132,9 +135,9 @@ uint32_t GPUReconstructionProcessing::getNextTimerId() return id.fetch_add(1); } -std::unique_ptr GPUReconstructionProcessing::GetThreadContext() +std::unique_ptr GPUReconstructionProcessing::GetThreadContext() { - return std::make_unique(); + return std::make_unique(); } gpu_reconstruction_kernels::threadContext::threadContext() = default; diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 4ce8bc1b42743..9e611e57148c6 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -16,7 +16,6 @@ #define GPURECONSTRUCTIONPROCESSING_H #include "GPUReconstruction.h" -#include "GPUReconstructionKernelIncludes.h" #include "utils/timer.h" #include @@ -32,7 +31,7 @@ namespace 
o2::gpu struct GPUDefParameters; -namespace gpu_reconstruction_kernels +namespace gpu_reconstruction_kernels // TODO: Get rid of this namespace { struct deviceEvent { constexpr deviceEvent() = default; @@ -72,6 +71,86 @@ class GPUReconstructionProcessing : public GPUReconstruction public: ~GPUReconstructionProcessing() override; + using deviceEvent = gpu_reconstruction_kernels::deviceEvent; + using threadContext = gpu_reconstruction_kernels::threadContext; + + struct RecoStepTimerMeta { + HighResTimer timerToGPU; + HighResTimer timerToHost; + HighResTimer timerTotal; + double timerCPU = 0.; + size_t bytesToGPU = 0; + size_t bytesToHost = 0; + uint32_t countToGPU = 0; + uint32_t countToHost = 0; + }; + + template + struct kernelInterfaceArguments { + using t = T; + static constexpr int32_t i = I; + }; + + struct krnlExec { + constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto) : nBlocks(b), nThreads(t), stream(s), device(d), step(GPUDataTypes::RecoStep::NoRecoStep) {} + constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(GPUReconstruction::krnlDeviceType::Auto), step(st) {} + constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(d), step(st) {} + uint32_t nBlocks; + uint32_t nThreads; + int32_t stream; + GPUReconstruction::krnlDeviceType device; + GPUDataTypes::RecoStep step; + }; + struct krnlRunRange { + constexpr krnlRunRange() = default; + constexpr krnlRunRange(uint32_t v) : index(v) {} + uint32_t index = 0; + }; + struct krnlEvent { + constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {} + deviceEvent* ev; + deviceEvent* evList; + int32_t nEvents; + }; + + struct krnlProperties { + krnlProperties(int32_t t = 0, int32_t b = 1, int32_t 
b2 = 0) : nThreads(t), minBlocks(b), forceBlocks(b2) {} + uint32_t nThreads; + uint32_t minBlocks; + uint32_t forceBlocks; + uint32_t total() { return forceBlocks ? forceBlocks : (nThreads * minBlocks); } + }; + + struct krnlSetup { + krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {} + krnlExec x; + krnlRunRange y; + krnlEvent z; + }; + + struct krnlSetupTime : public krnlSetup { + double& t; + }; + + template + struct krnlSetupArgs : public kernelInterfaceArguments { + const krnlSetupTime s; + std::tuple sizeof(void*)), const Args&, const Args>::type...> v; + }; + + template + class KernelInterface : public S + { + public: + template + KernelInterface(const Args&... args) : S(args...) + { + } + + protected: + virtual void runKernelVirtual(const int num, const void* args); + }; + // Threading int32_t getNKernelHostThreads(bool splitCores); uint32_t getNActiveThreadsOuterLoop() const { return mNActiveThreadsOuterLoop; } @@ -94,23 +173,12 @@ class GPUReconstructionProcessing : public GPUReconstruction template void AddGPUEvents(T*& events); - virtual std::unique_ptr GetThreadContext() override; + virtual std::unique_ptr GetThreadContext() override; - struct RecoStepTimerMeta { - HighResTimer timerToGPU; - HighResTimer timerToHost; - HighResTimer timerTotal; - double timerCPU = 0.; - size_t bytesToGPU = 0; - size_t bytesToHost = 0; - uint32_t countToGPU = 0; - uint32_t countToHost = 0; - }; const GPUDefParameters& getGPUParameters(bool doGPU) const override { return *(doGPU ? 
mParDevice : mParCPU); } protected: GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg); - using deviceEvent = gpu_reconstruction_kernels::deviceEvent; static const std::vector mKernelNames; @@ -181,7 +249,7 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu static int32_t id = getNextTimerId(); timerMeta* timer = getTimerById(id); if (timer == nullptr) { - int32_t max = std::max({mMaxHostThreads, mProcessingSettings.nStreams}); + int32_t max = std::max({mMaxHostThreads, GPUCA_MAX_STREAMS}); timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep); } if (num == -1) { diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc new file mode 100644 index 0000000000000..49d02515372b8 --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc @@ -0,0 +1,41 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUReconstructionProcessingKernels.h +/// \author David Rohr + +#ifndef GPURECONSTRUCTIONPROCESSINGKERNELS_H +#define GPURECONSTRUCTIONPROCESSINGKERNELS_H + +#include "GPUReconstructionProcessing.h" +#include "GPUReconstructionKernelIncludes.h" + +namespace o2::gpu +{ + +template +void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args) +{ + switch (num) { // clang-format off +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ + case x_num: { \ + const auto& args2 = *(const krnlSetupArgs*)args; \ + ((T*)this)->template runKernelBackend(args2); \ + break; \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL + } // clang-format on +} + +} // namespace o2::gpu + +#endif // GPURECONSTRUCTIONPROCESSINGKERNELS_H diff --git a/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx b/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx index 4693a1eff24f2..b25b93e957b15 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx @@ -14,6 +14,8 @@ #include "GPUReconstructionTimeframe.h" #include "GPUReconstruction.h" +#include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "display/GPUDisplayInterface.h" #include "GPUQA.h" #include "AliHLTTPCClusterMCData.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index fe2906caace80..970b331ea99fb 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -24,6 +24,7 @@ #include "GPUParamRTC.h" #include "GPUReconstructionCUDAHelpers.inc" #include "GPUDefParametersLoad.inc" +#include "GPUReconstructionProcessingKernels.inc" #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 #include "utils/qGetLdBinarySymbols.h" @@ -53,16 +54,23 @@ __global__ void dummyInitKernel(void*) {} #include 
"GPUReconstructionIncludesITS.h" -GPUReconstructionCUDABackend::GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionDeviceBase(cfg, sizeof(GPUReconstructionDeviceBase)) +GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg, sizeof(GPUReconstructionDeviceBase)) { if (mMaster == nullptr) { mInternals = new GPUReconstructionCUDAInternals; *mParDevice = o2::gpu::internal::GPUDefParametersLoad(); } - mDeviceBackendSettings.deviceType = DeviceType::CUDA; + mDeviceBackendSettings->deviceType = DeviceType::CUDA; +#ifndef __HIPCC__ // CUDA + mRtcSrcExtension = ".cu"; + mRtcBinExtension = ".fatbin"; +#else // HIP + mRtcSrcExtension = ".hip"; + mRtcBinExtension = ".o"; +#endif } -GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend() +GPUReconstructionCUDA::~GPUReconstructionCUDA() { Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit CUDA if (mMaster == nullptr) { @@ -71,23 +79,11 @@ GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend() } static_assert(sizeof(cudaError_t) <= sizeof(int64_t) && cudaSuccess == 0); -int32_t GPUReconstructionCUDABackend::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const +int32_t GPUReconstructionCUDA::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const { return internal::GPUReconstructionCUDAChkErr(error, file, line); } -GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) -{ -#ifndef __HIPCC__ // CUDA - mRtcSrcExtension = ".cu"; - mRtcBinExtension = ".fatbin"; -#else // HIP - mRtcSrcExtension = ".hip"; - mRtcBinExtension = ".o"; -#endif -} -GPUReconstructionCUDA::~GPUReconstructionCUDA() = default; - GPUReconstruction* GPUReconstruction_Create_CUDA(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionCUDA(cfg); } void 
GPUReconstructionCUDA::GetITSTraits(std::unique_ptr* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) @@ -109,7 +105,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() constexpr int32_t reqVerMaj = 2; constexpr int32_t reqVerMin = 0; #endif - if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) { + if (GetProcessingSettings().rtc.enable && GetProcessingSettings().rtctech.runTest == 2) { mWarpSize = GPUCA_WARP_SIZE; genAndLoadRTC(); exit(0); @@ -123,14 +119,14 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUError("Error getting CUDA Device Count"); return (1); } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Available CUDA devices:"); } std::vector devicesOK(count, false); std::vector devMemory(count, 0); bool contextCreated = false; for (int32_t i = 0; i < count; i++) { - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Examining device %d", i); } size_t free, total; @@ -139,14 +135,14 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() #else // HIP if (GPUChkErrI(hipSetDevice(i))) { #endif - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUWarning("Couldn't create context for device %d. Skipping it.", i); } continue; } contextCreated = true; if (GPUChkErrI(cudaMemGetInfo(&free, &total))) { - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUWarning("Error obtaining CUDA memory info about device %d! 
Skipping it.", i); } GPUChkErr(cudaDeviceReset()); @@ -156,13 +152,13 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErr(cudaDeviceReset()); contextCreated = false; } - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Obtained current memory usage for device %d", i); } if (GPUChkErrI(cudaGetDeviceProperties(&deviceProp, i))) { continue; } - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Obtained device properties for device %d", i); } int32_t deviceOK = true; @@ -179,7 +175,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() } deviceSpeed = (double)deviceProp.multiProcessorCount * (double)deviceProp.clockRate * (double)deviceProp.warpSize * (double)free * (double)deviceProp.major * (double)deviceProp.major; - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUImportant("Device %s%2d: %s (Rev: %d.%d - Mem Avail %lu / %lu)%s %s", deviceOK ? " " : "[", i, deviceProp.name, deviceProp.major, deviceProp.minor, free, (size_t)deviceProp.totalGlobalMem, deviceOK ? " " : " ]", deviceOK ? 
"" : deviceFailure); } if (!deviceOK) { @@ -191,7 +187,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() bestDevice = i; bestDeviceSpeed = deviceSpeed; } else { - if (mProcessingSettings.debugLevel >= 2 && mProcessingSettings.deviceNum < 0) { + if (GetProcessingSettings().debugLevel >= 2 && GetProcessingSettings().deviceNum < 0) { GPUInfo("Skipping: Speed %f < %f\n", deviceSpeed, bestDeviceSpeed); } } @@ -204,15 +200,15 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUImportant("Requiring Revision %d.%d, Mem: %lu", reqVerMaj, reqVerMin, std::max(mDeviceMemorySize, REQUIRE_MIN_MEMORY)); #endif noDevice = true; - } else if (mProcessingSettings.deviceNum > -1) { - if (mProcessingSettings.deviceNum >= (signed)count) { - GPUError("Requested device ID %d does not exist", mProcessingSettings.deviceNum); + } else if (GetProcessingSettings().deviceNum > -1) { + if (GetProcessingSettings().deviceNum >= (signed)count) { + GPUError("Requested device ID %d does not exist", GetProcessingSettings().deviceNum); noDevice = true; - } else if (!devicesOK[mProcessingSettings.deviceNum]) { - GPUError("Unsupported device requested (%d)", mProcessingSettings.deviceNum); + } else if (!devicesOK[GetProcessingSettings().deviceNum]) { + GPUError("Unsupported device requested (%d)", GetProcessingSettings().deviceNum); noDevice = true; } else { - bestDevice = mProcessingSettings.deviceNum; + bestDevice = GetProcessingSettings().deviceNum; } } if (noDevice) { @@ -225,7 +221,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaGetDeviceProperties(&deviceProp, mDeviceId)); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Using CUDA Device %s with Properties:", deviceProp.name); GPUInfo("\ttotalGlobalMem = %ld", (uint64_t)deviceProp.totalGlobalMem); GPUInfo("\tsharedMemPerBlock = %ld", (uint64_t)deviceProp.sharedMemPerBlock); @@ -244,7 +240,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() 
GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment); GPUInfo(" "); } - if (deviceProp.warpSize != GPUCA_WARP_SIZE && !mProcessingSettings.rtc.enable) { + if (deviceProp.warpSize != GPUCA_WARP_SIZE && !GetProcessingSettings().rtc.enable) { throw std::runtime_error("Invalid warp size on GPU"); } mWarpSize = deviceProp.warpSize; @@ -280,7 +276,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaDeviceReset()); return (1); } - if (GPUChkErrI(cudaDeviceSetLimit(cudaLimitMallocHeapSize, mProcessingSettings.deterministicGPUReconstruction ? std::max(1024 * 1024 * 1024, GPUCA_GPU_HEAP_SIZE) : GPUCA_GPU_HEAP_SIZE))) { + if (GPUChkErrI(cudaDeviceSetLimit(cudaLimitMallocHeapSize, GetProcessingSettings().deterministicGPUReconstruction ? std::max(1024 * 1024 * 1024, GPUCA_GPU_HEAP_SIZE) : GPUCA_GPU_HEAP_SIZE))) { GPUError("Error setting CUDA stack size"); GPUChkErrI(cudaDeviceReset()); return (1); @@ -302,7 +298,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() mDeviceMemorySize = mDeviceMemorySize * 2 / 3; // Leave 1/3 of GPU memory for event display } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Allocating memory on GPU"); } if (mDeviceMemorySize > deviceProp.totalGlobalMem || GPUChkErrI(cudaMalloc(&mDeviceMemoryBase, mDeviceMemorySize))) { @@ -312,7 +308,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaDeviceReset()); return (1); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Allocating memory on Host"); } if (GPUChkErrI(cudaMallocHost(&mHostMemoryBase, mHostMemorySize))) { @@ -320,7 +316,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaDeviceReset()); return (1); } - if (mProcessingSettings.debugLevel >= 1) { + if (GetProcessingSettings().debugLevel >= 1) { GPUInfo("Memory ptrs: GPU (%ld bytes): %p - Host (%ld bytes): %p", (int64_t)mDeviceMemorySize, 
mDeviceMemoryBase, (int64_t)mHostMemorySize, mHostMemoryBase); memset(mHostMemoryBase, 0xDD, mHostMemorySize); if (GPUChkErrI(cudaMemset(mDeviceMemoryBase, 0xDD, mDeviceMemorySize))) { @@ -344,7 +340,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() hipLaunchKernelGGL(HIP_KERNEL_NAME(dummyInitKernel), dim3(mBlockCount), dim3(256), 0, 0, mDeviceMemoryBase); #endif - if (mProcessingSettings.rtc.enable) { + if (GetProcessingSettings().rtc.enable) { genAndLoadRTC(); } #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 @@ -419,17 +415,17 @@ void GPUReconstructionCUDA::genAndLoadRTC() throw std::runtime_error("Runtime compilation failed"); } for (uint32_t i = 0; i < nCompile; i++) { - if (mProcessingSettings.rtctech.runTest != 2) { + if (GetProcessingSettings().rtctech.runTest != 2) { mInternals->kernelModules.emplace_back(std::make_unique()); GPUChkErr(cuModuleLoad(mInternals->kernelModules.back().get(), (filename + "_" + std::to_string(i) + mRtcBinExtension).c_str())); } remove((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); remove((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str()); } - if (mProcessingSettings.rtctech.runTest == 2) { + if (GetProcessingSettings().rtctech.runTest == 2) { return; } - loadKernelModules(mProcessingSettings.rtc.compilePerKernel); + loadKernelModules(GetProcessingSettings().rtc.compilePerKernel); } int32_t GPUReconstructionCUDA::ExitDevice_Runtime() @@ -472,7 +468,7 @@ int32_t GPUReconstructionCUDA::ExitDevice_Runtime() size_t GPUReconstructionCUDA::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { stream = -1; } if (stream == -1) { @@ -490,7 +486,7 @@ size_t GPUReconstructionCUDA::GPUMemCpy(void* dst, const void* src, size_t size, if (ev) { GPUChkErr(cudaEventRecord(ev->get(), 
mInternals->Streams[stream == -1 ? 0 : stream])); } - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug(("GPUMemCpy " + std::to_string(toGPU)).c_str(), stream, true); } return size; @@ -512,7 +508,7 @@ size_t GPUReconstructionCUDA::WriteToConstantMemory(size_t offset, const void* s if (ev && stream != -1) { GPUChkErr(cudaEventRecord(ev->get(), mInternals->Streams[stream])); } - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug("WriteToConstantMemory", stream, true); } return size; @@ -521,7 +517,7 @@ size_t GPUReconstructionCUDA::WriteToConstantMemory(size_t offset, const void* s void GPUReconstructionCUDA::ReleaseEvent(deviceEvent ev) {} void GPUReconstructionCUDA::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(cudaEventRecord(ev->get(), mInternals->Streams[stream])); } -std::unique_ptr GPUReconstructionCUDA::GetThreadContext() +std::unique_ptr GPUReconstructionCUDA::GetThreadContext() { GPUChkErr(cudaSetDevice(mDeviceId)); return GPUReconstructionProcessing::GetThreadContext(); @@ -565,14 +561,14 @@ int32_t GPUReconstructionCUDA::GPUDebug(const char* state, int32_t stream, bool GPUError("CUDA Error %s while running (%s) (Stream %d)", cudaGetErrorString(cuErr), state, stream); return (1); } - if (!force && mProcessingSettings.debugLevel <= 0) { + if (!force && GetProcessingSettings().debugLevel <= 0) { return (0); } if (GPUChkErrI(stream == -1 ? 
cudaDeviceSynchronize() : cudaStreamSynchronize(mInternals->Streams[stream]))) { GPUError("CUDA Error while synchronizing (%s) (Stream %d)", state, stream); return (1); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("GPU Sync Done"); } return (0); @@ -580,7 +576,7 @@ int32_t GPUReconstructionCUDA::GPUDebug(const char* state, int32_t stream, bool int32_t GPUReconstructionCUDA::registerMemoryForGPU_internal(const void* ptr, size_t size) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Registering %zu bytes of memory for GPU", size); } return GPUChkErrI(cudaHostRegister((void*)ptr, size, cudaHostRegisterDefault)); @@ -591,7 +587,7 @@ int32_t GPUReconstructionCUDA::unregisterMemoryForGPU_internal(const void* ptr) return GPUChkErrI(cudaHostUnregister((void*)ptr)); } -void GPUReconstructionCUDABackend::PrintKernelOccupancies() +void GPUReconstructionCUDA::PrintKernelOccupancies() { int32_t maxBlocks = 0, threads = 0, suggestedBlocks = 0, nRegs = 0, sMem = 0; GPUChkErr(cudaSetDevice(mDeviceId)); @@ -612,7 +608,7 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) GPUFatal("kernel numbers out of sync"); \ } \ mInternals->kernelFunctions.emplace_back(new CUfunction); \ - if (mProcessingSettings.debugLevel >= 3) { \ + if (GetProcessingSettings().debugLevel >= 3) { \ GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \ } \ GPUChkErr(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? 
j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \ @@ -680,8 +676,3 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options #endif // ORT_ROCM_BUILD } #endif // __HIPCC__ - -namespace o2::gpu -{ -template class GPUReconstructionKernels; -} diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 2fc4d14bba491..ed75100dfe351 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -16,6 +16,7 @@ #define GPURECONSTRUCTIONCUDA_H #include "GPUReconstructionDeviceBase.h" +#include "GPUCommonAlgorithm.h" #include #include @@ -34,38 +35,31 @@ namespace o2::gpu { struct GPUReconstructionCUDAInternals; -class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase +class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterface { public: - ~GPUReconstructionCUDABackend() override; - - protected: - GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg); + GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg); + ~GPUReconstructionCUDA() override; void PrintKernelOccupancies() override; virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override; template void runKernelBackend(const krnlSetupArgs& args); - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); + + protected: GPUReconstructionCUDAInternals* mInternals; -}; -class GPUReconstructionCUDA : public GPUReconstructionKernels -{ - public: - ~GPUReconstructionCUDA() override; - GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg); + template + void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args); - protected: int32_t InitDevice_Runtime() override; int32_t ExitDevice_Runtime() override; - std::unique_ptr GetThreadContext() override; + std::unique_ptr GetThreadContext() override; void SynchronizeGPU() override; int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override; void SynchronizeStream(int32_t stream) override; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index abcd47ca01c90..5706f32e73e96 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -37,8 +37,8 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math); int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + - std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + - GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); + std::string(GetProcessingSettings().rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + + GPUParamRTC::generateRTCCode(param(), GetProcessingSettings().rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; } @@ -53,12 +53,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) kernelsall += kernels[i] + "\n"; } - std::string baseCommand = (mProcessingSettings.rtctech.prependCommand != "" ? (mProcessingSettings.rtctech.prependCommand + " ") : ""); + std::string baseCommand = (GetProcessingSettings().rtctech.prependCommand != "" ? (GetProcessingSettings().rtctech.prependCommand + " ") : ""); baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? 
std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len)); - baseCommand += std::string(" ") + (mProcessingSettings.rtctech.overrideArchitecture != "" ? mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len)); + baseCommand += std::string(" ") + (GetProcessingSettings().rtctech.overrideArchitecture != "" ? GetProcessingSettings().rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len)); - if (mProcessingSettings.rtctech.loadLaunchBoundsFromFile.size()) { - FILE* fp = fopen(mProcessingSettings.rtctech.loadLaunchBoundsFromFile.c_str(), "rb"); + if (GetProcessingSettings().rtctech.loadLaunchBoundsFromFile.size()) { + FILE* fp = fopen(GetProcessingSettings().rtctech.loadLaunchBoundsFromFile.c_str(), "rb"); if (fp == nullptr) { throw std::runtime_error("Cannot open launch bounds parameter module file"); } @@ -75,12 +75,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) + "#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n"; - if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) { GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str()); } char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21]; - if (mProcessingSettings.rtc.cacheOutput) { + if (GetProcessingSettings().rtc.cacheOutput) { o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len); o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), 
rtcparam.size()); o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size()); @@ -88,16 +88,16 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size()); } - nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1; + nCompile = GetProcessingSettings().rtc.compilePerKernel ? kernels.size() : 1; bool cacheLoaded = false; int32_t fd = 0; - if (mProcessingSettings.rtc.cacheOutput) { - if (mProcessingSettings.rtctech.cacheFolder != ".") { - std::filesystem::create_directories(mProcessingSettings.rtctech.cacheFolder); + if (GetProcessingSettings().rtc.cacheOutput) { + if (GetProcessingSettings().rtctech.cacheFolder != ".") { + std::filesystem::create_directories(GetProcessingSettings().rtctech.cacheFolder); } - if (mProcessingSettings.rtctech.cacheMutex) { + if (GetProcessingSettings().rtctech.cacheMutex) { mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; - fd = open((mProcessingSettings.rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask); + fd = open((GetProcessingSettings().rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask); if (fd == -1) { throw std::runtime_error("Error opening rtc cache mutex lock file"); } @@ -107,7 +107,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } } - FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb"); + FILE* fp = fopen((GetProcessingSettings().rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb"); char sharead[20]; if (fp) { size_t len; @@ -116,7 +116,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) if (fread(sharead, 1, 20, fp) != 20) { throw std::runtime_error("Cache file corrupt"); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { char 
shaprint1[41], shaprint2[41]; for (uint32_t i = 0; i < 20; i++) { sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]); @@ -124,7 +124,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2); } - if (!mProcessingSettings.rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) { + if (!GetProcessingSettings().rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) { GPUInfo("Cache file content outdated (%s)", name); return 1; } @@ -142,7 +142,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) { throw std::runtime_error("Cache file corrupt"); } - if (!mProcessingSettings.rtctech.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) { + if (!GetProcessingSettings().rtctech.ignoreCacheValid && !(cachedSettings == GetProcessingSettings().rtc)) { GPUInfo("Cache file content outdated (rtc parameters)"); break; } @@ -172,13 +172,13 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } } if (!cacheLoaded) { - if (mProcessingSettings.debugLevel >= 0) { + if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Starting CUDA RTC Compilation"); } HighResTimer rtcTimer; rtcTimer.ResetStart(); tbb::parallel_for(0, nCompile, [&](auto i) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); } FILE* fp = fopen((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str(), "w+b"); @@ -187,10 +187,10 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } std::string kernel = "extern \"C\" {"; - kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall; + kernel += GetProcessingSettings().rtc.compilePerKernel ? 
kernels[i] : kernelsall; kernel += "}"; - bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end()); + bool deterministic = GetProcessingSettings().rtc.deterministic || (GetProcessingSettings().rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end()); const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n"); if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() || @@ -206,26 +206,26 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len); } command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension; - if (mProcessingSettings.debugLevel < 0) { + if (GetProcessingSettings().debugLevel < 0) { command += " &> /dev/null"; - } else if (mProcessingSettings.debugLevel < 2) { + } else if (GetProcessingSettings().debugLevel < 2) { command += " > /dev/null"; } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { printf("Running command %s\n", command.c_str()); } if (system(command.c_str())) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { printf("Source code file: %s", filename.c_str()); } throw std::runtime_error("Error during CUDA compilation"); } // clang-format off }, tbb::simple_partitioner()); // clang-format on - if (mProcessingSettings.debugLevel >= 0) { + if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("RTC Compilation 
finished (%f seconds)", rtcTimer.GetCurrentElapsedTime()); } - if (mProcessingSettings.rtc.cacheOutput) { - FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b"); + if (GetProcessingSettings().rtc.cacheOutput) { + FILE* fp = fopen((GetProcessingSettings().rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b"); if (fp == nullptr) { throw std::runtime_error("Cannot open cache file for writing"); } @@ -236,7 +236,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) fwrite(shacmd, 1, 20, fp) != 20 || fwrite(shakernels, 1, 20, fp) != 20 || fwrite(shabounds, 1, 20, fp) != 20 || - fwrite(&mProcessingSettings.rtc, sizeof(mProcessingSettings.rtc), 1, fp) != 1) { + fwrite(&GetProcessingSettings().rtc, sizeof(GetProcessingSettings().rtc), 1, fp) != 1) { throw std::runtime_error("Error writing cache file"); } @@ -263,7 +263,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) fclose(fp); } } - if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtctech.cacheMutex) { + if (GetProcessingSettings().rtc.cacheOutput && GetProcessingSettings().rtctech.cacheMutex) { if (lockf(fd, F_ULOCK, 0)) { throw std::runtime_error("Error unlocking RTC cache mutex file"); } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h index f3fc21243ef0e..0813c9d22ea09 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h @@ -44,7 +44,7 @@ struct GPUReconstructionCUDAInternals { class GPUDebugTiming { public: - GPUDebugTiming(bool d, gpu_reconstruction_kernels::deviceEvent* t, cudaStream_t* s, const gpu_reconstruction_kernels::krnlSetupTime& x, GPUReconstructionCUDABackend* r) : mDeviceTimers(t), mStreams(s), mXYZ(x), mRec(r), mDo(d) + GPUDebugTiming(bool d, GPUReconstructionProcessing::deviceEvent* t, cudaStream_t* s, const 
GPUReconstructionProcessing::krnlSetupTime& x, GPUReconstructionCUDA* r) : mDeviceTimers(t), mStreams(s), mXYZ(x), mRec(r), mDo(d) { if (mDo) { if (mDeviceTimers) { @@ -71,10 +71,10 @@ class GPUDebugTiming } private: - gpu_reconstruction_kernels::deviceEvent* mDeviceTimers; + GPUReconstructionProcessing::deviceEvent* mDeviceTimers; cudaStream_t* mStreams; - const gpu_reconstruction_kernels::krnlSetupTime& mXYZ; - GPUReconstructionCUDABackend* mRec; + const GPUReconstructionProcessing::krnlSetupTime& mXYZ; + GPUReconstructionCUDA* mRec; HighResTimer mTimer; bool mDo; }; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index d53f7cbd81ca9..11a62bcec2318 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -22,21 +22,17 @@ using namespace o2::gpu; #include "GPUReconstructionIncludesDeviceAll.h" +#include "GPUReconstructionCUDAKernelsSpecialize.inc" + #if defined(__HIPCC__) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUConstantMemBuffer; } #endif -template <> -inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) -{ - GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream])); -} - template -inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args) { #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1 - if (!mProcessingSettings.rtc.enable) { + if (!GetProcessingSettings().rtc.enable) { backendInternal::runKernelBackendMacro(_xyz, this, args...); } else #endif @@ -56,7 +52,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet } template -void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args) { auto& x = args.s.x; auto& z = args.s.z; @@ -66,7 +62,7 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this); + GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this); std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal(args.s, vals...); }, args.v); } GPUChkErr(cudaGetLastError()); @@ -79,7 +75,7 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); #else // ---------- COMPILE_MODE = onefile | rdc ---------- #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc @@ -87,7 +83,7 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); + template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); #ifndef __HIPCC__ // CUDA version #define GPUCA_KRNL_CALL(x_class, ...) 
\ diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc new file mode 100644 index 0000000000000..899c2e240cd94 --- /dev/null +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -0,0 +1,138 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionCUDAKernelsSpecialize.inc +/// \author David Rohr + +#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) + +namespace o2::gpu::internal +{ +namespace // anonymous +{ +struct MergeBorderTracks_compMax { + GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) + { + return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); + } +}; +struct MergeBorderTracks_compMin { + GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) + { + return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? 
(a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); + } +}; + +struct GPUTPCGMMergerSortTracks_comp { + const GPUTPCGMMergedTrack* const mCmp; + GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} + GPUd() bool operator()(const int32_t aa, const int32_t bb) + { + const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; + const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; + if (a.CCE() != b.CCE()) { + return a.CCE() > b.CCE(); + } + if (a.Legs() != b.Legs()) { + return a.Legs() > b.Legs(); + } + GPUCA_DETERMINISTIC_CODE( // clang-format off + if (a.NClusters() != b.NClusters()) { + return a.NClusters() > b.NClusters(); + } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + } if (a.GetParam().GetY() != b.GetParam().GetY()) { + return a.GetParam().GetY() > b.GetParam().GetY(); + } + return aa > bb; + , // !GPUCA_DETERMINISTIC_CODE + return a.NClusters() > b.NClusters(); + ) // clang-format on + } +}; + +struct GPUTPCGMMergerSortTracksQPt_comp { + const GPUTPCGMMergedTrack* const mCmp; + GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} + GPUd() bool operator()(const int32_t aa, const int32_t bb) + { + const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; + const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; + GPUCA_DETERMINISTIC_CODE( // clang-format off + if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + } if (a.GetParam().GetY() != b.GetParam().GetY()) { + return a.GetParam().GetY() > b.GetParam().GetY(); + } + return a.GetParam().GetZ() > b.GetParam().GetZ(); + , // !GPUCA_DETERMINISTIC_CODE + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + ) // clang-format on + } +}; + +struct GPUTPCGMMergerMergeLoopers_comp { + GPUd() bool operator()(const 
MergeLooperParam& a, const MergeLooperParam& b) + { + return CAMath::Abs(a.refz) < CAMath::Abs(b.refz); + } +}; + +struct GPUTPCGMO2OutputSort_comp { + GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) + { + return (a.y > b.y); + } +}; + +} // anonymous namespace +} // namespace o2::gpu::internal + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) +{ + if (cmpMax) { + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); + } else { + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin()); + } +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, 
mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); +} +#endif // GPUCA_SPECIALIZE_THRUST_SORTS + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +{ + GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream])); +} diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index d7adb222d547b..d29a6afb60899 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -24,7 +24,7 @@ message(STATUS "Building GPUTracking with HIP support ${TMP_TARGET}") if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) - set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu GPUReconstructionCUDARTCCalls.cu) + set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDAKernelsSpecialize.inc GPUReconstructionCUDArtc.cu GPUReconstructionCUDARTCCalls.cu) set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesSystem.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index e276f83413bbc..28c809dd4a09a 100644 
--- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUReconstructionOCLIncludesHost.h" +#include "GPUReconstructionProcessingKernels.inc" #include "GPUDefParametersLoad.inc" #include @@ -33,16 +34,16 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionOCLCode_spirv); GPUReconstruction* GPUReconstruction_Create_OCL(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionOCL(cfg); } -GPUReconstructionOCLBackend::GPUReconstructionOCLBackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionDeviceBase(cfg, sizeof(GPUReconstructionDeviceBase)) +GPUReconstructionOCL::GPUReconstructionOCL(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg, sizeof(GPUReconstructionDeviceBase)) { if (mMaster == nullptr) { mInternals = new GPUReconstructionOCLInternals; *mParDevice = o2::gpu::internal::GPUDefParametersLoad(); } - mDeviceBackendSettings.deviceType = DeviceType::OCL; + mDeviceBackendSettings->deviceType = DeviceType::OCL; } -GPUReconstructionOCLBackend::~GPUReconstructionOCLBackend() +GPUReconstructionOCL::~GPUReconstructionOCL() { Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit if (mMaster == nullptr) { @@ -51,7 +52,7 @@ GPUReconstructionOCLBackend::~GPUReconstructionOCLBackend() } static_assert(sizeof(cl_int) <= sizeof(int64_t) && CL_SUCCESS == 0); -int32_t GPUReconstructionOCLBackend::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const +int32_t GPUReconstructionOCL::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const { // Check for OPENCL Error and in the case of an error display the corresponding error string if (error != CL_SUCCESS) { @@ -60,7 +61,7 @@ int32_t GPUReconstructionOCLBackend::GPUChkErrInternal(const int64_t error, cons return error != CL_SUCCESS; } -int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() 
+int32_t GPUReconstructionOCL::InitDevice_Runtime() { if (mMaster == nullptr) { cl_int ocl_error; @@ -71,7 +72,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() if (num_platforms == 0) { GPUErrorReturn("No OpenCL Platform found"); } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("%d OpenCL Platforms found", num_platforms); } @@ -118,17 +119,17 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() cl_uint deviceCount, bestDevice = (cl_uint)-1, bestPlatform = (cl_uint)-1; for (uint32_t iPlatform = 0; iPlatform < num_platforms; iPlatform++) { - if (mProcessingSettings.oclPlatformNum >= 0) { - if (mProcessingSettings.oclPlatformNum >= (int32_t)num_platforms) { + if (GetProcessingSettings().oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= (int32_t)num_platforms) { GPUErrorReturn("Invalid platform specified"); } - iPlatform = mProcessingSettings.oclPlatformNum; + iPlatform = GetProcessingSettings().oclPlatformNum; } std::string platformUsageInfo; bool platformCompatible = false; queryPlatform(platforms[iPlatform]); if (clGetDeviceIDs(platforms[iPlatform], CL_DEVICE_TYPE_ALL, 0, nullptr, &deviceCount) != CL_SUCCESS) { - if (mProcessingSettings.oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= 0) { GPUErrorReturn("No device in requested platform or error obtaining device count"); } platformUsageInfo += " - no devices"; @@ -139,32 +140,32 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() } } - if (mProcessingSettings.oclPlatformNum >= 0 || mProcessingSettings.debugLevel >= 2) { - GPUInfo("%s Platform %d: (%s %s) %s %s (Compatible: %s)%s", mProcessingSettings.oclPlatformNum >= 0 ? "Enforced" : "Available", iPlatform, platform_profile.c_str(), platform_version.c_str(), platform_vendor.c_str(), platform_name.c_str(), platformCompatible ? "yes" : "no", mProcessingSettings.debugLevel >= 2 ? 
platformUsageInfo.c_str() : ""); + if (GetProcessingSettings().oclPlatformNum >= 0 || GetProcessingSettings().debugLevel >= 2) { + GPUInfo("%s Platform %d: (%s %s) %s %s (Compatible: %s)%s", GetProcessingSettings().oclPlatformNum >= 0 ? "Enforced" : "Available", iPlatform, platform_profile.c_str(), platform_version.c_str(), platform_vendor.c_str(), platform_name.c_str(), platformCompatible ? "yes" : "no", GetProcessingSettings().debugLevel >= 2 ? platformUsageInfo.c_str() : ""); } - if (platformCompatible || mProcessingSettings.oclPlatformNum >= 0 || (mProcessingSettings.oclPlatformNum == -2 && deviceCount)) { + if (platformCompatible || GetProcessingSettings().oclPlatformNum >= 0 || (GetProcessingSettings().oclPlatformNum == -2 && deviceCount)) { if (deviceCount > devices.size()) { devices.resize(deviceCount); } if (clGetDeviceIDs(platforms[iPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices.data(), nullptr) != CL_SUCCESS) { - if (mProcessingSettings.oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= 0) { GPUErrorReturn("Error getting OpenCL devices"); } continue; } for (uint32_t i = 0; i < deviceCount; i++) { - if (mProcessingSettings.deviceNum >= 0) { - if (mProcessingSettings.deviceNum >= (signed)deviceCount) { - GPUErrorReturn("Requested device ID %d does not exist", mProcessingSettings.deviceNum); + if (GetProcessingSettings().deviceNum >= 0) { + if (GetProcessingSettings().deviceNum >= (signed)deviceCount) { + GPUErrorReturn("Requested device ID %d does not exist", GetProcessingSettings().deviceNum); } - i = mProcessingSettings.deviceNum; + i = GetProcessingSettings().deviceNum; } bool deviceOK = true; queryDevice(devices[i]); std::string deviceFailure; - if (mProcessingSettings.gpuDeviceOnly && ((device_type & CL_DEVICE_TYPE_CPU) || !(device_type & CL_DEVICE_TYPE_GPU))) { + if (GetProcessingSettings().gpuDeviceOnly && ((device_type & CL_DEVICE_TYPE_CPU) || !(device_type & CL_DEVICE_TYPE_GPU))) { deviceOK = false; deviceFailure += " - No 
GPU device"; } @@ -193,12 +194,12 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() } double bestDeviceSpeed = -1, deviceSpeed = (double)device_freq * (double)device_shaders; - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo(" Device %s%2d: %s %s (Frequency %d, Shaders %d, %d bit) (Speed Value: %ld)%s %s", deviceOK ? " " : "[", i, device_vendor.c_str(), device_name.c_str(), (int32_t)device_freq, (int32_t)device_shaders, (int32_t)device_nbits, (int64_t)deviceSpeed, deviceOK ? " " : " ]", deviceOK ? "" : deviceFailure.c_str()); } if (!deviceOK) { - if (mProcessingSettings.deviceNum >= 0) { - GPUInfo("Unsupported device requested on platform %d: (%d)", iPlatform, mProcessingSettings.deviceNum); + if (GetProcessingSettings().deviceNum >= 0) { + GPUInfo("Unsupported device requested on platform %d: (%d)", iPlatform, GetProcessingSettings().deviceNum); break; } continue; @@ -209,12 +210,12 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() bestDeviceSpeed = deviceSpeed; mOclVersion = platform_version_f; } - if (mProcessingSettings.deviceNum >= 0) { + if (GetProcessingSettings().deviceNum >= 0) { break; } } } - if (mProcessingSettings.oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= 0) { break; } } @@ -238,7 +239,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() deviceVersion = query(clGetDeviceInfo, mInternals->device, CL_DEVICE_VERSION); int versionMajor, versionMinor; sscanf(deviceVersion.c_str(), "OpenCL %d.%d", &versionMajor, &versionMinor); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Using OpenCL platform %d / device %d: %s %s with properties:", bestPlatform, bestDevice, device_vendor.c_str(), device_name.c_str()); GPUInfo("\tVersion = %s", deviceVersion); GPUInfo("\tFrequency = %d", (int32_t)device_freq); @@ -271,7 +272,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() return 1; } - if 
(mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("OpenCL program and kernels loaded successfully"); } @@ -289,21 +290,21 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() } if (device_type & CL_DEVICE_TYPE_CPU) { - if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel >= 2) { GPUInfo("Disabling device timers for CPU device"); } - mProcessingSettings.deviceTimers = 0; + mProcessingSettings->deviceTimers = 0; } for (int32_t i = 0; i < mNStreams; i++) { #ifdef CL_VERSION_2_0 cl_queue_properties prop = 0; - if (versionMajor >= 2 && IsGPU() && mProcessingSettings.deviceTimers) { + if (versionMajor >= 2 && IsGPU() && GetProcessingSettings().deviceTimers) { prop |= CL_QUEUE_PROFILING_ENABLE; } mInternals->command_queue[i] = clCreateCommandQueueWithProperties(mInternals->context, mInternals->device, &prop, &ocl_error); - if (mProcessingSettings.deviceTimers && ocl_error == CL_INVALID_QUEUE_PROPERTIES) { + if (GetProcessingSettings().deviceTimers && ocl_error == CL_INVALID_QUEUE_PROPERTIES) { GPUError("GPU device timers not supported by OpenCL platform, disabling"); - mProcessingSettings.deviceTimers = 0; + mProcessingSettings->deviceTimers = 0; prop = 0; mInternals->command_queue[i] = clCreateCommandQueueWithProperties(mInternals->context, mInternals->device, &prop, &ocl_error); } @@ -351,7 +352,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() GPUErrorReturn("Error obtaining device memory ptr"); } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Mapping hostmemory"); } mHostMemoryBase = clEnqueueMapBuffer(mInternals->command_queue[0], mInternals->mem_host, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, mHostMemorySize, 0, nullptr, nullptr, &ocl_error); @@ -362,7 +363,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() mDeviceMemoryBase = 
((void**)mHostMemoryBase)[0]; mDeviceConstantMem = (GPUConstantMem*)((void**)mHostMemoryBase)[1]; - if (mProcessingSettings.debugLevel >= 1) { + if (GetProcessingSettings().debugLevel >= 1) { GPUInfo("Memory ptrs: GPU (%ld bytes): %p - Host (%ld bytes): %p", (int64_t)mDeviceMemorySize, mDeviceMemoryBase, (int64_t)mHostMemorySize, mHostMemoryBase); memset(mHostMemoryBase, 0xDD, mHostMemorySize); } @@ -386,7 +387,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() return (0); } -int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime() +int32_t GPUReconstructionOCL::ExitDevice_Runtime() { // Uninitialize OPENCL SynchronizeGPU(); @@ -418,12 +419,12 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime() return (0); } -size_t GPUReconstructionOCLBackend::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) +size_t GPUReconstructionOCL::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) { if (evList == nullptr) { nEvents = 0; } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { stream = -1; } if (stream == -1) { @@ -440,33 +441,33 @@ size_t GPUReconstructionOCLBackend::GPUMemCpy(void* dst, const void* src, size_t } else { GPUChkErr(clEnqueueReadBuffer(mInternals->command_queue[stream == -1 ? 
0 : stream], mInternals->mem_gpu, stream == -1, (char*)src - (char*)mDeviceMemoryBase, size, dst, nEvents, evList->getEventList(), ev->getEventList())); } - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug(("GPUMemCpy " + std::to_string(toGPU)).c_str(), stream, true); } return size; } -size_t GPUReconstructionOCLBackend::WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream, deviceEvent* ev) +size_t GPUReconstructionOCL::WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream, deviceEvent* ev) { if (stream == -1) { SynchronizeGPU(); } GPUChkErr(clEnqueueWriteBuffer(mInternals->command_queue[stream == -1 ? 0 : stream], mInternals->mem_constant, stream == -1, offset, size, src, 0, nullptr, ev->getEventList())); - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug("WriteToConstantMemory", stream, true); } return size; } -void GPUReconstructionOCLBackend::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEvent(ev.get())); } +void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEvent(ev.get())); } -void GPUReconstructionOCLBackend::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } +void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } -int32_t GPUReconstructionOCLBackend::DoStuckProtection(int32_t stream, deviceEvent event) +int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event) { - if (mProcessingSettings.stuckProtection) { + if (GetProcessingSettings().stuckProtection) { cl_int tmp = 0; - for (int32_t i = 0; i <= mProcessingSettings.stuckProtection / 50; i++) { + for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection 
/ 50; i++) { usleep(50); clGetEventInfo(event.get(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr); if (tmp == CL_COMPLETE) { @@ -483,25 +484,25 @@ int32_t GPUReconstructionOCLBackend::DoStuckProtection(int32_t stream, deviceEve return 0; } -void GPUReconstructionOCLBackend::SynchronizeGPU() +void GPUReconstructionOCL::SynchronizeGPU() { for (int32_t i = 0; i < mNStreams; i++) { GPUChkErr(clFinish(mInternals->command_queue[i])); } } -void GPUReconstructionOCLBackend::SynchronizeStream(int32_t stream) { GPUChkErr(clFinish(mInternals->command_queue[stream])); } +void GPUReconstructionOCL::SynchronizeStream(int32_t stream) { GPUChkErr(clFinish(mInternals->command_queue[stream])); } -void GPUReconstructionOCLBackend::SynchronizeEvents(deviceEvent* evList, int32_t nEvents) { GPUChkErr(clWaitForEvents(nEvents, evList->getEventList())); } +void GPUReconstructionOCL::SynchronizeEvents(deviceEvent* evList, int32_t nEvents) { GPUChkErr(clWaitForEvents(nEvents, evList->getEventList())); } -void GPUReconstructionOCLBackend::StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents) +void GPUReconstructionOCL::StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents) { if (nEvents) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], nEvents, evList->getEventList(), nullptr)); } } -bool GPUReconstructionOCLBackend::IsEventDone(deviceEvent* evList, int32_t nEvents) +bool GPUReconstructionOCL::IsEventDone(deviceEvent* evList, int32_t nEvents) { cl_int eventdone; for (int32_t i = 0; i < nEvents; i++) { @@ -513,10 +514,10 @@ bool GPUReconstructionOCLBackend::IsEventDone(deviceEvent* evList, int32_t nEven return true; } -int32_t GPUReconstructionOCLBackend::GPUDebug(const char* state, int32_t stream, bool force) +int32_t GPUReconstructionOCL::GPUDebug(const char* state, int32_t stream, bool force) { // Wait for OPENCL-Kernel to finish and check for OPENCL errors afterwards, in case of debugmode - if (!force && 
mProcessingSettings.debugLevel <= 0) { + if (!force && GetProcessingSettings().debugLevel <= 0) { return (0); } for (int32_t i = 0; i < mNStreams; i++) { @@ -524,13 +525,13 @@ int32_t GPUReconstructionOCLBackend::GPUDebug(const char* state, int32_t stream, GPUError("OpenCL Error while synchronizing (%s) (Stream %d/%d)", state, stream, i); } } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("GPU Sync Done"); } return (0); } -int32_t GPUReconstructionOCLBackend::GetOCLPrograms() +int32_t GPUReconstructionOCL::GetOCLPrograms() { cl_int ocl_error; @@ -571,7 +572,7 @@ int32_t GPUReconstructionOCLBackend::GetOCLPrograms() return AddKernels(); } -const char* GPUReconstructionOCLBackend::convertErrorToString(int32_t errorcode) +const char* GPUReconstructionOCL::convertErrorToString(int32_t errorcode) { static const std::map error_map = { {CL_SUCCESS, "CL_SUCCESS"}, diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index abde42f01f073..091bc0409630d 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -27,14 +27,16 @@ namespace o2::gpu { struct GPUReconstructionOCLInternals; -class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase +class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface { public: - ~GPUReconstructionOCLBackend() override; + GPUReconstructionOCL(const GPUSettingsDeviceBackend& cfg); + ~GPUReconstructionOCL() override; - protected: - GPUReconstructionOCLBackend(const GPUSettingsDeviceBackend& cfg); + template + void runKernelBackend(const krnlSetupArgs& args); + protected: int32_t InitDevice_Runtime() override; int32_t ExitDevice_Runtime() override; @@ -61,8 +63,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase GPUReconstructionOCLInternals* mInternals; float mOclVersion; - template - void 
runKernelBackend(const krnlSetupArgs& args); template S& getKernelObject(); @@ -78,7 +78,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase int32_t AddKernels(); }; -using GPUReconstructionOCL = GPUReconstructionKernels; } // namespace o2::gpu #endif diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h index 0bb2f25093789..919791948d6c3 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h @@ -54,7 +54,7 @@ struct GPUReconstructionOCLInternals { } // namespace o2::gpu template -inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) +inline int64_t GPUReconstructionOCL::OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) { int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter); if (retVal) { @@ -67,12 +67,12 @@ inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters_helper(cl_ker } template -inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) +inline int64_t GPUReconstructionOCL::OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) { return OCLsetKernelParameters_helper(kernel, 0, args...); } -inline int64_t GPUReconstructionOCLBackend::clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait, cl_int nWaitEvents) +inline int64_t GPUReconstructionOCL::clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait, cl_int nWaitEvents) { return clEnqueueNDRangeKernel(queue, krnl, 1, nullptr, &global_size, &local_size, wait == nullptr ? 
0 : nWaitEvents, wait, pEvent); } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx index cca634fba65fc..72c68428149dd 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -13,16 +13,12 @@ /// \author David Rohr #include "GPUReconstructionOCLIncludesHost.h" +#include "GPUReconstructionKernelIncludes.h" -template <> -inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) -{ - cl_int4 val0 = {0, 0, 0, 0}; - GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); -} +#include "GPUReconstructionOCLKernelsSpecialize.inc" template -inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args) { cl_kernel k = getKernelObject(); auto& x = _xyz.x; @@ -33,14 +29,14 @@ inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetu cl_event ev; cl_event* evr; bool tmpEvent = false; - if (z.ev == nullptr && mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + if (z.ev == nullptr && GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0) { evr = &ev; tmpEvent = true; } else { evr = (cl_event*)z.ev; } GPUChkErr(clExecuteKernelA(mInternals->command_queue[x.stream], k, x.nThreads, x.nThreads * x.nBlocks, evr, (cl_event*)z.evList, z.nEvents)); - if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + if (GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0) { cl_ulong time_start, time_end; GPUChkErr(clWaitForEvents(1, evr)); GPUChkErr(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, nullptr)); @@ -53,13 +49,13 @@ inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetu } template -void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args) { std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v); } template -int32_t GPUReconstructionOCLBackend::AddKernel() +int32_t GPUReconstructionOCL::AddKernel() { std::string name(GetKernelName()); std::string kname("krnl_" + name); @@ -75,12 +71,12 @@ int32_t GPUReconstructionOCLBackend::AddKernel() } template -S& GPUReconstructionOCLBackend::getKernelObject() +S& GPUReconstructionOCL::getKernelObject() { return mInternals->kernels[GetKernelNum()]; } -int32_t GPUReconstructionOCLBackend::AddKernels() +int32_t GPUReconstructionOCL::AddKernels() { #define GPUCA_KRNL(x_class, ...) 
\ if (AddKernel()) { \ @@ -91,6 +87,6 @@ int32_t GPUReconstructionOCLBackend::AddKernels() return 0; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc new file mode 100644 index 0000000000000..1b860e47a4243 --- /dev/null +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc @@ -0,0 +1,20 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionOCLKernelsSpecialize.inc +/// \author David Rohr + +template <> +inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +{ + cl_int4 val0 = {0, 0, 0, 0}; + GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 
0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); +} diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index e82799b9e59c3..0cd302cc0be94 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -113,7 +113,7 @@ set(HDRS_INSTALL Base/GPUReconstructionIO.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h - Base/GPUReconstructionKernels.h + Base/GPUReconstructionCPUKernels.h DataCompression/GPUTPCClusterRejection.h DataCompression/GPUTPCCompressionKernels.inc DataCompression/TPCClusterDecompressionCore.inc @@ -127,6 +127,7 @@ set(HDRS_INSTALL DataTypes/GPUTRDDef.h DataTypes/GPUTRDInterfaceO2Track.h DataTypes/GPUTriggerOutputs.h + DataTypes/GPUKernelClassesFwd.h Debug/GPUROOTDump.h Definitions/GPUDefConstantsAndSettings.h Definitions/GPUDefParametersWrapper.h @@ -137,6 +138,7 @@ set(HDRS_INSTALL Definitions/GPULogging.h Definitions/GPUSettingsList.h Global/GPUChainTrackingDefs.h + Global/GPUChainTrackingGetters.inc Global/GPUErrorCodes.h Merger/GPUTPCGMBorderTrack.h Merger/GPUTPCGMMergedTrack.h @@ -217,11 +219,11 @@ set(SRCS_NO_H ${SRCS_NO_H} set(HDRS_INSTALL ${HDRS_INSTALL} ITS/GPUITSTrack.h - TPCClusterFinder/Array2D.h + TPCClusterFinder/CfArray2D.h TPCClusterFinder/CfConsts.h TPCClusterFinder/CfFragment.h TPCClusterFinder/CfUtils.h - TPCClusterFinder/ChargePos.h + TPCClusterFinder/CfChargePos.h Definitions/clusterFinderDefs.h TPCClusterFinder/PackedCharge.h TPCClusterFinder/GPUTPCCFChainContext.h) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx index 8a22545314252..2a0c5b58d8a83 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx @@ -17,6 +17,7 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUDefParametersRuntime.h" +#include "GPUConstantMem.h" using namespace o2::gpu; @@ -123,7 
+124,7 @@ void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io) mMaxClusters = io.clustersNative->nClustersTotal; mMaxClusterFactorBase1024 = mMaxClusters > 100000000 ? mRec->MemoryScalers()->NTPCUnattachedHitsBase1024(mRec->GetParam().rec.tpc.rejectionStrategy) : 1024; mMaxClustersInCache = mMaxClusters * mMaxClusterFactorBase1024 / 1024; - mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); + mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); // TODO: Why is this not using ioPtrs? Could remove GPUConstantMem.h include mMaxTracks = mRec->GetConstantMem().tpcMerger.NOutputTracks(); if (mMaxClusters % 16) { mMaxClusters += 16 - (mMaxClusters % 16); diff --git a/GPU/GPUTracking/DataTypes/GPUKernelClassesFwd.h b/GPU/GPUTracking/DataTypes/GPUKernelClassesFwd.h new file mode 100644 index 0000000000000..405eb339dea3b --- /dev/null +++ b/GPU/GPUTracking/DataTypes/GPUKernelClassesFwd.h @@ -0,0 +1,40 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUKernelClassesFwd.h +/// \author David Rohr + +#ifndef GPUKERNELCLASSESFWDN_H +#define GPUKERNELCLASSESFWDN_H + +#include "GPUTRDDef.h" + +namespace o2::gpu +{ +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) 
class GPUCA_M_FIRST(GPUCA_M_STRIP(x_class)); +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL + +struct GPUTPCClusterOccupancyMapBin; +namespace gputpcgmmergertypes +{ +struct GPUTPCGMBorderRange; +} +struct GPUTPCLinearLabels; +struct CfChargePos; +} // namespace o2::gpu + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +#endif diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 1e99e3b73736f..5df324fcba648 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -16,11 +16,14 @@ #define GPUCHAIN_H #include "GPUReconstructionCPU.h" +#include "GPUReconstructionCPUKernels.h" +#include "GPUKernelClassesFwd.h" #include namespace o2::gpu { + class GPUChain { friend class GPUReconstruction; @@ -30,10 +33,10 @@ class GPUChain using GeneralStep = GPUReconstruction::GeneralStep; using InOutPointerType = GPUReconstruction::InOutPointerType; using GeometryType = GPUReconstruction::GeometryType; - using krnlRunRange = gpu_reconstruction_kernels::krnlRunRange; - using krnlExec = gpu_reconstruction_kernels::krnlExec; - using krnlEvent = gpu_reconstruction_kernels::krnlEvent; - using deviceEvent = gpu_reconstruction_kernels::deviceEvent; + using krnlRunRange = GPUReconstructionProcessing::krnlRunRange; + using krnlExec = GPUReconstructionProcessing::krnlExec; + using krnlEvent = GPUReconstructionProcessing::krnlEvent; + using deviceEvent = GPUReconstructionProcessing::deviceEvent; static constexpr krnlRunRange krnlRunRangeNone{0}; static constexpr krnlEvent krnlEventNone = krnlEvent{nullptr, nullptr, 0}; @@ -56,20 +59,20 @@ class GPUChain virtual void DumpSettings(const char* dir = "") {} virtual void ReadSettings(const char* dir = "") {} - const GPUParam& GetParam() const { return mRec->mHostConstantMem->param; } - const GPUSettingsGRP& GetGRPSettings() const { return mRec->mGRPSettings; } - const GPUCalibObjectsConst& calib() const { return processors()->calibObjects; } + 
const GPUParam& GetParam() const { return mRec->GetParam(); } + const GPUSettingsGRP& GetGRPSettings() const { return mRec->GetGRPSettings(); } + const GPUCalibObjectsConst& GetCalib() const { return mRec->GetCalib(); } GPUReconstruction* rec() { return mRec; } const GPUReconstruction* rec() const { return mRec; } - inline const GPUConstantMem* GetProcessors() { return mRec->processors(); } + inline const GPUConstantMem* GetProcessors() const { return mRec->processors(); } // Make functions from GPUReconstruction*** available GPUReconstruction::RecoStepField GetRecoSteps() const { return mRec->GetRecoSteps(); } GPUReconstruction::RecoStepField GetRecoStepsGPU() const { return mRec->GetRecoStepsGPU(); } GPUReconstruction::InOutTypeField GetRecoStepsInputs() const { return mRec->GetRecoStepsInputs(); } GPUReconstruction::InOutTypeField GetRecoStepsOutputs() const { return mRec->GetRecoStepsOutputs(); } - inline const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return mRec->mDeviceBackendSettings; } - inline const GPUSettingsProcessing& GetProcessingSettings() const { return mRec->mProcessingSettings; } + inline const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return mRec->GetDeviceBackendSettings(); } + inline const GPUSettingsProcessing& GetProcessingSettings() const { return mRec->GetProcessingSettings(); } protected: GPUReconstructionCPU* mRec; @@ -102,7 +105,7 @@ class GPUChain } inline bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return mRec->IsEventDone(evList, nEvents); } inline void RecordMarker(deviceEvent* ev, int32_t stream) { mRec->RecordMarker(ev, stream); } - virtual inline std::unique_ptr GetThreadContext() { return mRec->GetThreadContext(); } + virtual inline std::unique_ptr GetThreadContext() { return mRec->GetThreadContext(); } inline void SynchronizeGPU() { mRec->SynchronizeGPU(); } inline void ReleaseEvent(deviceEvent ev, bool doGPU = true) { @@ -171,13 +174,16 @@ class GPUChain { 
mRec->ReadStructFromFile(file, obj); } + template - inline void runKernel(gpu_reconstruction_kernels::krnlSetup&& setup, Args&&... args) + requires(sizeof(S) >= 0) // Yields better incomplete type errors than calling runKernelCallInterface directly + inline void runKernel(GPUReconstructionProcessing::krnlSetup&& setup, Args const&... args) { - return mRec->runKernel(std::forward(setup), std::forward(args)...); + runKernelCallInterface(std::forward(setup), args...); } + template - gpu_reconstruction_kernels::krnlProperties getKernelProperties() + GPUReconstructionProcessing::krnlProperties getKernelProperties() { return mRec->getKernelProperties(); } @@ -233,6 +239,16 @@ class GPUChain private: template void timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args); + +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + template \ + requires(std::is_same_v && I == S::GPUCA_M_FIRST(GPUCA_M_SHIFT(GPUCA_M_STRIP(x_class), defaultKernel))) \ + inline void runKernelCallInterface(GPUReconstructionProcessing::krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \ + { \ + mRec->runKernelInterface(std::forward(setup) GPUCA_M_STRIP(x_forward)); \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL }; template @@ -243,7 +259,7 @@ inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... 
} HighResTimer* timer = nullptr; size_t* bytes = nullptr; - if (mRec->mProcessingSettings.debugLevel >= 1 && toGPU >= 0) { // Todo: time special cases toGPU < 0 + if (mRec->GetProcessingSettings().debugLevel >= 1 && toGPU >= 0) { // Todo: time special cases toGPU < 0 int32_t id = mRec->getRecoStepNum(step, false); if (id != -1) { auto& tmp = mRec->mTimersRecoSteps[id]; diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx index 640b92a0eb0f4..eeead79b1840b 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.cxx +++ b/GPU/GPUTracking/Global/GPUChainITS.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUChainITS.h" +#include "GPUConstantMem.h" #include "DataFormatsITS/TrackITS.h" #include "ITStracking/ExternalAllocator.h" #include "GPUReconstructionIncludesITS.h" diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 43fa49ff74817..a3f9b996e070d 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -18,6 +18,8 @@ #include #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" +#include "GPUReconstructionIO.h" #include "GPUChainTrackingDefs.h" #include "GPUTPCClusterData.h" #include "GPUTPCSectorOutCluster.h" @@ -755,7 +757,7 @@ int32_t GPUChainTracking::RunChain() } } - if (GetProcessingSettings().trdTrackModelO2 ? 
runRecoStep(RecoStep::TRDTracking, &GPUChainTracking::RunTRDTracking) : runRecoStep(RecoStep::TRDTracking, &GPUChainTracking::RunTRDTracking)) { + if (runRecoStep(RecoStep::TRDTracking, &GPUChainTracking::RunTRDTracking)) { return 1; } diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 5779cec31130c..8664652b549e3 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -24,6 +24,12 @@ #include #include +namespace o2::dataformats +{ +template +class ConstMCTruthContainer; +} // namespace o2::dataformats + namespace o2::trd { class GeometryFlat; @@ -39,6 +45,9 @@ class CalibdEdxContainer; namespace o2::base { class MatLayerCylSet; +template +class PropagatorImpl; +using Propagator = PropagatorImpl; } // namespace o2::base namespace o2::gpu @@ -55,6 +64,8 @@ struct GPUChainTrackingFinalContext; struct GPUTPCCFChainContext; struct GPUNewCalibValues; struct GPUTriggerOutputs; +struct CfFragment; +class GPUTPCClusterFinder; class GPUChainTracking : public GPUChain { @@ -137,11 +148,6 @@ class GPUChainTracking : public GPUChain void ConvertZSFilter(bool zs12bit); // Getters for external usage of tracker classes - GPUTRDTrackerGPU* GetTRDTrackerGPU() { return &processors()->trdTrackerGPU; } - GPUTPCTracker* GetTPCSectorTrackers() { return processors()->tpcTrackers; } - const GPUTPCTracker* GetTPCSectorTrackers() const { return processors()->tpcTrackers; } - const GPUTPCGMMerger& GetTPCMerger() const { return processors()->tpcMerger; } - GPUTPCGMMerger& GetTPCMerger() { return processors()->tpcMerger; } GPUDisplayInterface* GetEventDisplay() { return mEventDisplay.get(); } const GPUQA* GetQA() const { return mQAFromForeignChain ? mQAFromForeignChain->mQA.get() : mQA.get(); } GPUQA* GetQA() { return mQAFromForeignChain ? 
mQAFromForeignChain->mQA.get() : mQA.get(); } @@ -155,7 +161,6 @@ class GPUChainTracking : public GPUChain int32_t ForwardTPCDigits(); int32_t RunTPCTrackingSectors(); int32_t RunTPCTrackingMerger(bool synchronizeOutput = true); - template int32_t RunTRDTracking(); template int32_t DoTRDGPUTracking(T* externalInstance = nullptr); @@ -164,22 +169,22 @@ class GPUChainTracking : public GPUChain int32_t RunRefit(); // Getters / setters for parameters - const CorrectionMapsHelper* GetTPCTransformHelper() const { return processors()->calibObjects.fastTransformHelper; } - const TPCPadGainCalib* GetTPCPadGainCalib() const { return processors()->calibObjects.tpcPadGain; } - const TPCZSLinkMapping* GetTPCZSLinkMapping() const { return processors()->calibObjects.tpcZSLinkMapping; } - const o2::tpc::CalibdEdxContainer* GetdEdxCalibContainer() const { return processors()->calibObjects.dEdxCalibContainer; } - const o2::base::MatLayerCylSet* GetMatLUT() const { return processors()->calibObjects.matLUT; } - const GPUTRDGeometry* GetTRDGeometry() const { return (GPUTRDGeometry*)processors()->calibObjects.trdGeometry; } - const o2::base::Propagator* GetO2Propagator() const { return processors()->calibObjects.o2Propagator; } + const CorrectionMapsHelper* GetTPCTransformHelper() const; + const TPCPadGainCalib* GetTPCPadGainCalib() const; + const TPCZSLinkMapping* GetTPCZSLinkMapping() const; + const o2::tpc::CalibdEdxContainer* GetdEdxCalibContainer() const; + const o2::base::MatLayerCylSet* GetMatLUT() const; + const GPUTRDGeometry* GetTRDGeometry() const; + const o2::base::Propagator* GetO2Propagator() const; const o2::base::Propagator* GetDeviceO2Propagator(); void SetTPCFastTransform(std::unique_ptr&& tpcFastTransform, std::unique_ptr&& tpcTransformHelper); void SetMatLUT(std::unique_ptr&& lut); void SetTRDGeometry(std::unique_ptr&& geo); - void SetMatLUT(const o2::base::MatLayerCylSet* lut) { processors()->calibObjects.matLUT = lut; } - void SetTRDGeometry(const 
o2::trd::GeometryFlat* geo) { processors()->calibObjects.trdGeometry = geo; } + void SetMatLUT(const o2::base::MatLayerCylSet* lut); + void SetTRDGeometry(const o2::trd::GeometryFlat* geo); void SetO2Propagator(const o2::base::Propagator* prop); - void SetCalibObjects(const GPUCalibObjectsConst& obj) { processors()->calibObjects = obj; } - void SetCalibObjects(const GPUCalibObjects& obj) { memcpy((void*)&processors()->calibObjects, (const void*)&obj, sizeof(obj)); } + void SetCalibObjects(const GPUCalibObjectsConst& obj); + void SetCalibObjects(const GPUCalibObjects& obj); void SetUpdateCalibObjects(const GPUCalibObjectsConst& obj, const GPUNewCalibValues& vals); void SetSubOutputControl(int32_t i, GPUOutputControl* v) { mSubOutputControls[i] = v; } void SetFinalInputCallback(std::function v) { mWaitForFinalInputs = v; } @@ -298,6 +303,8 @@ class GPUChainTracking : public GPUChain void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType); void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); + template + int32_t RunTRDTrackingInternal(); uint32_t StreamForSector(uint32_t sector) const; std::mutex mMutexUpdateCalib; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7db0ba66305e9..981d565852d28 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -19,11 +19,20 @@ #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" #include "GPUNewCalibValues.h" -#include - -#ifdef GPUCA_O2_LIB -#include "CommonDataFormat/InteractionRecord.h" -#endif +#include "GPUConstantMem.h" +#include "CfChargePos.h" +#include "CfArray2D.h" +#include "GPUGeneralKernels.h" +#include "GPUTPCCFStreamCompaction.h" +#include "GPUTPCCFChargeMapFiller.h" +#include 
"GPUTPCCFDecodeZS.h" +#include "GPUTPCCFCheckPadBaseline.h" +#include "GPUTPCCFPeakFinder.h" +#include "GPUTPCCFNoiseSuppression.h" +#include "GPUTPCCFDeconvolution.h" +#include "GPUTPCCFClusterizer.h" +#include "GPUTPCCFGather.h" +#include "GPUTPCCFMCLabelFlattener.h" #include "GPUTriggerOutputs.h" #include "GPUHostDataTypes.h" #include "GPUTPCCFChainContext.h" @@ -32,18 +41,24 @@ #include "DataFormatsTPC/Digit.h" #include "DataFormatsTPC/Constants.h" #include "TPCBase/RDHUtils.h" +#include "GPULogging.h" + +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCNNClusterizerHost.h" +#endif + +#ifdef GPUCA_O2_LIB +#include "CommonDataFormat/InteractionRecord.h" +#endif #include "utils/strtag.h" +#include #ifndef GPUCA_NO_VC #include #endif -#ifdef GPUCA_HAS_ONNX -#include "GPUTPCNNClusterizerKernels.h" -#include "GPUTPCNNClusterizerHost.h" -#endif - using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -791,7 +806,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) using ChargeMapType = decltype(*clustererShadow.mPchargeMap); using PeakMapType = decltype(*clustererShadow.mPpeakMap); - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
+ runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); if (fragment.index == 0) { runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 8fb6fc4771658..fc07a91004c5f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -19,6 +19,9 @@ #include "GPUTPCCFChainContext.h" #include "TPCClusterDecompressor.h" #include "GPUDefParametersRuntime.h" +#include "GPUConstantMem.h" // TODO: Try to get rid of as many GPUConstantMem includes as possible! 
+#include "GPUTPCCompressionKernels.h" +#include "GPUTPCDecompressionKernels.h" #include "utils/strtag.h" #include diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx index c42d9622f5332..5d05cd6a97776 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx @@ -15,6 +15,8 @@ #include "GPUChainTracking.h" #include "GPUTrackingInputProvider.h" #include "GPUMemorySizeScalers.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFilter.h" #include #include #include @@ -23,8 +25,6 @@ #include "bitmapfile.h" #endif -#include "GPUTPCClusterFilter.h" - #define PROFILE_MAX_SIZE (100 * 1024 * 1024) using namespace o2::gpu; @@ -209,7 +209,7 @@ void GPUChainTracking::PrintDebugOutput() void GPUChainTracking::PrintOutputStat() { int32_t nTracks = 0, nAttachedClusters = 0, nAttachedClustersFitted = 0, nAdjacentClusters = 0; - uint32_t nCls = GetProcessingSettings().doublePipeline ? mIOPtrs.clustersNative->nClustersTotal : GetTPCMerger().NMaxClusters(); + uint32_t nCls = GetProcessingSettings().doublePipeline ? mIOPtrs.clustersNative->nClustersTotal : processors()->tpcMerger.NMaxClusters(); if (GetProcessingSettings().createO2Output > 1) { nTracks = mIOPtrs.nOutputTracksTPCO2; nAttachedClusters = mIOPtrs.nMergedTrackHits; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDefs.h b/GPU/GPUTracking/Global/GPUChainTrackingDefs.h index dc1a665e6052c..e02419955001a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDefs.h +++ b/GPU/GPUTracking/Global/GPUChainTrackingDefs.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
-/// \file GPUChainTracking.h +/// \file GPUChainTrackingDefs.h /// \author David Rohr #ifndef GPUCHAINTRACKINGDEFS_H diff --git a/GPU/GPUTracking/Global/GPUChainTrackingGetters.inc b/GPU/GPUTracking/Global/GPUChainTrackingGetters.inc new file mode 100644 index 0000000000000..5b72a8f23c242 --- /dev/null +++ b/GPU/GPUTracking/Global/GPUChainTrackingGetters.inc @@ -0,0 +1,36 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUChainTrackingGetters.inc +/// \author David Rohr + +#ifndef GPUCHAINTRACKINGGETTERS_INC_H +#define GPUCHAINTRACKINGGETTERS_INC_H + +#include "GPUChainTracking.h" +#include "GPUConstantMem.h" + +namespace o2::gpu +{ +inline const CorrectionMapsHelper* GPUChainTracking::GetTPCTransformHelper() const { return processors()->calibObjects.fastTransformHelper; } +inline const TPCPadGainCalib* GPUChainTracking::GetTPCPadGainCalib() const { return processors()->calibObjects.tpcPadGain; } +inline const TPCZSLinkMapping* GPUChainTracking::GetTPCZSLinkMapping() const { return processors()->calibObjects.tpcZSLinkMapping; } +inline const o2::tpc::CalibdEdxContainer* GPUChainTracking::GetdEdxCalibContainer() const { return processors()->calibObjects.dEdxCalibContainer; } +inline const o2::base::MatLayerCylSet* GPUChainTracking::GetMatLUT() const { return processors()->calibObjects.matLUT; } +inline const GPUTRDGeometry* GPUChainTracking::GetTRDGeometry() const { return (GPUTRDGeometry*)processors()->calibObjects.trdGeometry; } +inline const 
o2::base::Propagator* GPUChainTracking::GetO2Propagator() const { return processors()->calibObjects.o2Propagator; } +inline void GPUChainTracking::SetMatLUT(const o2::base::MatLayerCylSet* lut) { processors()->calibObjects.matLUT = lut; } +inline void GPUChainTracking::SetTRDGeometry(const o2::trd::GeometryFlat* geo) { processors()->calibObjects.trdGeometry = geo; } +inline void GPUChainTracking::SetCalibObjects(const GPUCalibObjectsConst& obj) { processors()->calibObjects = obj; } +inline void GPUChainTracking::SetCalibObjects(const GPUCalibObjects& obj) { memcpy((void*)&processors()->calibObjects, (const void*)&obj, sizeof(obj)); } +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx index 4f7846b852b98..5e7672022b3ff 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx @@ -34,6 +34,7 @@ #include "GPUTrackingInputProvider.h" #include "TPCZSLinkMapping.h" #include "GPUTriggerOutputs.h" +#include "GPUConstantMem.h" #include "SimulationDataFormat/MCCompLabel.h" #include "SimulationDataFormat/MCTruthContainer.h" diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index a647c213660c9..163f08634ef86 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -17,6 +17,11 @@ #include "GPUDefParametersRuntime.h" #include "GPUO2DataTypes.h" #include "GPUQA.h" +#include "GPUTPCGMMerger.h" +#include "GPUConstantMem.h" +#include "GPUTPCGMMergerGPU.h" +#include "GPUTPCGMO2Output.h" +#include "GPUTPCGlobalDebugSortKernels.h" #include "utils/strtag.h" #include diff --git a/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx b/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx index 8d1efd7011227..4662b5464f710 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx @@ -13,8 +13,12 @@ /// \author David Rohr #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPULogging.h" #include "GPUO2DataTypes.h" +#include "GPUTrackingRefit.h" +#include "GPUConstantMem.h" +#include "GPUTrackingRefitKernel.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 962b0922eeecc..635641c00ae14 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -20,6 +20,16 @@ #include "GPUTrackingInputProvider.h" #include "GPUTPCClusterOccupancyMap.h" #include "GPUDefParametersRuntime.h" +#include "GPUTPCExtrapolationTracking.h" +#include "GPUTPCCreateOccupancyMap.h" +#include "GPUTPCCreateTrackingData.h" +#include "GPUTPCNeighboursFinder.h" +#include "GPUTPCNeighboursCleaner.h" +#include "GPUTPCStartHitsFinder.h" +#include "GPUTPCStartHitsSorter.h" +#include "GPUTPCTrackletConstructor.h" +#include "GPUTPCTrackletSelector.h" +#include "GPUTPCSectorDebugSortKernels.h" #include "utils/strtag.h" #include diff --git a/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx b/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx index 0f17bbcc26842..f9011131803e3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx @@ -21,13 +21,19 @@ #include "GPUTRDTracker.h" #include "GPUTrackingInputProvider.h" #include "GPUTRDTrackerKernels.h" +#include "GPUConstantMem.h" #include "utils/strtag.h" using namespace o2::gpu; using namespace o2::trd; -template int32_t GPUChainTracking::RunTRDTracking() +{ + return GetProcessingSettings().trdTrackModelO2 ? 
RunTRDTrackingInternal() : RunTRDTrackingInternal(); +} + +template +int32_t GPUChainTracking::RunTRDTrackingInternal() { auto& Tracker = processors()->getTRDTracker(); if (!Tracker.IsInitialized()) { @@ -189,9 +195,7 @@ int32_t GPUChainTracking::DoTRDGPUTracking(T* externalInstance) return (0); } -template int32_t GPUChainTracking::RunTRDTracking(); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTrackerGPU*); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTracker*); -template int32_t GPUChainTracking::RunTRDTracking(); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTracker*); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTrackerGPU*); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx b/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx index db5e5ae3aeb75..c9d4d269f070c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx @@ -19,9 +19,13 @@ #include "GPUTPCClusterData.h" #include "GPUReconstructionConvert.h" #include "GPUMemorySizeScalers.h" +#include "GPUTPCConvert.h" #include "AliHLTTPCRawCluster.h" +#include "GPUConstantMem.h" +#include "GPUTPCConvertKernel.h" #include "DataFormatsTPC/ClusterNative.h" +#include "DataFormatsTPC/ZeroSuppression.h" #include "CommonDataFormat/InteractionRecord.h" #include "utils/strtag.h" diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx index a5457bf3f2f23..7ef9baa903fbe 100644 --- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx +++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx @@ -18,6 +18,9 @@ #include "GPUReconstruction.h" #include "GPUTPCClusterOccupancyMap.h" #include "GPUErrors.h" +#include "GPUParam.h" +#include "DataFormatsTPC/ClusterNative.h" +#include "GPUTRDSpacePoint.h" using namespace o2::gpu; using namespace o2::tpc; diff --git a/GPU/GPUTracking/Interface/GPUO2Interface.cxx 
b/GPU/GPUTracking/Interface/GPUO2Interface.cxx index 4dac56afed671..81eb2c285192b 100644 --- a/GPU/GPUTracking/Interface/GPUO2Interface.cxx +++ b/GPU/GPUTracking/Interface/GPUO2Interface.cxx @@ -15,6 +15,7 @@ #include "GPUO2Interface.h" #include "GPUReconstruction.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPUChainITS.h" #include "GPUMemorySizeScalers.h" #include "GPUOutputControl.h" @@ -23,6 +24,7 @@ #include "GPUParam.inc" #include "GPUQA.h" #include "GPUOutputControl.h" +#include "DetectorsBase/Propagator.h" #include #include #include diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index 82b21e2045b8e..b6241ad36b5de 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -736,46 +736,15 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<1>(int32_t nBlocks, int32_t nThrea #endif } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize MergeBorderTracks<3> -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct MergeBorderTracks_compMax { - GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) - { - return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); - } -}; -struct MergeBorderTracks_compMin { - GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) - { - return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? 
(a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); - } -}; -} // anonymous namespace -} // namespace o2::gpu::internal - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) -{ - if (cmpMax) { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); - } else { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin()); - } -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize MergeBorderTracks<3> - template <> GPUd() void GPUTPCGMMerger::MergeBorderTracks<3>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUTPCGMBorderRange* range, int32_t N, int32_t cmpMax) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS if (iThread == 0) { if (cmpMax) { - GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return a.fMax < b.fMax; }); + GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); }); } else { - GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return a.fMin < b.fMin; }); + GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? 
(a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); }); } } #endif @@ -1783,74 +1752,6 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit0(int32_t nBlocks, int32_t nThr } } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct GPUTPCGMMergerSortTracks_comp { - const GPUTPCGMMergedTrack* const mCmp; - GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} - GPUd() bool operator()(const int32_t aa, const int32_t bb) - { - const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; - if (a.CCE() != b.CCE()) { - return a.CCE() > b.CCE(); - } - if (a.Legs() != b.Legs()) { - return a.Legs() > b.Legs(); - } - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (a.NClusters() != b.NClusters()) { - return a.NClusters() > b.NClusters(); - } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { - return a.GetParam().GetY() > b.GetParam().GetY(); - } - return aa > bb; - , // !GPUCA_DETERMINISTIC_CODE - return a.NClusters() > b.NClusters(); - ) // clang-format on - } -}; - -struct GPUTPCGMMergerSortTracksQPt_comp { - const GPUTPCGMMergedTrack* const mCmp; - GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} - GPUd() bool operator()(const int32_t aa, const int32_t bb) - { - const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { 
- return a.GetParam().GetY() > b.GetParam().GetY(); - } - return a.GetParam().GetZ() > b.GetParam().GetZ(); - , // !GPUCA_DETERMINISTIC_CODE - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - ) // clang-format on - } -}; -} // anonymous namespace -} // namespace o2::gpu::internal - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); -} - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt - GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS @@ -2050,27 +1951,6 @@ GPUd() void GPUTPCGMMerger::MergeLoopersSort(int32_t nBlocks, int32_t nThreads, #endif } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct GPUTPCGMMergerMergeLoopers_comp { - GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b) - { - return CAMath::Abs(a.refz) < CAMath::Abs(b.refz); - } -}; -} // anonymous namespace -} // namespace o2::gpu::internal - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const 
krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt - GPUd() void GPUTPCGMMerger::MergeLoopersMain(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { const MergeLooperParam* params = mLooperCandidates; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index 1e4cc633eb4ca..2f8fbecadce5f 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -36,6 +36,7 @@ #include "GPUTPCClusterOccupancyMap.h" #include "GPUTrackingRefit.h" #include "CorrectionMapsHelper.h" +#include "GPUConstantMem.h" using namespace o2::gpu; using namespace gputpcgmmergertypes; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 13b34a0a64a84..ea219a02a1887 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -93,21 +93,6 @@ GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, #endif } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize GPUTPCGMO2Output::Thread -struct GPUTPCGMO2OutputSort_comp { - GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) - { - return (a.y > b.y); - } -}; - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread - template <> 
GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx index 7d83ff9abd91c..e66ad71783dbd 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx @@ -16,6 +16,7 @@ #include "GPUReconstruction.h" #include "GPUTPCHitId.h" #include "GPUTPCTrack.h" +#include "GPULogging.h" #include #include diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index b32db2bfebf11..d4c83f92a2157 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -17,11 +17,24 @@ #include "GPUReconstructionTimeframe.h" #include "GPUReconstructionConvert.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPUTPCDef.h" #include "GPUQA.h" +#include "GPUParam.h" #include "display/GPUDisplayInterface.h" #include "genEvents.h" +#include "TPCFastTransform.h" +#include "CorrectionMapsHelper.h" +#include "GPUTPCGMMergedTrack.h" +#include "GPUSettings.h" +#include "GPUConstantMem.h" + +#include "GPUO2DataTypes.h" +#include "GPUChainITS.h" + +#include "DataFormatsTPC/CompressedClusters.h" + #include #include #include @@ -32,6 +45,7 @@ #include #include #include +#include #ifndef _WIN32 #include @@ -48,15 +62,6 @@ #include "utils/qmaths_helpers.h" #include "utils/vecpod.h" -#include "TPCFastTransform.h" -#include "CorrectionMapsHelper.h" -#include "GPUTPCGMMergedTrack.h" -#include "GPUSettings.h" -#include - -#include "GPUO2DataTypes.h" -#include "GPUChainITS.h" - using namespace o2::gpu; // #define BROKEN_EVENTS @@ -915,7 +920,7 @@ int32_t main(int argc, char** argv) nEventsProcessed++; if (configStandalone.timeFrameTime) { - double 
nClusters = chainTracking->GetTPCMerger().NMaxClusters(); + double nClusters = chainTracking->GetProcessors()->tpcMerger.NMaxClusters(); if (nClusters > 0) { const int32_t nOrbits = 32; const double colRate = 50000; diff --git a/GPU/GPUTracking/Standalone/tools/createGeo.C b/GPU/GPUTracking/Standalone/tools/createGeo.C index 307d687f716d1..c454978177ad6 100644 --- a/GPU/GPUTracking/Standalone/tools/createGeo.C +++ b/GPU/GPUTracking/Standalone/tools/createGeo.C @@ -20,6 +20,7 @@ #include "GPUO2Interface.h" #include "GPUReconstruction.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Standalone/tools/createLUT.C b/GPU/GPUTracking/Standalone/tools/createLUT.C index 7bb4edbf89f18..b4a053aa46c66 100644 --- a/GPU/GPUTracking/Standalone/tools/createLUT.C +++ b/GPU/GPUTracking/Standalone/tools/createLUT.C @@ -18,6 +18,7 @@ #include "GPUO2Interface.h" #include "GPUReconstruction.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" using namespace o2::gpu; diff --git a/GPU/GPUTracking/TPCClusterFinder/Array2D.h b/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h similarity index 81% rename from GPU/GPUTracking/TPCClusterFinder/Array2D.h rename to GPU/GPUTracking/TPCClusterFinder/CfArray2D.h index b62176fdc4365..3c8bcf94da4b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/Array2D.h +++ b/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h @@ -16,22 +16,22 @@ #define O2_GPU_ARRAY2D_H #include "clusterFinderDefs.h" -#include "ChargePos.h" +#include "CfChargePos.h" namespace o2::gpu { template -class AbstractArray2D +class AbstractCfArray2D { public: - GPUdi() explicit AbstractArray2D(T* d) : data(d) {} + GPUdi() explicit AbstractCfArray2D(T* d) : data(d) {} - GPUdi() T& operator[](const ChargePos& p) { return data[Layout::idx(p)]; } - GPUdi() const T& operator[](const ChargePos& p) const { return data[Layout::idx(p)]; } + GPUdi() T& operator[](const CfChargePos& p) { return data[Layout::idx(p)]; 
} + GPUdi() const T& operator[](const CfChargePos& p) const { return data[Layout::idx(p)]; } - GPUdi() void safeWrite(const ChargePos& p, const T& v) + GPUdi() void safeWrite(const CfChargePos& p, const T& v) { if (data != nullptr) { (*this)[p] = v; @@ -52,7 +52,7 @@ class TilingLayout WidthInTiles = (TPC_NUM_OF_PADS + Width - 1) / Width, }; - GPUdi() static tpccf::SizeT idx(const ChargePos& p) + GPUdi() static tpccf::SizeT idx(const CfChargePos& p) { const tpccf::SizeT tilePad = p.gpad / Width; const tpccf::SizeT tileTime = p.timePadded / Height; @@ -72,7 +72,7 @@ class TilingLayout class LinearLayout { public: - GPUdi() static tpccf::SizeT idx(const ChargePos& p) + GPUdi() static tpccf::SizeT idx(const CfChargePos& p) { return TPC_NUM_OF_PADS * p.timePadded + p.gpad; } @@ -119,7 +119,7 @@ using TPCMapMemoryLayout = LinearLayout; #endif template -using Array2D = AbstractArray2D>; +using CfArray2D = AbstractCfArray2D>; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/CfChargePos.h similarity index 80% rename from GPU/GPUTracking/TPCClusterFinder/ChargePos.h rename to GPU/GPUTracking/TPCClusterFinder/CfChargePos.h index cdd489e0ef938..bf6ce2fc804ba 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/CfChargePos.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
-/// \file ChargePos.h +/// \file CfChargePos.h /// \author Felix Weiglhofer #ifndef O2_GPU_CHARGE_POS_H @@ -22,20 +22,20 @@ namespace o2::gpu #define INVALID_TIME_BIN (-GPUCF_PADDING_TIME - 1) -struct ChargePos { +struct CfChargePos { tpccf::GlobalPad gpad; tpccf::TPCFragmentTime timePadded; - GPUdDefault() ChargePos() = default; + GPUdDefault() CfChargePos() = default; - constexpr GPUhdi() ChargePos(tpccf::Row row, tpccf::Pad pad, tpccf::TPCFragmentTime t) + constexpr GPUhdi() CfChargePos(tpccf::Row row, tpccf::Pad pad, tpccf::TPCFragmentTime t) : gpad(tpcGlobalPadIdx(row, pad)), timePadded(t + GPUCF_PADDING_TIME) { } - GPUdi() ChargePos(const tpccf::GlobalPad& p, const tpccf::TPCFragmentTime& t) : gpad(p), timePadded(t) {} + GPUdi() CfChargePos(const tpccf::GlobalPad& p, const tpccf::TPCFragmentTime& t) : gpad(p), timePadded(t) {} - GPUdi() ChargePos delta(const tpccf::Delta2& d) const + GPUdi() CfChargePos delta(const tpccf::Delta2& d) const { return {tpccf::GlobalPad(gpad + d.x), tpccf::TPCFragmentTime(timePadded + d.y)}; } @@ -56,7 +56,7 @@ struct ChargePos { } }; -inline constexpr ChargePos INVALID_CHARGE_POS{255, 255, INVALID_TIME_BIN}; +inline constexpr CfChargePos INVALID_CHARGE_POS{255, 255, INVALID_TIME_BIN}; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h index 75dcc166abd9b..96f4893c74af3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h +++ b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h @@ -17,7 +17,7 @@ #include "clusterFinderDefs.h" #include "GPUCommonAlgorithm.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "CfConsts.h" namespace o2::gpu @@ -169,14 +169,14 @@ class CfUtils template static GPUdi() void blockLoad( - const Array2D& map, + const CfArray2D& map, uint32_t wgSize, uint32_t elems, uint16_t ll, uint32_t offset, uint32_t N, GPUconstexprref() const tpccf::Delta2* neighbors, - const ChargePos* posBcast, + const CfChargePos* posBcast, GPUgeneric() T* buf) 
{ #if defined(GPUCA_GPUCODE) @@ -186,7 +186,7 @@ class CfUtils tpccf::Delta2 d = neighbors[x + offset]; for (uint32_t i = y; i < wgSize; i += (elems / N)) { - ChargePos readFrom = posBcast[i]; + CfChargePos readFrom = posBcast[i]; uint32_t writeTo = N * i + x; buf[writeTo] = map[readFrom.delta(d)]; } @@ -196,7 +196,7 @@ class CfUtils return; } - ChargePos readFrom = posBcast[ll]; + CfChargePos readFrom = posBcast[ll]; GPUbarrier(); @@ -213,14 +213,14 @@ class CfUtils template static GPUdi() void condBlockLoad( - const Array2D& map, + const CfArray2D& map, uint16_t wgSize, uint16_t elems, uint16_t ll, uint16_t offset, uint16_t N, GPUconstexprref() const tpccf::Delta2* neighbors, - const ChargePos* posBcast, + const CfChargePos* posBcast, const uint8_t* aboveThreshold, GPUgeneric() T* buf) { @@ -230,7 +230,7 @@ class CfUtils uint16_t x = ll % N; tpccf::Delta2 d = neighbors[x + offset]; for (uint32_t i = y; i < wgSize; i += (elems / N)) { - ChargePos readFrom = posBcast[i]; + CfChargePos readFrom = posBcast[i]; uint8_t above = aboveThreshold[i]; uint32_t writeTo = N * i + x; T v(0); @@ -247,7 +247,7 @@ class CfUtils return; } - ChargePos readFrom = posBcast[ll]; + CfChargePos readFrom = posBcast[ll]; uint8_t above = aboveThreshold[ll]; GPUbarrier(); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx index 622da856af805..a80283b91c940 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx @@ -58,7 +58,7 @@ GPUd() Charge ClusterAccumulator::updateOuter(PackedCharge charge, Delta2 d) return q; } -GPUd() void ClusterAccumulator::finalize(const ChargePos& pos, const Charge q, TPCTime timeOffset) +GPUd() void ClusterAccumulator::finalize(const CfChargePos& pos, const Charge q, TPCTime timeOffset) { mQtot += q; @@ -75,7 +75,7 @@ GPUd() void ClusterAccumulator::finalize(const ChargePos& pos, const Charge q, T mTimeMean += 
timeOffset + pos.time(); } -GPUd() bool ClusterAccumulator::toNative(const ChargePos& pos, const Charge q, tpc::ClusterNative& cn, const GPUParam& param, const Array2D& chargeMap) +GPUd() bool ClusterAccumulator::toNative(const CfChargePos& pos, const Charge q, tpc::ClusterNative& cn, const GPUParam& param, const CfArray2D& chargeMap) { Pad pad = pos.pad(); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 90d977372b201..fb208ca0150d4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -17,7 +17,7 @@ #include "clusterFinderDefs.h" #include "PackedCharge.h" -#include "Array2D.h" +#include "CfArray2D.h" namespace o2 { @@ -30,7 +30,7 @@ struct ClusterNative; namespace gpu { -struct ChargePos; +struct CfChargePos; struct GPUParam; class GPUTPCGeometry; @@ -52,8 +52,8 @@ class ClusterAccumulator mSplitInTime = splitInTime; } - GPUd() void finalize(const ChargePos&, const tpccf::Charge, tpccf::TPCTime); - GPUd() bool toNative(const ChargePos&, const tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const Array2D&); + GPUd() void finalize(const CfChargePos&, const tpccf::Charge, tpccf::TPCTime); + GPUd() bool toNative(const CfChargePos&, const tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const CfArray2D&); private: float mQtot = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx index 8dbc5804f8fb8..d2ca3d419c138 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx @@ -13,7 +13,7 @@ /// \author Felix Weiglhofer #include "GPUTPCCFChargeMapFiller.h" -#include "ChargePos.h" +#include "CfChargePos.h" #include "DataFormatsTPC/Digit.h" #include "TPCPadGainCalib.h" @@ -23,14 +23,14 @@ using namespace o2::gpu::tpccf; template <> GPUdii() 
void GPUTPCCFChargeMapFiller::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D indexMap(clusterer.mPindexMap); + CfArray2D indexMap(clusterer.mPindexMap); fillIndexMapImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer.mPmemory->fragment, clusterer.mPdigits, indexMap, clusterer.mPmemory->counters.nDigitsInFragment); } GPUd() void GPUTPCCFChargeMapFiller::fillIndexMapImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, const CfFragment& fragment, const tpc::Digit* digits, - Array2D& indexMap, + CfArray2D& indexMap, size_t maxDigit) { size_t idx = get_global_id(0); @@ -39,21 +39,21 @@ GPUd() void GPUTPCCFChargeMapFiller::fillIndexMapImpl(int32_t nBlocks, int32_t n } CPU_ONLY(idx += fragment.digitsStart); CPU_ONLY(tpc::Digit digit = digits[idx]); - CPU_ONLY(ChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp()))); + CPU_ONLY(CfChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp()))); CPU_ONLY(indexMap.safeWrite(pos, idx)); } template <> GPUdii() void GPUTPCCFChargeMapFiller::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); fillFromDigitsImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, clusterer.mPmemory->counters.nPositions, clusterer.mPdigits, clusterer.mPpositions, chargeMap); } GPUd() void GPUTPCCFChargeMapFiller::fillFromDigitsImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, const CfFragment& fragment, size_t digitNum, const tpc::Digit* digits, - ChargePos* positions, - Array2D& chargeMap) + CfChargePos* positions, + CfArray2D& chargeMap) { size_t 
idx = get_global_id(0); if (idx >= digitNum) { @@ -61,7 +61,7 @@ GPUd() void GPUTPCCFChargeMapFiller::fillFromDigitsImpl(int32_t nBlocks, int32_t } tpc::Digit digit = digits[fragment.digitsStart + idx]; - ChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp())); + CfChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp())); positions[idx] = pos; float q = digit.getChargeFloat(); q *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(clusterer.mISector, digit.getRow(), digit.getPad()); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h index f7aab78c33bd1..800ba786c2105 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h @@ -19,7 +19,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::tpc @@ -30,7 +30,7 @@ class Digit; namespace o2::gpu { -struct ChargePos; +struct CfChargePos; class GPUTPCCFChargeMapFiller : public GPUKernelTemplate { @@ -55,9 +55,9 @@ class GPUTPCCFChargeMapFiller : public GPUKernelTemplate template GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); - static GPUd() void fillIndexMapImpl(int32_t, int32_t, int32_t, int32_t, const CfFragment&, const tpc::Digit*, Array2D&, size_t); + static GPUd() void fillIndexMapImpl(int32_t, int32_t, int32_t, int32_t, const CfFragment&, const tpc::Digit*, CfArray2D&, size_t); - static GPUd() void fillFromDigitsImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, size_t, const tpc::Digit*, ChargePos*, Array2D&); + static GPUd() void fillFromDigitsImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, size_t, const tpc::Digit*, CfChargePos*, CfArray2D&); private: static GPUd() size_t findTransition(int32_t, const tpc::Digit*, size_t, size_t); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx index 1e76860331de6..ec084c308312e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx @@ -13,7 +13,7 @@ /// \author Felix Weiglhofer #include "GPUTPCCFCheckPadBaseline.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" #include "GPUTPCGeometry.h" #include "clusterFinderDefs.h" @@ -33,10 +33,10 @@ template <> GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { const CfFragment& fragment = clusterer.mPmemory->fragment; - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); int32_t basePad = iBlock * PadsPerCacheline; - ChargePos basePos = padToChargePos(basePad, clusterer); + CfChargePos basePos = padToCfChargePos(basePad, clusterer); if (not basePos.valid()) { return; @@ -55,7 +55,7 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread bool handlePad = localTimeBin == 0; for (tpccf::TPCFragmentTime t = 
fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) { - const ChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)}); + const CfChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)}); smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0; GPUbarrier(); if (handlePad) { @@ -150,7 +150,7 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread #endif } -GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer) +GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer) { constexpr GPUTPCGeometry geo; @@ -161,12 +161,12 @@ GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int32_t& pad, const GP if (0 <= padInRow && padInRow < CAMath::nextMultipleOf(npads)) { int32_t cachelineOffset = padInRow % PadsPerCacheline; pad -= cachelineOffset; - return ChargePos{r, Pad(padInRow - cachelineOffset), 0}; + return CfChargePos{r, Pad(padInRow - cachelineOffset), 0}; } padOffset += npads; } - return ChargePos{0, 0, INVALID_TIME_BIN}; + return CfChargePos{0, 0, INVALID_TIME_BIN}; } GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h index d6daa6803ca39..2403aa6d29ecd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h @@ -52,7 +52,7 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer); private: - GPUd() static ChargePos padToChargePos(int32_t& pad, const 
GPUTPCClusterFinder&); + GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&); GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge); }; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx index 2131347decec6..c9c6b157499f2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx @@ -30,7 +30,7 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h index 79f3325ed9ad2..466d13d3254de 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h @@ -19,7 +19,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::tpc @@ -38,7 +38,7 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); struct GPUSharedMemory { - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; @@ -57,16 +57,16 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate template GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t); - static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); + static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void buildCluster(const GPUSettingsRec&, const CfArray2D&, 
CfChargePos, CfChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); private: - static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); + static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const CfChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); - static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const CfChargePos&, ClusterAccumulator*, MCLabelAccumulator*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc index 8a6b73be8bd8d..e32abbf37584f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -19,8 +19,8 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, + const CfArray2D& chargeMap, + const CfChargePos* filteredPeakPositions, const GPUSettingsRec& calib, MCLabelAccumulator* labelAcc, uint32_t clusternum, @@ -34,7 +34,7 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t // For certain configurations dummy work items are added, so the total // number of work items is dividable by 64. // These dummy items also compute the last cluster but discard the result. 
- ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; + CfChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; Charge charge = chargeMap[pos].unpack(); ClusterAccumulator pc; @@ -94,7 +94,7 @@ GPUdii() void GPUTPCCFClusterizer::updateClusterInner( uint16_t lid, uint16_t N, const PackedCharge* buf, - const ChargePos& pos, + const CfChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc, uint8_t* innerAboveThreshold) @@ -125,7 +125,7 @@ GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( uint16_t M, uint16_t offset, const PackedCharge* buf, - const ChargePos& pos, + const CfChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc) { @@ -144,9 +144,9 @@ GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( GPUdii() void GPUTPCCFClusterizer::buildCluster( const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, + const CfArray2D& chargeMap, + CfChargePos pos, + CfChargePos* posBcast, PackedCharge* buf, uint8_t* innerAboveThreshold, ClusterAccumulator* myCluster, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx index 6662b93eccb78..312085d2947ab 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -15,7 +15,7 @@ #include "GPUTPCCFDecodeZS.h" #include "GPUCommonMath.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" #include "CfUtils.h" #include "CommonConstants/LHCConstants.h" @@ -53,8 +53,8 @@ GPUdii() void GPUTPCCFDecodeZS::decode(GPUTPCClusterFinder& clusterer, GPUShared if (zs.count[endpoint] == 0) { return; } - ChargePos* positions = clusterer.mPpositions; - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos* positions = clusterer.mPpositions; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); const size_t 
nDigits = clusterer.mPzsOffsets[iBlock].offset; if (iThread == 0) { const int32_t region = endpoint / 2; @@ -175,7 +175,7 @@ GPUdii() void GPUTPCCFDecodeZS::decode(GPUTPCClusterFinder& clusterer, GPUShared TPCTime globalTime = timeBin + l; bool inFragment = fragment.contains(globalTime); Row row = rowOffset + m; - ChargePos pos(row, Pad(pad), inFragment ? fragment.toLocal(globalTime) : INVALID_TIME_BIN); + CfChargePos pos(row, Pad(pad), inFragment ? fragment.toLocal(globalTime) : INVALID_TIME_BIN); positions[nDigitsTmp++] = pos; if (inFragment) { @@ -552,7 +552,7 @@ GPUd() o2::tpc::PadPos GPUTPCCFDecodeZSLinkBase::GetPadAndRowFromFEC(processorTy GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, float charge, PadPos padAndRow, TPCFragmentTime localTime, size_t positionOffset) { const uint32_t sector = clusterer.mISector; - ChargePos* positions = clusterer.mPpositions; + CfChargePos* positions = clusterer.mPpositions; #ifdef GPUCA_CHECK_TPCZS_CORRUPTION if (padAndRow.getRow() >= GPUCA_ROW_COUNT) { positions[positionOffset] = INVALID_CHARGE_POS; @@ -560,9 +560,9 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa return; } #endif - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - ChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime); + CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime); positions[positionOffset] = pos; charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad()); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx index dab8123698abf..429d51685e504 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx @@ -15,7 +15,7 @@ #include "GPUTPCCFDeconvolution.h" #include 
"CfConsts.h" #include "CfUtils.h" -#include "ChargePos.h" +#include "CfChargePos.h" #include "GPUDefMacros.h" using namespace o2::gpu; @@ -24,15 +24,15 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions); } GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, - const Array2D& peakMap, - Array2D& chargeMap, - const ChargePos* positions, + const CfArray2D& peakMap, + CfArray2D& chargeMap, + const CfChargePos* positions, const uint32_t digitnum) { SizeT idx = get_global_id(0); @@ -40,7 +40,7 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t bool iamDummy = (idx >= digitnum); idx = iamDummy ? 
digitnum - 1 : idx; - ChargePos pos = positions[idx]; + CfChargePos pos = positions[idx]; bool iamPeak = CfUtils::isPeak(peakMap[pos]); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h index 78fcc8ba1785a..e971a042e95a4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h @@ -20,7 +20,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::gpu @@ -31,7 +31,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFDeconvolution); struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { - ChargePos posBcast1[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast1[SCRATCH_PAD_WORK_GROUP_SIZE]; uint8_t aboveThresholdBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; uint8_t buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_COUNT_N]; }; @@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); private: - static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const Array2D&, Array2D&, const ChargePos*, const uint32_t); + static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t); static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*); static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx index f3a914cbfcaee..4dfa50d9439e4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx @@ -13,10 +13,10 @@ /// \author Felix Weiglhofer #include "GPUTPCCFNoiseSuppression.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "CfConsts.h" #include "CfUtils.h" -#include "ChargePos.h" +#include "CfChargePos.h" using namespace o2::gpu; using namespace o2::gpu::tpccf; @@ -24,29 +24,29 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFNoiseSuppression::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); noiseSuppressionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, clusterer.Param().rec, chargeMap, isPeakMap, clusterer.mPpeakPositions, clusterer.mPmemory->counters.nPeaks, clusterer.mPisPeak); } template <> GPUdii() void GPUTPCCFNoiseSuppression::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D 
isPeakMap(clusterer.mPpeakMap); updatePeaksImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer.mPpeakPositions, clusterer.mPisPeak, clusterer.mPmemory->counters.nPeaks, isPeakMap); } GPUdii() void GPUTPCCFNoiseSuppression::noiseSuppressionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, const GPUSettingsRec& calibration, - const Array2D& chargeMap, - const Array2D& peakMap, - const ChargePos* peakPositions, + const CfArray2D& chargeMap, + const CfArray2D& peakMap, + const CfChargePos* peakPositions, const uint32_t peaknum, uint8_t* isPeakPredicate) { SizeT idx = get_global_id(0); - ChargePos pos = peakPositions[CAMath::Min(idx, (SizeT)(peaknum - 1))]; + CfChargePos pos = peakPositions[CAMath::Min(idx, (SizeT)(peaknum - 1))]; Charge charge = chargeMap[pos].unpack(); uint64_t minimas, bigger, peaksAround; @@ -75,10 +75,10 @@ GPUdii() void GPUTPCCFNoiseSuppression::noiseSuppressionImpl(int32_t nBlocks, in } GPUd() void GPUTPCCFNoiseSuppression::updatePeaksImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - const ChargePos* peakPositions, + const CfChargePos* peakPositions, const uint8_t* isPeak, const uint32_t peakNum, - Array2D& peakMap) + CfArray2D& peakMap) { SizeT idx = get_global_id(0); @@ -86,7 +86,7 @@ GPUd() void GPUTPCCFNoiseSuppression::updatePeaksImpl(int32_t nBlocks, int32_t n return; } - ChargePos pos = peakPositions[idx]; + CfChargePos pos = peakPositions[idx]; uint8_t peak = isPeak[idx]; @@ -164,12 +164,12 @@ GPUdi() bool GPUTPCCFNoiseSuppression::keepPeak( } GPUd() void GPUTPCCFNoiseSuppression::findMinimaAndPeaks( - const Array2D& chargeMap, - const Array2D& peakMap, + const CfArray2D& chargeMap, + const CfArray2D& peakMap, const GPUSettingsRec& calibration, float q, - const ChargePos& pos, - ChargePos* posBcast, + const CfChargePos& pos, + CfChargePos* posBcast, PackedCharge* buf, uint64_t* minimas, uint64_t* bigger, diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h index 71236bc317443..59196da11079b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h @@ -19,13 +19,13 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::gpu { -struct ChargePos; +struct CfChargePos; class GPUTPCCFNoiseSuppression : public GPUKernelTemplate { @@ -38,7 +38,7 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks)); struct GPUSharedMemory { - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_NOISE_N]; }; @@ -57,9 +57,9 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); private: - static GPUd() void noiseSuppressionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const GPUSettingsRec&, const Array2D&, const Array2D&, const ChargePos*, const uint32_t, uint8_t*); + static GPUd() void noiseSuppressionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const GPUSettingsRec&, const CfArray2D&, const CfArray2D&, const CfChargePos*, const uint32_t, uint8_t*); - static GPUd() void updatePeaksImpl(int32_t, int32_t, int32_t, int32_t, const ChargePos*, const uint8_t*, const uint32_t, Array2D&); + static GPUd() void updatePeaksImpl(int32_t, int32_t, int32_t, int32_t, const CfChargePos*, const uint8_t*, const uint32_t, CfArray2D&); static GPUdi() void checkForMinima(const float, const float, const float, PackedCharge, int32_t, uint64_t*, uint64_t*); @@ -69,7 +69,7 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate static GPUdi() bool keepPeak(uint64_t, uint64_t); - static GPUd() void findMinimaAndPeaks(const Array2D&, const Array2D&, const GPUSettingsRec&, float, const ChargePos&, ChargePos*, PackedCharge*, uint64_t*, uint64_t*, uint64_t*); + static GPUd() void findMinimaAndPeaks(const CfArray2D&, const CfArray2D&, const GPUSettingsRec&, float, const CfChargePos&, CfChargePos*, PackedCharge*, uint64_t*, uint64_t*, uint64_t*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx index 1de922f716c14..6749ab8e8485e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx @@ -14,7 +14,7 @@ #include "GPUTPCCFPeakFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "CfUtils.h" #include "PackedCharge.h" #include "TPCPadGainCalib.h" @@ -25,19 +25,19 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFPeakFinder::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, 
processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); findPeaksImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, chargeMap, clusterer.mPpadIsNoisy, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions, clusterer.Param().rec, *clusterer.GetConstantMem()->calibObjects.tpcPadGain, clusterer.mPisPeak, isPeakMap); } GPUdii() bool GPUTPCCFPeakFinder::isPeak( GPUSharedMemory& smem, Charge q, - const ChargePos& pos, + const CfChargePos& pos, uint16_t N, - const Array2D& chargeMap, + const CfArray2D& chargeMap, const GPUSettingsRec& calib, - ChargePos* posBcast, + CfChargePos* posBcast, PackedCharge* buf) { uint16_t ll = get_local_id(0); @@ -91,21 +91,21 @@ GPUdii() bool GPUTPCCFPeakFinder::isPeak( } GPUd() void GPUTPCCFPeakFinder::findPeaksImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, - const Array2D& chargeMap, + const CfArray2D& chargeMap, const uint8_t* padHasLostBaseline, - const ChargePos* positions, + const CfChargePos* positions, SizeT digitnum, const GPUSettingsRec& calib, const TPCPadGainCalib& gainCorrection, // Only used for globalPad() function uint8_t* isPeakPredicate, - Array2D& peakMap) + CfArray2D& peakMap) { SizeT idx = get_global_id(0); // For certain configurations dummy work items are added, so the total // number of work items is dividable by 64. // These dummy items also compute the last digit but discard the result. - ChargePos pos = positions[CAMath::Min(idx, (SizeT)(digitnum - 1))]; + CfChargePos pos = positions[CAMath::Min(idx, (SizeT)(digitnum - 1))]; Charge charge = pos.valid() ? 
chargeMap[pos].unpack() : Charge(0); bool hasLostBaseline = padHasLostBaseline[gainCorrection.globalPad(pos.row(), pos.pad())]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h index ec17d98322239..e480518ddc9dd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h @@ -19,20 +19,20 @@ #include "GPUConstantMem.h" #include "clusterFinderDefs.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::gpu { -struct ChargePos; +struct CfChargePos; class GPUTPCCFPeakFinder : public GPUKernelTemplate { public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFPeakFinder); struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_SEARCH_N]; }; @@ -51,9 +51,9 @@ class GPUTPCCFPeakFinder : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); private: - static GPUd() void findPeaksImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const Array2D&, const uint8_t*, const ChargePos*, tpccf::SizeT, const GPUSettingsRec&, const TPCPadGainCalib&, uint8_t*, Array2D&); + static GPUd() void findPeaksImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, const uint8_t*, const CfChargePos*, tpccf::SizeT, const GPUSettingsRec&, const TPCPadGainCalib&, uint8_t*, CfArray2D&); - static GPUd() bool isPeak(GPUSharedMemory&, tpccf::Charge, const ChargePos&, uint16_t, const Array2D&, const GPUSettingsRec&, ChargePos*, PackedCharge*); + static GPUd() bool isPeak(GPUSharedMemory&, tpccf::Charge, const CfChargePos&, uint16_t, const CfArray2D&, const GPUSettingsRec&, CfChargePos*, PackedCharge*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx index efed3643800b6..1da5a1158a8c2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx @@ -15,7 +15,7 @@ #include "GPUTPCCFStreamCompaction.h" #include "GPUCommonAlgorithm.h" -#include "ChargePos.h" +#include "CfChargePos.h" #include "CfUtils.h" using namespace o2::gpu; @@ -92,7 +92,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread -GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, ChargePos* in, ChargePos* out) +GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, CfChargePos* in, CfChargePos* out) { uint32_t nElems = CompactionElems(clusterer, stage); SizeT bufferSize = (stage) ? 
clusterer.mNMaxClusters : clusterer.mNMaxPeaks; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx index 613c4ad9e5fa6..051391f12cc6d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx @@ -16,12 +16,14 @@ #include "GPUReconstruction.h" #include "GPUMemorySizeScalers.h" #include "GPUHostDataTypes.h" +#include "GPUSettings.h" +#include "DataFormatsTPC/ClusterNative.h" #include "DataFormatsTPC/ZeroSuppression.h" #include "DataFormatsTPC/Digit.h" -#include "ChargePos.h" -#include "Array2D.h" +#include "CfChargePos.h" +#include "CfArray2D.h" using namespace o2::gpu; using namespace o2::tpc; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 3e9ea2c6f608b..96efe08be6dc6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -46,7 +46,7 @@ namespace o2::gpu struct GPUTPCClusterMCInterimArray; struct TPCPadGainCalib; -struct ChargePos; +struct CfChargePos; class GPUTPCClusterFinder : public GPUProcessor { @@ -98,9 +98,9 @@ class GPUTPCClusterFinder : public GPUProcessor MinMaxCN* mMinMaxCN = nullptr; uint8_t* mPpadIsNoisy = nullptr; tpc::Digit* mPdigits = nullptr; // input digits, only set if ZS is skipped - ChargePos* mPpositions = nullptr; - ChargePos* mPpeakPositions = nullptr; - ChargePos* mPfilteredPeakPositions = nullptr; + CfChargePos* mPpositions = nullptr; + CfChargePos* mPpeakPositions = nullptr; + CfChargePos* mPfilteredPeakPositions = nullptr; uint8_t* mPisPeak = nullptr; uint32_t* mPclusterPosInRow = nullptr; // store the index where the corresponding cluster is stored in a bucket. // Required when MC are enabled to write the mc data to the correct position. 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx index a9fbc1b5f40e0..da30375149b7c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx @@ -14,8 +14,10 @@ #include "GPUTPCClusterFinder.h" #include "GPUReconstruction.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "DataFormatsTPC/Digit.h" +#include "DataFormatsTPC/ClusterNative.h" +#include "GPUSettings.h" using namespace o2::gpu; using namespace o2::gpu::tpccf; @@ -37,7 +39,7 @@ void GPUTPCClusterFinder::DumpDigits(std::ostream& out) void GPUTPCClusterFinder::DumpChargeMap(std::ostream& out, std::string_view title) { out << "\nClusterer - " << title << " - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n"; - Array2D map(mPchargeMap); + CfArray2D map(mPchargeMap); out << std::hex; @@ -71,7 +73,7 @@ void GPUTPCClusterFinder::DumpPeakMap(std::ostream& out, std::string_view title) { out << "\nClusterer - " << title << " - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n"; - Array2D map(mPpeakMap); + CfArray2D map(mPpeakMap); out << std::hex; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index fe3202fe7b439..092af2ea393c5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -15,6 +15,7 @@ #include "GPUReconstruction.h" #include "ML/3rdparty/GPUORTFloat16.h" #include "GPUTPCNNClusterizer.h" +#include "GPUSettings.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index da490b0f94d58..022642f9f142e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h 
@@ -15,7 +15,7 @@ #ifndef O2_GPUTPCNNCLUSTERIZER_H #define O2_GPUTPCNNCLUSTERIZER_H -#include "ChargePos.h" +#include "CfChargePos.h" #include "GPUProcessor.h" namespace o2::OrtDataType diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 2cf9ab2037007..512bc1d3bb09b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -43,7 +43,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; @@ -58,9 +58,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); + CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); @@ -75,7 +75,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); + CfChargePos peak = 
clusterer.mPfilteredPeakPositions[base_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()); if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; - ChargePos tmp_pos = peak.delta(d); + CfChargePos tmp_pos = peak.delta(d); clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; } @@ -161,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[peak].unpack()); int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; - ChargePos tmp_pos(row + r, pad + p, time + t); + CfChargePos tmp_pos(row + r, pad + p, time + t); if (dtype == 0) { clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { @@ -227,8 +227,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; float central_charge = static_cast(chargeMap[peak].unpack()); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); @@ -322,8 +322,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; float central_charge = 
static_cast(chargeMap[peak].unpack()); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 27cfba2487aed..dc7f537c6c1e8 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -19,7 +19,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" #include "GPUTPCNNClusterizer.h" @@ -47,7 +47,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); struct GPUSharedMemory { // Regular cluster finder - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; diff --git a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx index 19ef7aa9ecd0d..e58edae208115 100644 --- a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx @@ -26,7 +26,7 @@ MCLabelAccumulator::MCLabelAccumulator(GPUTPCClusterFinder& clusterer) { } -void MCLabelAccumulator::collect(const ChargePos& pos, Charge q) +void MCLabelAccumulator::collect(const CfChargePos& pos, Charge q) { if (q == 0 || !engaged()) { return; diff --git a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h index 176fbea02befe..35c24bfeb5f18 100644 --- a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h @@ -16,7 +16,7 @@ #define 
O2_GPU_MC_LABEL_ACCUMULATOR_H #include "clusterFinderDefs.h" -#include "Array2D.h" +#include "CfArray2D.h" #include #include @@ -44,14 +44,14 @@ class MCLabelAccumulator public: MCLabelAccumulator(GPUTPCClusterFinder&); - void collect(const ChargePos&, tpccf::Charge); + void collect(const CfChargePos&, tpccf::Charge); bool engaged() const { return mLabels != nullptr && mOutput != nullptr; } void commit(tpccf::Row, uint32_t, uint32_t); private: - Array2D mIndexMap; + CfArray2D mIndexMap; const o2::dataformats::ConstMCLabelContainerView* mLabels = nullptr; GPUTPCClusterMCInterimArray* mOutput = nullptr; diff --git a/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx b/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx index 3d6b45c372ea0..899149d320bda 100644 --- a/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx +++ b/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx @@ -17,6 +17,7 @@ #include "GPUTPCClusterData.h" #include "GPUReconstruction.h" #include "GPUO2DataTypes.h" +#include "GPUParam.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx index c633f10adae38..2f754d2416bc1 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx @@ -22,6 +22,7 @@ #include "GPUTRDTrackerDebug.h" #include "GPUCommonMath.h" #include "GPUCommonAlgorithm.h" +#include "GPUConstantMem.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/GPUDisplay.cxx b/GPU/GPUTracking/display/GPUDisplay.cxx index c2b74489e6250..5b0960919da15 100644 --- a/GPU/GPUTracking/display/GPUDisplay.cxx +++ b/GPU/GPUTracking/display/GPUDisplay.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" #include "GPUTPCDef.h" @@ -204,7 +205,7 @@ int32_t GPUDisplay::DrawGLScene() int32_t retVal = 0; if (mChain) { mIOPtrs = &mChain->mIOPtrs; - mCalib = &mChain->calib(); + mCalib = &mChain->GetCalib(); } if (!mIOPtrs) { mNCollissions = 0; 
diff --git a/GPU/GPUTracking/display/GPUDisplay.h b/GPU/GPUTracking/display/GPUDisplay.h index bb270cda23565..dbd90020698b2 100644 --- a/GPU/GPUTracking/display/GPUDisplay.h +++ b/GPU/GPUTracking/display/GPUDisplay.h @@ -15,12 +15,10 @@ #ifndef GPUDISPLAY_H #define GPUDISPLAY_H -#include "GPUSettings.h" #include "frontend/GPUDisplayFrontend.h" #include "backend/GPUDisplayBackend.h" #include "GPUDisplayInterface.h" -#include "GPUChainTracking.h" #include "../utils/vecpod.h" #include "../utils/qsem.h" @@ -34,6 +32,7 @@ namespace o2::gpu class GPUTPCTracker; struct GPUParam; class GPUQA; +class GPUTRDGeometry; class GPUDisplay : public GPUDisplayInterface { @@ -77,7 +76,7 @@ class GPUDisplay : public GPUDisplayInterface int32_t& drawTextFontSize() { return mDrawTextFontSize; } private: - static constexpr int32_t NSECTORS = GPUChainTracking::NSECTORS; + static constexpr int32_t NSECTORS = GPUCA_NSECTORS; static constexpr float GL_SCALE_FACTOR = (1.f / 100.f); static constexpr const int32_t N_POINTS_TYPE = 15; @@ -157,16 +156,7 @@ class GPUDisplay : public GPUDisplayInterface void insertVertexList(std::pair*, vecpod*>& vBuf, size_t first, size_t last); void insertVertexList(int32_t iSector, size_t first, size_t last); template - void SetInfo(Args... args) - { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wformat-security" -#pragma GCC diagnostic ignored "-Wformat-truncation" - snprintf(mInfoText2, 1024, args...); -#pragma GCC diagnostic pop - GPUInfo("%s", mInfoText2); - mInfoText2Timer.ResetStart(); - } + void SetInfo(Args... 
args); void PrintGLHelpText(float colorValue); void calcXYZ(const float*); void mAnimationCloseAngle(float& newangle, float lastAngle); diff --git a/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx b/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx index ded8803801fb7..98d2593c27950 100644 --- a/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx +++ b/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx @@ -34,6 +34,7 @@ #endif #include "GPUDisplay.h" +#include "GPULogging.h" #include using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx b/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx index 3ee3384c8e118..25ae5e1f8055f 100644 --- a/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx +++ b/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx @@ -27,6 +27,8 @@ #include "GPUDisplayBackendOpenGL.h" #include "shaders/GPUDisplayShaders.h" #include "GPUDisplay.h" +#include "GPULogging.h" +#include "GPUParam.h" #define OPENGL_EMULATE_MULTI_DRAW 0 diff --git a/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx b/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx index 2324c194d04b9..93c19356ac062 100644 --- a/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx +++ b/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx @@ -19,6 +19,8 @@ VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE #include "GPUCommonDef.h" #include "GPUDisplayBackendVulkan.h" #include "GPUDisplay.h" +#include "GPULogging.h" +#include "GPUParam.h" #include diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx index 590d8648eb5bb..22970c3228815 100644 --- a/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx +++ b/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx @@ -30,6 +30,9 @@ #include "GPUDisplayFrontendWayland.h" #endif +#include "GPULogging.h" +#include + #ifdef GPUCA_BUILD_EVENT_DISPLAY_QT 
#include "GPUDisplayGUIWrapper.h" #else diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx index ad3b620ba8f55..d0aae2ffaad02 100644 --- a/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx +++ b/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx @@ -18,6 +18,7 @@ #include "GPUDisplayGUIWrapper.h" #include "GPUDisplay.h" #include "GPULogging.h" +#include "GPUParam.h" #include #include #include diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayInfo.inc b/GPU/GPUTracking/display/frontend/GPUDisplayInfo.inc new file mode 100644 index 0000000000000..b6ac78b31f315 --- /dev/null +++ b/GPU/GPUTracking/display/frontend/GPUDisplayInfo.inc @@ -0,0 +1,36 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUDisplayInfo.inc +/// \author David Rohr + +#ifndef GPUDISPLAYINFO_INC_H +#define GPUDISPLAYINFO_INC_H + +#include "GPUDisplay.h" +#include "GPULogging.h" + +namespace o2::gpu +{ +template +void GPUDisplay::SetInfo(Args... 
args) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-security" +#pragma GCC diagnostic ignored "-Wformat-truncation" + snprintf(mInfoText2, 1024, args...); +#pragma GCC diagnostic pop + GPUInfo("%s", mInfoText2); + mInfoText2Timer.ResetStart(); +} +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx index 32ff6c73e110c..54258857a244c 100644 --- a/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx +++ b/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx index 6c0595b073cd0..cc9ec2e766c4d 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx @@ -13,6 +13,8 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" +#include "GPUCommonMath.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx index b04c93ab8496e..ef94628baeb38 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx @@ -31,6 +31,7 @@ #include "backend/GPUDisplayBackendOpenGL.h" #include "shaders/GPUDisplayShaders.h" #include "GPUDisplay.h" +#include "GPULogging.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx index ca9fd6be01703..9d188d03c7b69 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx @@ -13,6 
+13,9 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "GPUChainTracking.h" +#include "GPULogging.h" +#include "GPUParam.h" #ifndef _WIN32 #include "bitmapfile.h" diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx index 644995929acb7..4dacaec2fbca5 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx @@ -14,6 +14,7 @@ #include #include "GPUDisplay.h" +#include "GPULogging.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx index ee50f32e3c1ac..d31ee206f35e2 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx @@ -16,6 +16,7 @@ #include "frontend/GPUDisplayFrontend.h" #include "GPUDisplayInterface.h" +#include #include #include diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx index dbeefc7bf9b07..07a05e585d422 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx @@ -17,6 +17,9 @@ #endif #include "GPUDisplay.h" +#include "GPULogging.h" +#include "GPUConstantMem.h" +#include "GPUChainTracking.h" using namespace o2::gpu; #ifndef GPUCA_NO_ROOT diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx index 24668c576d795..e1d63ea1a21e4 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx @@ -28,6 +28,7 @@ #include "GPUTPCMCInfo.h" #include "GPUParam.inc" #include "GPUCommonMath.h" +#include "GPUChainTracking.h" #include @@ -43,7 +44,7 @@ using namespace o2::gpu; #define GET_CID(sector, i) (mParam->par.earlyTpcTransform ? 
mIOPtrs->clusterData[sector][i].id : (mIOPtrs->clustersNative->clusterOffset[sector][0] + i)) const GPUTRDGeometry* GPUDisplay::trdGeometry() { return (GPUTRDGeometry*)mCalib->trdGeometry; } -const GPUTPCTracker& GPUDisplay::sectorTracker(int32_t iSector) { return mChain->GetTPCSectorTrackers()[iSector]; } +const GPUTPCTracker& GPUDisplay::sectorTracker(int32_t iSector) { return mChain->GetProcessors()->tpcTrackers[iSector]; } inline void GPUDisplay::insertVertexList(std::pair*, vecpod*>& vBuf, size_t first, size_t last) { diff --git a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx index 0a780732273db..ab4c0abd7b60e 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx @@ -17,6 +17,7 @@ #endif #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" #include "GPUO2DataTypes.h" #include "GPUTPCClusterData.h" #include "GPUTPCConvertImpl.h" diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 7e3ddf868af2a..202ea47d1f3bf 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -128,7 +128,7 @@ o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanUp" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanTop" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanDown" "= TPCCLUSTERFINDER" LB int32_t iBuf "uint32_t" offset int32_t nElems) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage ChargePos* in ChargePos* out) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage CfChargePos* in CfChargePos* out) o2_gpu_add_kernel("GPUTPCCFDecodeZS" "= TPCCLUSTERFINDER" LB int32_t firstHBF) 
o2_gpu_add_kernel("GPUTPCCFDecodeZSLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) o2_gpu_add_kernel("GPUTPCCFDecodeZSDenseLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index ba7aeb3800a5e..6a2623fb6e09d 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -38,6 +38,7 @@ #include "GPUTPCDef.h" #include "GPUTPCTrackingData.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPUTPCTrack.h" #include "GPUTPCTracker.h" #include "GPUTPCGMMergedTrack.h" @@ -1702,7 +1703,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } - uint32_t nCl = clNative ? clNative->nClustersTotal : mTracking->GetTPCMerger().NMaxClusters(); + uint32_t nCl = clNative ? clNative->nClustersTotal : mTracking->GetProcessors()->tpcMerger.NMaxClusters(); mClusterCounts.nTotal += nCl; if (mQATasks & taskClusterCounts) { for (uint32_t i = 0; i < nCl; i++) { diff --git a/GPU/GPUTracking/qa/genEvents.cxx b/GPU/GPUTracking/qa/genEvents.cxx index 2e1bc1c5c64b2..9c2220f9ef748 100644 --- a/GPU/GPUTracking/qa/genEvents.cxx +++ b/GPU/GPUTracking/qa/genEvents.cxx @@ -37,6 +37,7 @@ #include "GPUTPCGMPropagator.h" #include "GPUTPCGMMerger.h" #include "GPUChainTracking.h" +#include "GPUConstantMem.h" #include "../utils/qconfig.h" @@ -169,7 +170,7 @@ int32_t genEvents::GenerateEvent(const GPUParam& param, char* filename) GPUTPCGMPropagator prop; { prop.SetToyMCEventsFlag(kTRUE); - const GPUTPCGMMerger& merger = mRec->GetTPCMerger(); + const GPUTPCGMMerger& merger = mRec->GetProcessors()->tpcMerger; prop.SetPolynomialField(&merger.Param().polynomialField); }