From 18682470758c8cde1a5dc8d4dfa79f426050be6a Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 3 Feb 2025 00:52:31 +0100 Subject: [PATCH 1/2] GPU: Remove obsolete code paths --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 14 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 - GPU/GPUTracking/Global/GPUChainTracking.cxx | 20 +- .../Global/GPUChainTrackingMerger.cxx | 73 +++---- .../Global/GPUChainTrackingSliceTracker.cxx | 9 +- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 200 +++++------------- GPU/GPUTracking/Merger/GPUTPCGMMerger.h | 6 - GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx | 6 +- GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx | 29 +-- .../SliceTracker/GPUTPCTracker.cxx | 4 +- .../Standalone/Benchmark/standalone.cxx | 4 +- GPU/GPUTracking/qa/GPUQA.cxx | 5 - 12 files changed, 109 insertions(+), 264 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 5df69c416e858..1496300818fd8 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -282,21 +282,9 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mProcessingSettings.nDeviceHelperThreads = 0; } - if (param().rec.nonConsecutiveIDs) { - param().rec.tpc.disableRefitAttachment = 0xFF; - } - if (!(mRecoSteps.stepsGPUMask & RecoStep::TPCMerging) || !param().rec.tpc.mergerReadFromTrackerDirectly) { - mProcessingSettings.fullMergerOnGPU = false; - } - if (mProcessingSettings.debugLevel > 3 || !IsGPU() || !mProcessingSettings.fullMergerOnGPU || mProcessingSettings.deterministicGPUReconstruction) { + if (mProcessingSettings.debugLevel > 3 || !IsGPU() || mProcessingSettings.deterministicGPUReconstruction) { mProcessingSettings.delayedOutput = false; } - if (!mProcessingSettings.fullMergerOnGPU && (GetRecoStepsGPU() & RecoStep::TPCMerging)) { - param().rec.tpc.looperInterpolationInExtraPass = 0; - if (param().rec.tpc.retryRefit == 1) { - param().rec.tpc.retryRefit = 2; - } - } 
UpdateAutomaticProcessingSettings(); GPUCA_GPUReconstructionUpdateDefaults(); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 76370c17f9f53..c10793975453d 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -149,7 +149,6 @@ AddOptionRTC(mergerInterpolateErrors, uint8_t, 1, "", 0, "Use interpolation inst AddOptionRTC(mergeCE, uint8_t, 1, "", 0, "Merge tracks accross the central electrode") AddOptionRTC(retryRefit, int8_t, 1, "", 0, "Retry refit with seeding errors and without cluster rejection when fit fails (=2 means retry in same kernel, =1 for separate kernel") AddOptionRTC(looperInterpolationInExtraPass, int8_t, -1, "", 0, "Perform looper interpolation in an extra pass") -AddOptionRTC(mergerReadFromTrackerDirectly, int8_t, 1, "", 0, "Forward data directly from tracker to merger on GPU") AddOptionRTC(dropSecondaryLegsInOutput, int8_t, 1, "", 0, "Do not store secondary legs of looping track in TrackTPC") AddOptionRTC(enablePID, int8_t, 1, "", 0, "Enable PID response") AddOptionRTC(PID_useNsigma, int8_t, 1, "", 0, "Use nSigma instead of absolute distance in PID response") @@ -188,7 +187,6 @@ EndConfig() BeginSubConfig(GPUSettingsRec, rec, configStandalone, "REC", 0, "Reconstruction settings", rec) AddOptionRTC(maxTrackQPtB5, float, 1.f / GPUCA_MIN_TRACK_PTB5_DEFAULT, "", 0, "required max Q/Pt (==min Pt) of tracks") -AddOptionRTC(nonConsecutiveIDs, int8_t, false, "", 0, "Non-consecutive cluster IDs as in HLT, disables features that need access to slice data in TPC merger") AddOptionRTC(fwdTPCDigitsAsClusters, uint8_t, 0, "", 0, "Forward TPC digits as clusters (if they pass the ZS threshold)") AddOptionRTC(bz0Pt10MeV, uint8_t, 60, "", 0, "Nominal Pt to set when bz = 0 (in 10 MeV)") AddOptionRTC(fitInProjections, int8_t, -1, "", 0, "Fit in projection, -1 to enable full fit for all but passes but the first one") @@ -261,7 +259,6 @@ 
AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster AddOption(trackletSelectorSlices, int8_t, -1, "", 0, "Number of slices to processes in parallel at max") AddOption(trackletConstructorInPipeline, int8_t, -1, "", 0, "Run tracklet constructor in the pipeline") AddOption(trackletSelectorInPipeline, int8_t, -1, "", 0, "Run tracklet selector in the pipeline") -AddOption(fullMergerOnGPU, bool, true, "", 0, "Perform full TPC track merging on GPU instead of only refit") AddOption(delayedOutput, bool, true, "", 0, "Delay output to be parallel to track fit") AddOption(mergerSortTracks, int8_t, -1, "", 0, "Sort track indizes for GPU track fit") AddOption(alternateBorderSort, int8_t, -1, "", 0, "Alternative implementation for sorting of border tracks") diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 1aa5f9ca0dad8..889e12c258cb4 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -185,12 +185,8 @@ bool GPUChainTracking::ValidateSteps() GPUError("Invalid input, TPC Clusterizer needs TPC raw input"); return false; } - if (param().rec.tpc.mergerReadFromTrackerDirectly && (GetRecoSteps() & GPUDataTypes::RecoStep::TPCMerging) && ((GetRecoStepsInputs() & GPUDataTypes::InOutType::TPCSectorTracks) || (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) || !(GetRecoSteps() & GPUDataTypes::RecoStep::TPCConversion))) { - GPUError("Invalid input / output / step, mergerReadFromTrackerDirectly cannot read/store sectors tracks and needs TPC conversion"); - return false; - } - if (!GetProcessingSettings().fullMergerOnGPU && (param().rec.tpc.mergerReadFromTrackerDirectly || GetProcessingSettings().createO2Output) && (GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging)) { - GPUError("createO2Output and mergerReadFromTrackerDirectly works only in combination with fullMergerOnGPU if the merger is to run on GPU"); + if 
((GetRecoSteps() & GPUDataTypes::RecoStep::TPCMerging) && ((GetRecoStepsInputs() & GPUDataTypes::InOutType::TPCSectorTracks) || (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) || !(GetRecoSteps() & GPUDataTypes::RecoStep::TPCConversion))) { + GPUError("Invalid input / output / step, merger cannot read/store sector tracks and needs TPC conversion"); return false; } bool tpcClustersAvail = (GetRecoStepsInputs() & GPUDataTypes::InOutType::TPCClusters) || (GetRecoSteps() & GPUDataTypes::RecoStep::TPCClusterFinding) || (GetRecoSteps() & GPUDataTypes::RecoStep::TPCDecompression); @@ -265,14 +261,6 @@ bool GPUChainTracking::ValidateSettings() GPUError("Cannot do error interpolation with NWays = 1!"); return false; } - if ((param().rec.tpc.mergerReadFromTrackerDirectly || !param().par.earlyTpcTransform) && param().rec.nonConsecutiveIDs) { - GPUError("incompatible settings for non consecutive ids"); - return false; - } - if (!param().rec.tpc.mergerReadFromTrackerDirectly && GetProcessingSettings().ompKernels) { - GPUError("OMP Kernels require mergerReadFromTrackerDirectly"); - return false; - } if (param().continuousMaxTimeBin > (int32_t)GPUSettings::TPC_MAX_TF_TIME_BIN) { GPUError("configured max time bin exceeds 256 orbits"); return false; } @@ -743,10 +731,6 @@ int32_t GPUChainTracking::RunChain() return 1; } - for (uint32_t i = 0; i < NSLICES; i++) { - // GPUInfo("slice %d clusters %d tracks %d", i, mClusterData[i].NumberOfClusters(), processors()->tpcTrackers[i].Output()->NTracks()); - processors()->tpcMerger.SetSliceData(i, param().rec.tpc.mergerReadFromTrackerDirectly ?
nullptr : processors()->tpcTrackers[i].Output()); - } if (runRecoStep(RecoStep::TPCMerging, &GPUChainTracking::RunTPCTrackingMerger, false)) { return 1; } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index 8dd5140db6952..0831b260f881d 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -24,14 +24,14 @@ using namespace o2::gpu; void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType) { GPUTPCGMMerger& Merger = processors()->tpcMerger; - bool doGPUall = GetRecoStepsGPU() & RecoStep::TPCMerging && GetProcessingSettings().fullMergerOnGPU; - GPUTPCGMMerger& MergerShadow = doGPUall ? processorsShadow()->tpcMerger : Merger; + bool doGPU = GetRecoStepsGPU() & RecoStep::TPCMerging; + GPUTPCGMMerger& MergerShadow = doGPU ? processorsShadow()->tpcMerger : Merger; if (GetProcessingSettings().deterministicGPUReconstruction) { uint32_t nBorderTracks = withinSlice == 1 ? NSLICES : (2 * NSLICES); runKernel({{nBorderTracks, -WarpSize(), 0, deviceType}}, 0); } uint32_t n = withinSlice == -1 ? 
NSLICES / 2 : NSLICES; - if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPUall)) { + if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPU)) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->init); RecordMarker(&mEvents->single, 0); for (uint32_t i = 0; i < n; i++) { @@ -72,7 +72,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice runKernel(GetGridAuto(0, deviceType), i, withinSlice, mergeMode); } } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile, withinSlice, mergeMode); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile, withinSlice, mergeMode); mRec->ReturnVolatileDeviceMemory(); } @@ -89,12 +89,11 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) { mRec->PushNonPersistentMemory(qStr2Tag("TPCMERGE")); bool doGPU = GetRecoStepsGPU() & RecoStep::TPCMerging; - bool doGPUall = doGPU && GetProcessingSettings().fullMergerOnGPU; - GPUReconstruction::krnlDeviceType deviceType = doGPUall ? GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU; - uint32_t numBlocks = (!mRec->IsGPU() || doGPUall) ? BlockCount() : 1; + GPUReconstruction::krnlDeviceType deviceType = doGPU ? GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU; + uint32_t numBlocks = (!mRec->IsGPU() || doGPU) ? BlockCount() : 1; GPUTPCGMMerger& Merger = processors()->tpcMerger; GPUTPCGMMerger& MergerShadow = doGPU ? processorsShadow()->tpcMerger : Merger; - GPUTPCGMMerger& MergerShadowAll = doGPUall ? processorsShadow()->tpcMerger : Merger; + GPUTPCGMMerger& MergerShadowAll = doGPU ? 
processorsShadow()->tpcMerger : Merger; const int32_t outputStream = OutputStream(); if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Running TPC Merger"); @@ -112,7 +111,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) memset(Merger.Memory(), 0, sizeof(*Merger.Memory())); WriteToConstantMemory(RecoStep::TPCMerging, (char*)&processors()->tpcMerger - (char*)processors(), &MergerShadow, sizeof(MergerShadow), 0); - if (doGPUall) { + if (doGPU) { TransferMemoryResourcesToGPU(RecoStep::TPCMerging, &Merger, 0); } @@ -136,14 +135,14 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) if (GetProcessingSettings().deterministicGPUReconstruction) { runKernel({{GPUCA_NSLICES, -WarpSize(), 0, deviceType}}, 1); } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpSliceTracks, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpSliceTracks, *mDebugFile); runKernel(GetGridAuto(0, deviceType), false); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), NSLICES * sizeof(*MergerShadowAll.TmpCounter())); runKernel(GetGridAuto(0, deviceType)); RunTPCTrackingMerger_MergeBorderTracks(1, 0, deviceType); RunTPCTrackingMerger_Resolve(0, 1, deviceType); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergedWithinSlices, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergedWithinSlices, *mDebugFile); runKernel(GetGridAuto(0, deviceType), false); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSLICES * sizeof(*MergerShadowAll.TmpCounter())); @@ -158,7 +157,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridBlk(std::max(2u, numBlocks), 0, deviceType), 0, 1, 1); RunTPCTrackingMerger_MergeBorderTracks(0, -1, deviceType); 
RunTPCTrackingMerger_Resolve(0, 1, deviceType); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergedBetweenSlices, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergedBetweenSlices, *mDebugFile); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSLICES * sizeof(*MergerShadowAll.TmpCounter())); @@ -168,17 +167,17 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel({{1, -WarpSize(), 0, deviceType}}, 1); runKernel({{1, -WarpSize(), 0, deviceType}}, 1); } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile); if (param().rec.tpc.mergeCE) { runKernel(GetGridAuto(0, deviceType), true); RunTPCTrackingMerger_MergeBorderTracks(-1, 1, deviceType); RunTPCTrackingMerger_MergeBorderTracks(-1, 2, deviceType); runKernel(GetGridAuto(0, deviceType)); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile); } int32_t waitForTransfer = 0; - if (doGPUall) { + if (doGPU) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->single); waitForTransfer = 1; } @@ -189,23 +188,21 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridAuto(0, deviceType)); } - uint32_t maxId = param().rec.nonConsecutiveIDs ? 
Merger.Memory()->nOutputTrackClusters : Merger.NMaxClusters(); + uint32_t maxId = Merger.NMaxClusters(); if (maxId > Merger.NMaxClusters()) { throw std::runtime_error("mNMaxClusters too small"); } - if (!param().rec.nonConsecutiveIDs) { - runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.SharedCount(), maxId * sizeof(*MergerShadowAll.SharedCount())); - runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.ClusterAttachment(), maxId * sizeof(*MergerShadowAll.ClusterAttachment())); - runKernel(GetGridAuto(0, deviceType)); - CondWaitEvent(waitForTransfer, &mEvents->single); - runKernel(GetGridAuto(0, deviceType)); - runKernel(GetGridAuto(0, deviceType)); - runKernel(GetGridAuto(0, deviceType)); - } + runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.SharedCount(), maxId * sizeof(*MergerShadowAll.SharedCount())); + runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.ClusterAttachment(), maxId * sizeof(*MergerShadowAll.ClusterAttachment())); + runKernel(GetGridAuto(0, deviceType)); + CondWaitEvent(waitForTransfer, &mEvents->single); + runKernel(GetGridAuto(0, deviceType)); + runKernel(GetGridAuto(0, deviceType)); + runKernel(GetGridAuto(0, deviceType)); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile); - if (doGPUall) { + if (doGPU) { CondWaitEvent(waitForTransfer, &mEvents->single); if (waitForTransfer) { ReleaseEvent(mEvents->single); @@ -228,29 +225,23 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) if (param().rec.tpc.looperInterpolationInExtraPass) { runKernel(GetGridAuto(0)); } - if (doGPU && !doGPUall) { - TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, 0); - SynchronizeStream(0); - } 
DoDebugAndDump(RecoStep::TPCMerging, 2048, Merger, &GPUTPCGMMerger::DumpRefit, *mDebugFile); runKernel(GetGridAuto(0, deviceType)); - if (!param().rec.nonConsecutiveIDs) { - runKernel(GetGridAuto(0, deviceType)); - runKernel(GetGridAuto(0, deviceType)); - } + runKernel(GetGridAuto(0, deviceType)); + runKernel(GetGridAuto(0, deviceType)); if (param().rec.tpc.mergeLoopersAfterburner) { - runKernel(doGPUall ? GetGrid(Merger.NOutputTracks(), 0, deviceType) : GetGridAuto(0, deviceType)); + runKernel(doGPU ? GetGrid(Merger.NOutputTracks(), 0, deviceType) : GetGridAuto(0, deviceType)); if (doGPU) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0); SynchronizeStream(0); // TODO: could probably synchronize on an event after runKernel } runKernel(GetGridAuto(0, deviceType)); - runKernel(doGPUall ? GetGrid(Merger.Memory()->nLooperMatchCandidates, 0, deviceType) : GetGridAuto(0, deviceType)); + runKernel(doGPU ? GetGrid(Merger.Memory()->nLooperMatchCandidates, 0, deviceType) : GetGridAuto(0, deviceType)); } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile); - if (doGPUall) { + if (doGPU) { RecordMarker(&mEvents->single, 0); auto* waitEvent = &mEvents->single; if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1 || mFractionalQAEnabled) { @@ -302,7 +293,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->single); runKernel(GetGridAuto(0, deviceType)); mRec->ReturnVolatileDeviceMemory(); - SynchronizeEventAndRelease(mEvents->single, doGPUall); + SynchronizeEventAndRelease(mEvents->single, doGPU); if (GetProcessingSettings().clearO2OutputFromGPU) { mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile @@ 
-316,7 +307,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) AllocateRegisteredMemory(Merger.MemoryResOutputO2MC(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2Labels)]); TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, -1, true); runKernel(GetGridAuto(0, GPUReconstruction::krnlDeviceType::CPU)); - } else if (doGPUall) { + } else if (doGPU) { RecordMarker(&mEvents->single, 0); TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2(), outputStream, nullptr, &mEvents->single); TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2Clus(), outputStream); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index ba6ba03fca8a1..35a8c6c455048 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -30,11 +30,8 @@ int32_t GPUChainTracking::GlobalTracking(uint32_t iSlice, int32_t threadId, bool GPUInfo("GPU Tracker running Global Tracking for slice %u on thread %d\n", iSlice, threadId); } - GPUReconstruction::krnlDeviceType deviceType = GetProcessingSettings().fullMergerOnGPU ? 
GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU; - runKernel({GetGridBlk(256, iSlice % mRec->NStreams(), deviceType), {iSlice}}); - if (GetProcessingSettings().fullMergerOnGPU) { - TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, processors()->tpcTrackers[iSlice].MemoryResCommon(), iSlice % mRec->NStreams()); - } + runKernel({GetGridBlk(256, iSlice % mRec->NStreams()), {iSlice}}); + TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, processors()->tpcTrackers[iSlice].MemoryResCommon(), iSlice % mRec->NStreams()); if (synchronizeOutput) { SynchronizeStream(iSlice % mRec->NStreams()); } @@ -450,7 +447,7 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() blocking[tmpSlice * mRec->NStreams() + sliceRight % mRec->NStreams()] = true; } } - GlobalTracking(tmpSlice, 0, !GetProcessingSettings().fullMergerOnGPU); + GlobalTracking(tmpSlice, 0, false); } } for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index fab4469eeb488..60dd18a254904 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -81,7 +81,7 @@ struct MergeLooperParam { #include "GPUMemorySizeScalers.h" GPUTPCGMMerger::GPUTPCGMMerger() - : mTrackLinks(nullptr), mNTotalSliceTracks(0), mNMaxTracks(0), mNMaxSingleSliceTracks(0), mNMaxOutputTrackClusters(0), mNMaxClusters(0), mMemoryResMemory(-1), mNClusters(0), mOutputTracks(nullptr), mSliceTrackInfos(nullptr), mSliceTrackInfoIndex(nullptr), mClusters(nullptr), mClustersXYZ(nullptr), mGlobalClusterIDs(nullptr), mClusterAttachment(nullptr), mOutputTracksTPCO2(nullptr), mOutputClusRefsTPCO2(nullptr), mOutputTracksTPCO2MC(nullptr), mTrackOrderAttach(nullptr), mTrackOrderProcess(nullptr), mBorderMemory(nullptr), mBorderRangeMemory(nullptr), mMemory(nullptr), mRetryRefitIds(nullptr), mLoopData(nullptr) + : mTrackLinks(nullptr), mNTotalSliceTracks(0), 
mNMaxTracks(0), mNMaxSingleSliceTracks(0), mNMaxOutputTrackClusters(0), mNMaxClusters(0), mMemoryResMemory(-1), mNClusters(0), mOutputTracks(nullptr), mSliceTrackInfos(nullptr), mSliceTrackInfoIndex(nullptr), mClusters(nullptr), mClustersXYZ(nullptr), mClusterAttachment(nullptr), mOutputTracksTPCO2(nullptr), mOutputClusRefsTPCO2(nullptr), mOutputTracksTPCO2MC(nullptr), mTrackOrderAttach(nullptr), mTrackOrderProcess(nullptr), mBorderMemory(nullptr), mBorderRangeMemory(nullptr), mMemory(nullptr), mRetryRefitIds(nullptr), mLoopData(nullptr) { //* constructor @@ -95,10 +95,6 @@ GPUTPCGMMerger::GPUTPCGMMerger() mPrevSliceInd[0] = mid; mNextSliceInd[last] = NSLICES / 2; mPrevSliceInd[NSLICES / 2] = last; - - for (int32_t i = 0; i < NSLICES; i++) { - mkSlices[i] = nullptr; - } } // DEBUG CODE @@ -180,13 +176,9 @@ int64_t GPUTPCGMMerger::GetTrackLabelA(const S& trk) const for (int32_t i = 0; i < nClusters; i++) { int32_t id; if constexpr (std::is_same::value) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[sliceTrack->Slice()]; - const GPUTPCHitId& ic = tracker.TrackHits()[sliceTrack->OrigTrack()->FirstHitID() + i]; - id = tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[sliceTrack->Slice()][0]; - } else { - id = sliceTrack->OrigTrack()->OutTrackClusters()[i].GetId(); - } + const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[sliceTrack->Slice()]; + const GPUTPCHitId& ic = tracker.TrackHits()[sliceTrack->OrigTrack()->FirstHitID() + i]; + id = tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[sliceTrack->Slice()][0]; } else { id = mClusters[trk.FirstClusterRef() + i].num; } @@ -251,9 +243,6 @@ void* GPUTPCGMMerger::SetPointersMerger(void* mem) { computePointerWithAlignment(mem, mSliceTrackInfos, mNTotalSliceTracks); 
computePointerWithAlignment(mem, mSliceTrackInfoIndex, NSLICES * 2 + 1); - if (mRec->GetParam().rec.nonConsecutiveIDs) { - computePointerWithAlignment(mem, mGlobalClusterIDs, mNMaxOutputTrackClusters); - } if (mRec->GetProcessingSettings().deterministicGPUReconstruction) { computePointerWithAlignment(mem, mTmpSortMemory, std::max(mNTotalSliceTracks, mNMaxTracks * 2)); } @@ -263,7 +252,7 @@ void* GPUTPCGMMerger::SetPointersMerger(void* mem) computePointerWithAlignment(mem, mBorderRangeMemory, 2 * mNTotalSliceTracks); int32_t nTracks = 0; for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - const int32_t n = mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly ? *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks() : mkSlices[iSlice]->NTracks(); + const int32_t n = *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks(); mBorder[iSlice] = mBorderMemory + 2 * nTracks; mBorder[NSLICES + iSlice] = mBorderMemory + 2 * nTracks + n; mBorderRange[iSlice] = mBorderRangeMemory + 2 * nTracks; @@ -296,14 +285,6 @@ void* GPUTPCGMMerger::SetPointersMemory(void* mem) } void* GPUTPCGMMerger::SetPointersRefitScratch(void* mem) -{ - if (mRec->GetProcessingSettings().fullMergerOnGPU) { - mem = SetPointersRefitScratch2(mem); - } - return mem; -} - -void* GPUTPCGMMerger::SetPointersRefitScratch2(void* mem) { computePointerWithAlignment(mem, mTrackOrderAttach, mNMaxTracks); if (mRec->GetProcessingSettings().mergerSortTracks) { @@ -323,9 +304,6 @@ void* GPUTPCGMMerger::SetPointersOutput(void* mem) computePointerWithAlignment(mem, mClustersXYZ, mNMaxOutputTrackClusters); } computePointerWithAlignment(mem, mClusterAttachment, mNMaxClusters); - if (!mRec->GetProcessingSettings().fullMergerOnGPU) { - mem = SetPointersRefitScratch2(mem); - } return mem; } @@ -367,10 +345,10 @@ void* GPUTPCGMMerger::SetPointersOutputO2Scratch(void* mem) void GPUTPCGMMerger::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersMerger, 
(mRec->GetProcessingSettings().fullMergerOnGPU ? 0 : GPUMemoryResource::MEMORY_HOST) | GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCMerger"); + mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersMerger, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCMerger"); mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersRefitScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCMergerRefitScratch"); - mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutput, (mRec->GetProcessingSettings().fullMergerOnGPU ? (mRec->GetProcessingSettings().createO2Output > 1 ? GPUMemoryResource::MEMORY_SCRATCH : GPUMemoryResource::MEMORY_OUTPUT) : GPUMemoryResource::MEMORY_INOUT) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutput"); - mMemoryResOutputState = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputState, (mRec->GetProcessingSettings().fullMergerOnGPU ? (mRec->GetProcessingSettings().outputSharedClusterMap ? GPUMemoryResource::MEMORY_OUTPUT : GPUMemoryResource::MEMORY_GPU) : GPUMemoryResource::MEMORY_HOST) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputState"); + mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutput, (mRec->GetProcessingSettings().createO2Output > 1 ? GPUMemoryResource::MEMORY_SCRATCH : GPUMemoryResource::MEMORY_OUTPUT) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutput"); + mMemoryResOutputState = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputState, (mRec->GetProcessingSettings().outputSharedClusterMap ? 
GPUMemoryResource::MEMORY_OUTPUT : GPUMemoryResource::MEMORY_GPU) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputState"); if (mRec->GetProcessingSettings().createO2Output) { mMemoryResOutputO2Scratch = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputO2Scratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputO2Scratch"); mMemoryResOutputO2 = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputO2, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputO2"); @@ -388,9 +366,9 @@ void GPUTPCGMMerger::SetMaxData(const GPUTrackingInOutPointers& io) mNClusters = 0; mNMaxSingleSliceTracks = 0; for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - uint32_t ntrk = mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly ? *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks() : mkSlices[iSlice]->NTracks(); + uint32_t ntrk = *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks(); mNTotalSliceTracks += ntrk; - mNClusters += mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly ? *mRec->GetConstantMem().tpcTrackers[iSlice].NTrackHits() : mkSlices[iSlice]->NTrackClusters(); + mNClusters += *mRec->GetConstantMem().tpcTrackers[iSlice].NTrackHits(); if (mNMaxSingleSliceTracks < ntrk) { mNMaxSingleSliceTracks = ntrk; } @@ -417,12 +395,12 @@ void GPUTPCGMMerger::SetMaxData(const GPUTrackingInOutPointers& io) int32_t GPUTPCGMMerger::CheckSlices() { for (int32_t i = 0; i < NSLICES; i++) { - if ((Param().rec.tpc.mergerReadFromTrackerDirectly ? 
mRec->GetConstantMem().tpcTrackers[i].CommonMemory()->nLocalTracks : mkSlices[i]->NLocalTracks()) > mNMaxSingleSliceTracks) { + if (mRec->GetConstantMem().tpcTrackers[i].CommonMemory()->nLocalTracks > (int32_t)mNMaxSingleSliceTracks) { throw std::runtime_error("mNMaxSingleSliceTracks too small"); } } - if (!(mRec->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking) && (!Param().rec.nonConsecutiveIDs || Param().rec.tpc.mergerReadFromTrackerDirectly)) { - throw std::runtime_error("Must run also slice tracking if nonConsecutiveIDs = false or mergerReadFromTrackerDirectly"); + if (!(mRec->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking)) { + throw std::runtime_error("Must run also slice tracking"); } return 0; } @@ -469,32 +447,18 @@ GPUd() int32_t GPUTPCGMMerger::RefitSliceTrack(GPUTPCGMSliceTrack& sliceTrack, c for (int32_t i = start; i != end; i += incr) { float x, y, z; int32_t row, flags; - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[slice]; - const GPUTPCHitId& ic = tracker.TrackHits()[inTrack->FirstHitID() + i]; - int32_t clusterIndex = tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()); - row = ic.RowIndex(); - const ClusterNative& cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear[GetConstantMem()->ioPtrs.clustersNative->clusterOffset[slice][0] + clusterIndex]; - flags = cl.getFlags(); - if (Param().par.earlyTpcTransform) { - x = tracker.Data().ClusterData()[clusterIndex].x; - y = tracker.Data().ClusterData()[clusterIndex].y; - z = tracker.Data().ClusterData()[clusterIndex].z - trk.TZOffset(); - } else { - GetConstantMem()->calibObjects.fastTransformHelper->Transform(slice, row, cl.getPad(), cl.getTime(), x, y, z, trk.TZOffset()); - } + const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[slice]; + const GPUTPCHitId& ic = tracker.TrackHits()[inTrack->FirstHitID() + i]; + int32_t clusterIndex = 
tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()); + row = ic.RowIndex(); + const ClusterNative& cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear[GetConstantMem()->ioPtrs.clustersNative->clusterOffset[slice][0] + clusterIndex]; + flags = cl.getFlags(); + if (Param().par.earlyTpcTransform) { + x = tracker.Data().ClusterData()[clusterIndex].x; + y = tracker.Data().ClusterData()[clusterIndex].y; + z = tracker.Data().ClusterData()[clusterIndex].z - trk.TZOffset(); } else { - const GPUTPCSliceOutCluster& clo = inTrack->OutTrackCluster(i); - row = clo.GetRow(); - flags = clo.GetFlags(); - if (Param().par.earlyTpcTransform) { - x = clo.GetX(); - y = clo.GetY(); - z = clo.GetZ() - trk.TZOffset(); - } else { - const ClusterNative& cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear[clo.GetId()]; - GetConstantMem()->calibObjects.fastTransformHelper->Transform(slice, row, cl.getPad(), cl.getTime(), x, y, z, trk.TZOffset()); - } + GetConstantMem()->calibObjects.fastTransformHelper->Transform(slice, row, cl.getPad(), cl.getTime(), x, y, z, trk.TZOffset()); } if (prop.PropagateToXAlpha(x, alpha, true)) { return way == 0; @@ -516,25 +480,16 @@ GPUd() int32_t GPUTPCGMMerger::RefitSliceTrack(GPUTPCGMSliceTrack& sliceTrack, c GPUd() void GPUTPCGMMerger::SetTrackClusterZT(GPUTPCGMSliceTrack& track, int32_t iSlice, const GPUTPCTrack* sliceTr) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; - const GPUTPCHitId& ic1 = trk.TrackHits()[sliceTr->FirstHitID()]; - const GPUTPCHitId& ic2 = trk.TrackHits()[sliceTr->FirstHitID() + sliceTr->NHits() - 1]; - int32_t clusterIndex1 = trk.Data().ClusterDataIndex(trk.Data().Row(ic1.RowIndex()), ic1.HitIndex()); - int32_t clusterIndex2 = trk.Data().ClusterDataIndex(trk.Data().Row(ic2.RowIndex()), ic2.HitIndex()); - if (Param().par.earlyTpcTransform) { - track.SetClusterZT(trk.Data().ClusterData()[clusterIndex1].z, 
trk.Data().ClusterData()[clusterIndex2].z); - } else { - const ClusterNative* cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; - track.SetClusterZT(cl[clusterIndex1].getTime(), cl[clusterIndex2].getTime()); - } + const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; + const GPUTPCHitId& ic1 = trk.TrackHits()[sliceTr->FirstHitID()]; + const GPUTPCHitId& ic2 = trk.TrackHits()[sliceTr->FirstHitID() + sliceTr->NHits() - 1]; + int32_t clusterIndex1 = trk.Data().ClusterDataIndex(trk.Data().Row(ic1.RowIndex()), ic1.HitIndex()); + int32_t clusterIndex2 = trk.Data().ClusterDataIndex(trk.Data().Row(ic2.RowIndex()), ic2.HitIndex()); + if (Param().par.earlyTpcTransform) { + track.SetClusterZT(trk.Data().ClusterData()[clusterIndex1].z, trk.Data().ClusterData()[clusterIndex2].z); } else { - if (Param().par.earlyTpcTransform) { - track.SetClusterZT(sliceTr->OutTrackClusters()->GetZ(), (sliceTr->OutTrackClusters() + sliceTr->NHits() - 1)->GetZ()); - } else { - const ClusterNative* cls = mConstantMem->ioPtrs.clustersNative->clustersLinear; - track.SetClusterZT(cls[sliceTr->OutTrackClusters()->GetId()].getTime(), cls[(sliceTr->OutTrackClusters() + sliceTr->NHits() - 1)->GetId()].getTime()); - } + const ClusterNative* cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; + track.SetClusterZT(cl[clusterIndex1].getTime(), cl[clusterIndex2].getTime()); } } @@ -548,14 +503,10 @@ GPUd() void GPUTPCGMMerger::UnpackSliceGlobal(int32_t nBlocks, int32_t nThreads, const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; float alpha = Param().Alpha(iSlice); const GPUTPCTrack* sliceTr = mMemory->firstGlobalTracks[iSlice]; - uint32_t nLocalTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? 
trk.CommonMemory()->nLocalTracks : mkSlices[iSlice]->NLocalTracks(); - uint32_t nTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? *trk.NTracks() : mkSlices[iSlice]->NTracks(); + uint32_t nLocalTracks = trk.CommonMemory()->nLocalTracks; + uint32_t nTracks = *trk.NTracks(); for (uint32_t itr = nLocalTracks + iBlock * nThreads + iThread; itr < nTracks; itr += nBlocks * nThreads) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - sliceTr = &trk.Tracks()[itr]; - } else if (itr > nLocalTracks) { - sliceTr = sliceTr->GetNextTrack(); - } + sliceTr = &trk.Tracks()[itr]; int32_t localId = mTrackIDs[(sliceTr->LocalTrackId() >> 24) * mNMaxSingleSliceTracks + (sliceTr->LocalTrackId() & 0xFFFFFF)]; if (localId == -1) { continue; @@ -576,7 +527,7 @@ GPUd() void GPUTPCGMMerger::UnpackSliceGlobal(int32_t nBlocks, int32_t nThreads, GPUd() void GPUTPCGMMerger::UnpackResetIds(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, int32_t iSlice) { const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; - uint32_t nLocalTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? trk.CommonMemory()->nLocalTracks : mkSlices[iSlice]->NLocalTracks(); + uint32_t nLocalTracks = trk.CommonMemory()->nLocalTracks; for (uint32_t i = iBlock * nThreads + iThread; i < nLocalTracks; i += nBlocks * nThreads) { mTrackIDs[iSlice * mNMaxSingleSliceTracks + i] = -1; } @@ -585,17 +536,13 @@ GPUd() void GPUTPCGMMerger::UnpackResetIds(int32_t nBlocks, int32_t nThreads, in GPUd() void GPUTPCGMMerger::RefitSliceTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, int32_t iSlice) { const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; - uint32_t nLocalTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? 
trk.CommonMemory()->nLocalTracks : mkSlices[iSlice]->NLocalTracks(); + uint32_t nLocalTracks = trk.CommonMemory()->nLocalTracks; float alpha = Param().Alpha(iSlice); - const GPUTPCTrack* sliceTr = Param().rec.tpc.mergerReadFromTrackerDirectly ? nullptr : mkSlices[iSlice]->GetFirstTrack(); + const GPUTPCTrack* sliceTr = nullptr; for (uint32_t itr = iBlock * nThreads + iThread; itr < nLocalTracks; itr += nBlocks * nThreads) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - sliceTr = &trk.Tracks()[itr]; - } else if (itr) { - sliceTr = sliceTr->GetNextTrack(); - } + sliceTr = &trk.Tracks()[itr]; GPUTPCGMSliceTrack track; SetTrackClusterZT(track, iSlice, sliceTr); if (Param().rec.tpc.mergerCovSource == 0) { @@ -626,9 +573,6 @@ GPUd() void GPUTPCGMMerger::RefitSliceTracks(int32_t nBlocks, int32_t nThreads, mTrackIDs[iSlice * mNMaxSingleSliceTracks + sliceTr->LocalTrackId()] = myTrack; mSliceTrackInfos[myTrack] = track; } - if (!Param().rec.tpc.mergerReadFromTrackerDirectly) { - mMemory->firstGlobalTracks[iSlice] = nLocalTracks ? sliceTr->GetNextTrack() : mkSlices[iSlice]->GetFirstTrack(); - } } GPUd() void GPUTPCGMMerger::LinkGlobalTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) @@ -730,7 +674,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<0>(int32_t nBlocks, int32_t nThrea { CADEBUG(GPUInfo("\nMERGING Slices %d %d NTracks %d %d CROSS %d", iSlice1, iSlice2, N1, N2, mergeMode)); GPUTPCGMBorderRange* range1 = mBorderRange[iSlice1]; - GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + (Param().rec.tpc.mergerReadFromTrackerDirectly ? 
*GetConstantMem()->tpcTrackers[iSlice2].NTracks() : mkSlices[iSlice2]->NTracks()); + GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + *GetConstantMem()->tpcTrackers[iSlice2].NTracks(); bool sameSlice = (iSlice1 == iSlice2); for (int32_t itr = iBlock * nThreads + iThread; itr < N1; itr += nThreads * nBlocks) { GPUTPCGMBorderTrack& b = B1[itr]; @@ -774,7 +718,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<1>(int32_t nBlocks, int32_t nThrea { #if !defined(GPUCA_GPUCODE_COMPILEKERNELS) GPUTPCGMBorderRange* range1 = mBorderRange[iSlice1]; - GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + (Param().rec.tpc.mergerReadFromTrackerDirectly ? *GetConstantMem()->tpcTrackers[iSlice2].NTracks() : mkSlices[iSlice2]->NTracks()); + GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + *GetConstantMem()->tpcTrackers[iSlice2].NTracks(); if (iThread == 0) { if (iBlock == 0) { @@ -864,7 +808,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<2>(int32_t nBlocks, int32_t nThrea bool sameSlice = (iSlice1 == iSlice2); GPUTPCGMBorderRange* range1 = mBorderRange[iSlice1]; - GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + (Param().rec.tpc.mergerReadFromTrackerDirectly ? 
*GetConstantMem()->tpcTrackers[iSlice2].NTracks() : mkSlices[iSlice2]->NTracks()); + GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + *GetConstantMem()->tpcTrackers[iSlice2].NTracks(); int32_t i2 = 0; for (int32_t i1 = iBlock * nThreads + iThread; i1 < N1; i1 += nThreads * nBlocks) { @@ -1326,10 +1270,6 @@ GPUd() void GPUTPCGMMerger::ResolveMergeSlices(GPUResolveSharedMemory& smem, int GPUd() void GPUTPCGMMerger::MergeCEFill(const GPUTPCGMSliceTrack* track, const GPUTPCGMMergedTrackHit& cls, const GPUTPCGMMergedTrackHitXYZ* clsXYZ, int32_t itr) { - if (Param().rec.nonConsecutiveIDs) { - return; - } - if (Param().rec.tpc.mergerCERowLimit > 0 && CAMath::Abs(track->QPt()) * Param().qptB5Scaler < 0.3f && (cls.row < Param().rec.tpc.mergerCERowLimit || cls.row >= GPUCA_ROW_COUNT - Param().rec.tpc.mergerCERowLimit)) { return; } @@ -1646,16 +1586,10 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread int32_t nTrackHits = t->NClusters(); trackCluster* c2 = trackClusters + nHits + nTrackHits - 1; for (int32_t i = 0; i < nTrackHits; i++, c2--) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[t->Slice()]; - const GPUTPCHitId& ic = trk.TrackHits()[t->OrigTrack()->FirstHitID() + i]; - uint32_t id = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[t->Slice()][0]; - *c2 = trackCluster{id, (uint8_t)ic.RowIndex(), t->Slice(), t->Leg()}; - } else { - const GPUTPCSliceOutCluster& c = t->OrigTrack()->OutTrackClusters()[i]; - uint32_t id = Param().rec.nonConsecutiveIDs ? 
((uint32_t)((uint32_t*)&c - (uint32_t*)mkSlices[t->Slice()]->GetFirstTrack())) : c.GetId(); - *c2 = trackCluster{id, c.GetRow(), t->Slice(), t->Leg()}; - } + const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[t->Slice()]; + const GPUTPCHitId& ic = trk.TrackHits()[t->OrigTrack()->FirstHitID() + i]; + uint32_t id = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[t->Slice()][0]; + *c2 = trackCluster{id, (uint8_t)ic.RowIndex(), t->Slice(), t->Leg()}; } nHits += nTrackHits; } @@ -1771,19 +1705,7 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread for (int32_t i = 0; i < nHits; i++) { uint8_t state; - if (Param().rec.nonConsecutiveIDs) { - const GPUTPCSliceOutCluster* c = (const GPUTPCSliceOutCluster*)((const int32_t*)mkSlices[trackClusters[i].slice]->GetFirstTrack() + trackClusters[i].id); - clXYZ[i].x = c->GetX(); - clXYZ[i].y = c->GetY(); - clXYZ[i].z = c->GetZ(); - clXYZ[i].amp = c->GetAmp(); - trackClusters[i].id = c->GetId(); -#ifdef GPUCA_TPC_RAW_PROPAGATE_PAD_ROW_TIME - cl[i] XYZ.pad = c->mPad; - cl[i] XYZ.time = c->mTime; -#endif - state = c->GetFlags(); - } else if (Param().par.earlyTpcTransform) { + if (Param().par.earlyTpcTransform) { const GPUTPCClusterData& c = GetConstantMem()->tpcTrackers[trackClusters[i].slice].ClusterData()[trackClusters[i].id - GetConstantMem()->tpcTrackers[trackClusters[i].slice].Data().ClusterIdOffset()]; clXYZ[i].x = c.x; clXYZ[i].y = c.y; @@ -1800,16 +1722,10 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread } cl[i].state = state & GPUTPCGMMergedTrackHit::clustererAndSharedFlags; // Only allow edge, deconvoluted, and shared flags cl[i].row = trackClusters[i].row; - if (!Param().rec.nonConsecutiveIDs) // We already have global consecutive numbers from the slice tracker, and we need to keep them for late cluster attachment - { - cl[i].num = trackClusters[i].id; - } else { // 
Produce consecutive numbers for shared cluster flagging - cl[i].num = iOutTrackFirstCluster + i; - mGlobalClusterIDs[cl[i].num] = trackClusters[i].id; - } + cl[i].num = trackClusters[i].id; cl[i].slice = trackClusters[i].slice; cl[i].leg = trackClusters[i].leg; - } // nHits + } uint32_t iOutputTrack = CAMath::AtomicAdd(&mMemory->nOutputTracks, 1u); if (iOutputTrack >= mNMaxTracks) { @@ -2052,17 +1968,11 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit2(int32_t nBlocks, int32_t nThr GPUd() void GPUTPCGMMerger::Finalize0(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - if (Param().rec.nonConsecutiveIDs) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { - mClusters[i].num = mGlobalClusterIDs[i]; - } - } else { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { - mTrackSort[mTrackOrderAttach[i]] = i; - } - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { - mClusterAttachment[mClusters[i].num] = 0; // Reset adjacent attachment for attached clusters, set correctly below - } + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { + mTrackSort[mTrackOrderAttach[i]] = i; + } + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { + mClusterAttachment[mClusters[i].num] = 0; // Reset adjacent attachment for attached clusters, set correctly below } } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h index a9b510e1714ba..3e4ae535fb740 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h @@ -98,7 +98,6 @@ class GPUTPCGMMerger : public GPUProcessor void SetMaxData(const GPUTrackingInOutPointers& io); void* SetPointersMerger(void* mem); void* SetPointersRefitScratch(void* mem); - void* 
SetPointersRefitScratch2(void* mem); void* SetPointersOutput(void* mem); void* SetPointersOutputO2(void* mem); void* SetPointersOutputO2Clus(void* mem); @@ -107,8 +106,6 @@ class GPUTPCGMMerger : public GPUProcessor void* SetPointersOutputState(void* mem); void* SetPointersMemory(void* mem); - void SetSliceData(int32_t index, const GPUTPCSliceOutput* sliceData) { mkSlices[index] = sliceData; } - GPUhdi() int32_t NOutputTracks() const { return mMemory->nOutputTracks; } GPUhdi() const GPUTPCGMMergedTrack* OutputTracks() const { return mOutputTracks; } GPUhdi() GPUTPCGMMergedTrack* OutputTracks() { return mOutputTracks; } @@ -246,8 +243,6 @@ class GPUTPCGMMerger : public GPUProcessor int32_t mNextSliceInd[NSLICES]; int32_t mPrevSliceInd[NSLICES]; - const GPUTPCSliceOutput* mkSlices[NSLICES]; //* array of input slice tracks - int32_t* mTrackLinks; int32_t* mTrackCCRoots; // root of the connected component of this track @@ -273,7 +268,6 @@ class GPUTPCGMMerger : public GPUProcessor int32_t* mSliceTrackInfoIndex; GPUTPCGMMergedTrackHit* mClusters; GPUTPCGMMergedTrackHitXYZ* mClustersXYZ; - int32_t* mGlobalClusterIDs; GPUAtomic(uint32_t) * mClusterAttachment; o2::tpc::TrackTPC* mOutputTracksTPCO2; uint32_t* mOutputClusRefsTPCO2; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index a59af7529a97d..0463966c582a5 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -67,7 +67,7 @@ void GPUTPCGMMerger::DumpMergeRanges(std::ostream& out, int32_t withinSlice, int GPUTPCGMBorderTrack *b1, *b2; int32_t jSlice; MergeBorderTracksSetup(n1, n2, b1, b2, jSlice, i, withinSlice, mergeMode); - const int32_t nTrk = Param().rec.tpc.mergerReadFromTrackerDirectly ? 
*mRec->GetConstantMem().tpcTrackers[jSlice].NTracks() : mkSlices[jSlice]->NTracks(); + const int32_t nTrk = *mRec->GetConstantMem().tpcTrackers[jSlice].NTracks(); const gputpcgmmergertypes::GPUTPCGMBorderRange* range1 = BorderRange(i); const gputpcgmmergertypes::GPUTPCGMBorderRange* range2 = BorderRange(jSlice) + nTrk; out << "\nBorder Tracks : i " << i << " withinSlice " << withinSlice << " mergeMode " << mergeMode << "\n"; @@ -174,7 +174,7 @@ void GPUTPCGMMerger::DumpFitPrepare(std::ostream& out) const } out << "\n"; } - uint32_t maxId = Param().rec.nonConsecutiveIDs ? mMemory->nOutputTrackClusters : mNMaxClusters; + uint32_t maxId = mNMaxClusters; uint32_t j = 0; for (uint32_t i = 0; i < maxId; i++) { if ((mClusterAttachment[i] & attachFlagMask) != 0) { @@ -225,7 +225,7 @@ void GPUTPCGMMerger::DumpFinal(std::ostream& out) const } out << "\n"; } - uint32_t maxId = Param().rec.nonConsecutiveIDs ? mMemory->nOutputTrackClusters : mNMaxClusters; + uint32_t maxId = mNMaxClusters; uint32_t j = 0; for (uint32_t i = 0; i < maxId; i++) { if ((mClusterAttachment[i] & attachFlagMask) != 0) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx b/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx index 3c774b13ce5b1..6c8641517b80d 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx @@ -95,26 +95,15 @@ GPUd() void GPUTPCGMSliceTrack::SetParam2(const GPUTPCGMTrackParam& trk) GPUd() bool GPUTPCGMSliceTrack::FilterErrors(const GPUTPCGMMerger* merger, int32_t iSlice, float maxSinPhi, float sinPhiMargin) { float lastX; - if (merger->Param().par.earlyTpcTransform && !merger->Param().rec.tpc.mergerReadFromTrackerDirectly) { - lastX = mOrigTrack->OutTrackCluster(mOrigTrack->NHits() - 1).GetX(); // TODO: Why is this needed, Row2X should work, but looses some tracks - } else { - //float lastX = merger->Param().tpcGeometry.Row2X(mOrigTrack->Cluster(mOrigTrack->NClusters() - 1).GetRow()); // TODO: again, why does this reduce 
efficiency? - float y, z; - const GPUTPCSliceOutCluster* clo; - int32_t row, index; - if (merger->Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& trk = merger->GetConstantMem()->tpcTrackers[iSlice]; - const GPUTPCHitId& ic = trk.TrackHits()[mOrigTrack->FirstHitID() + mOrigTrack->NHits() - 1]; - index = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + merger->GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; - row = ic.RowIndex(); - } else { - clo = &mOrigTrack->OutTrackCluster(mOrigTrack->NHits() - 1); - index = clo->GetId(); - row = clo->GetRow(); - } - const ClusterNative& cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[index]; - GPUTPCConvertImpl::convert(*merger->GetConstantMem(), iSlice, row, cl.getPad(), cl.getTime(), lastX, y, z); - } + // float lastX = merger->Param().tpcGeometry.Row2X(mOrigTrack->Cluster(mOrigTrack->NClusters() - 1).GetRow()); // TODO: Why is this needed to be set below, Row2X should work, but looses some tracks + float y, z; + int32_t row, index; + const GPUTPCTracker& trk = merger->GetConstantMem()->tpcTrackers[iSlice]; + const GPUTPCHitId& ic = trk.TrackHits()[mOrigTrack->FirstHitID() + mOrigTrack->NHits() - 1]; + index = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + merger->GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; + row = ic.RowIndex(); + const ClusterNative& cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[index]; + GPUTPCConvertImpl::convert(*merger->GetConstantMem(), iSlice, row, cl.getPad(), cl.getTime(), lastX, y, z); const int32_t N = 3; diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx index c038146cf8497..d5a941b333c6e 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx @@ -64,7 +64,7 @@ void GPUTPCTracker::InitializeProcessor() bool 
GPUTPCTracker::SliceDataOnGPU() { - return (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) && (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) && mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly && (mRec->GetConstantMem().ioPtrs.clustersNative || mRec->GetConstantMem().ioPtrs.tpcZS || mRec->GetConstantMem().ioPtrs.tpcPackedDigits); + return (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) && (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) && (mRec->GetConstantMem().ioPtrs.clustersNative || mRec->GetConstantMem().ioPtrs.tpcZS || mRec->GetConstantMem().ioPtrs.tpcPackedDigits); } void* GPUTPCTracker::SetPointersDataInput(void* mem) { return mData.SetPointersInput(mem, mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging, SliceDataOnGPU()); } @@ -117,7 +117,7 @@ void GPUTPCTracker::RegisterMemoryAllocation() mMemoryResCommon = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersCommon, GPUMemoryResource::MEMORY_PERMANENT, "TPCTrackerCommon"); mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataRows, GPUMemoryResource::MEMORY_PERMANENT, "TPCSliceRows"); - uint32_t type = mRec->GetProcessingSettings().fullMergerOnGPU ? 
GPUMemoryResource::MEMORY_SCRATCH : GPUMemoryResource::MEMORY_OUTPUT; + uint32_t type = GPUMemoryResource::MEMORY_SCRATCH; if (mRec->GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { // For individual scheme, we allocate tracklets separately, and change the type for the following allocations to custom type |= GPUMemoryResource::MEMORY_CUSTOM; mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type, "TPCTrackerTracklets"); diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 4bfcc312e27e7..e6017788144e0 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -164,7 +164,7 @@ int32_t ReadConfiguration(int argc, char** argv) } #endif #ifndef GPUCA_TPC_GEOMETRY_O2 - configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0; +#error Why was configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0 needed? 
configStandalone.proc.ompKernels = false; configStandalone.proc.createO2Output = 0; if (configStandalone.rundEdx == -1) { @@ -412,7 +412,7 @@ int32_t SetupReconstruction() } steps.outputs.clear(); - steps.outputs.setBits(GPUDataTypes::InOutType::TPCSectorTracks, steps.steps.isSet(GPUDataTypes::RecoStep::TPCSliceTracking) && !recSet.tpc.mergerReadFromTrackerDirectly); + steps.outputs.setBits(GPUDataTypes::InOutType::TPCSectorTracks, false); steps.outputs.setBits(GPUDataTypes::InOutType::TPCMergedTracks, steps.steps.isSet(GPUDataTypes::RecoStep::TPCMerging)); steps.outputs.setBits(GPUDataTypes::InOutType::TPCCompressedClusters, steps.steps.isSet(GPUDataTypes::RecoStep::TPCCompression)); steps.outputs.setBits(GPUDataTypes::InOutType::TRDTracks, steps.steps.isSet(GPUDataTypes::RecoStep::TRDTracking)); diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 2aa0611b33779..70a093c7f1de7 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -909,11 +909,6 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx bool mcAvail = mcPresent() || tracksExtMC; - if (mcAvail && !tracksExtMC && mTracking->GetParam().rec.nonConsecutiveIDs) { - GPUError("QA incompatible to non-consecutive MC labels"); - return; - } - if (mcAvail) { // Assign Track MC Labels timer.Start(); From 91b094539e2e18848ae407fd54efa8d025ffabb6 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 4 Feb 2025 01:44:02 +0100 Subject: [PATCH 2/2] GPU: Remove support for host helper threads (no longer used) --- Common/Topologies/o2prototype_topology.xml | 2 +- GPU/GPUTracking/Base/GPUReconstruction.cxx | 3 - GPU/GPUTracking/Base/GPUReconstructionCPU.h | 10 +- .../Base/GPUReconstructionDeviceBase.cxx | 139 ------------------ .../Base/GPUReconstructionDeviceBase.h | 17 +-- .../Base/GPUReconstructionHelpers.h | 50 ------- GPU/GPUTracking/CMakeLists.txt | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 - GPU/GPUTracking/Global/GPUChain.h | 13 -- 
GPU/GPUTracking/Global/GPUChainTracking.h | 7 +- .../Global/GPUChainTrackingSliceTracker.cxx | 65 +------- 11 files changed, 8 insertions(+), 300 deletions(-) delete mode 100644 GPU/GPUTracking/Base/GPUReconstructionHelpers.h diff --git a/Common/Topologies/o2prototype_topology.xml b/Common/Topologies/o2prototype_topology.xml index 240b8d87d469a..8d53c9eb0127a 100644 --- a/Common/Topologies/o2prototype_topology.xml +++ b/Common/Topologies/o2prototype_topology.xml @@ -74,7 +74,7 @@ The following parameters need adjustment when extending the FLP-EPN configuratio - $ALICEO2_INSTALL_DIR/bin/aliceHLTWrapper Tracker_%collectionIndex%_%taskIndex% 1 --dds --poll-period 100 --input type=pull,size=5000,method=connect,property=EPNReceiverOutputAddress,count=1 --output type=push,size=500,method=bind,property=TrackingOutputAddress,min-port=48000 --library libAliHLTTPC.so --component TPCCATracker --run 167808 --parameter '-GlobalTracking -allowGPU -GPUHelperThreads 4 -loglevel=0x7c' + $ALICEO2_INSTALL_DIR/bin/aliceHLTWrapper Tracker_%collectionIndex%_%taskIndex% 1 --dds --poll-period 100 --input type=pull,size=5000,method=connect,property=EPNReceiverOutputAddress,count=1 --output type=push,size=500,method=bind,property=TrackingOutputAddress,min-port=48000 --library libAliHLTTPC.so --component TPCCATracker --run 167808 --parameter '-GlobalTracking -allowGPU -loglevel=0x7c' EPNReceiverOutputAddress diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 1496300818fd8..270f092a1fd29 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -278,9 +278,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() if (!(mRecoSteps.stepsGPUMask & GPUDataTypes::RecoStep::TPCMerging)) { mProcessingSettings.mergerSortTracks = false; } - if (!IsGPU()) { - mProcessingSettings.nDeviceHelperThreads = 0; - } if (mProcessingSettings.debugLevel > 3 || !IsGPU() || 
mProcessingSettings.deterministicGPUReconstruction) { mProcessingSettings.delayedOutput = false; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 8cc753731d074..27959382e7b67 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -16,7 +16,6 @@ #define GPURECONSTRUCTIONICPU_H #include "GPUReconstruction.h" -#include "GPUReconstructionHelpers.h" #include "GPUConstantMem.h" #include #include "utils/timer.h" @@ -117,13 +116,6 @@ class GPUReconstructionCPU : public GPUReconstructionKernelsPtr(), res->PtrDevice()); } size_t TransferMemoryResourceToHost(GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { return TransferMemoryInternal(res, stream, ev, evList, nEvents, false, res->PtrDevice(), res->Ptr()); } @@ -294,7 +286,7 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num) static int32_t id = getNextTimerId(); timerMeta* timer = getTimerById(id); if (timer == nullptr) { - int32_t max = std::max({getOMPMaxThreads(), mProcessingSettings.nDeviceHelperThreads + 1, mProcessingSettings.nStreams}); + int32_t max = std::max({getOMPMaxThreads(), mProcessingSettings.nStreams}); timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep); } if (num == -1) { diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx index 3522095622ad4..91715fab4f668 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx @@ -41,57 +41,6 @@ GPUReconstructionDeviceBase::GPUReconstructionDeviceBase(const GPUSettingsDevice GPUReconstructionDeviceBase::~GPUReconstructionDeviceBase() = default; -void* GPUReconstructionDeviceBase::helperWrapper_static(void* arg) -{ - GPUReconstructionHelpers::helperParam* par = (GPUReconstructionHelpers::helperParam*)arg; - 
GPUReconstructionDeviceBase* cls = par->cls; - return cls->helperWrapper(par); -} - -void* GPUReconstructionDeviceBase::helperWrapper(GPUReconstructionHelpers::helperParam* par) -{ - if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("\tHelper thread %d starting", par->num); - } - - // cpu_set_t mask; //TODO add option - // CPU_ZERO(&mask); - // CPU_SET(par->num * 2 + 2, &mask); - // sched_setaffinity(0, sizeof(mask), &mask); - - par->mutex[0].lock(); - while (par->terminate == false) { - for (int32_t i = par->num + 1; i < par->count; i += mProcessingSettings.nDeviceHelperThreads + 1) { - // if (mProcessingSettings.debugLevel >= 3) GPUInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->num, i, par->phase); - if ((par->functionCls->*par->function)(i, par->num + 1, par)) { - par->error = 1; - } - if (par->reset) { - break; - } - par->done = i + 1; - // if (mProcessingSettings.debugLevel >= 3) GPUInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->num, i, par->phase); - } - ResetThisHelperThread(par); - par->mutex[0].lock(); - } - if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("\tHelper thread %d terminating", par->num); - } - par->mutex[1].unlock(); - pthread_exit(nullptr); - return (nullptr); -} - -void GPUReconstructionDeviceBase::ResetThisHelperThread(GPUReconstructionHelpers::helperParam* par) -{ - if (par->reset) { - GPUImportant("GPU Helper Thread %d reseting", par->num); - } - par->reset = false; - par->mutex[1].unlock(); -} - int32_t GPUReconstructionDeviceBase::GetGlobalLock(void*& pLock) { #ifdef _WIN32 @@ -138,86 +87,6 @@ void GPUReconstructionDeviceBase::ReleaseGlobalLock(void* sem) #endif } -void GPUReconstructionDeviceBase::ResetHelperThreads(int32_t helpers) -{ - GPUImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d (%d))", mProcessingSettings.nDeviceHelperThreads, mNSlaveThreads); - SynchronizeGPU(); - for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) { - 
mHelperParams[i].reset = true; - if (helpers || i >= mProcessingSettings.nDeviceHelperThreads) { - pthread_mutex_lock(&((pthread_mutex_t*)mHelperParams[i].mutex)[1]); - } - } - GPUImportant("GPU Tracker helper threads have ben reset"); -} - -int32_t GPUReconstructionDeviceBase::StartHelperThreads() -{ - int32_t nThreads = mProcessingSettings.nDeviceHelperThreads; - if (nThreads) { - mHelperParams = new GPUReconstructionHelpers::helperParam[nThreads]; - if (mHelperParams == nullptr) { - GPUError("Memory allocation error"); - ExitDevice(); - return (1); - } - for (int32_t i = 0; i < nThreads; i++) { - mHelperParams[i].cls = this; - mHelperParams[i].terminate = false; - mHelperParams[i].reset = false; - mHelperParams[i].num = i; - for (int32_t j = 0; j < 2; j++) { - mHelperParams[i].mutex[j].lock(); - } - - if (pthread_create(&mHelperParams[i].threadId, nullptr, helperWrapper_static, &mHelperParams[i])) { - GPUError("Error starting slave thread"); - ExitDevice(); - return (1); - } - } - } - mNSlaveThreads = nThreads; - return (0); -} - -int32_t GPUReconstructionDeviceBase::StopHelperThreads() -{ - if (mNSlaveThreads) { - for (int32_t i = 0; i < mNSlaveThreads; i++) { - mHelperParams[i].terminate = true; - mHelperParams[i].mutex[0].unlock(); - mHelperParams[i].mutex[1].lock(); - if (pthread_join(mHelperParams[i].threadId, nullptr)) { - GPUError("Error waiting for thread to terminate"); - return (1); - } - } - delete[] mHelperParams; - } - mNSlaveThreads = 0; - return (0); -} - -void GPUReconstructionDeviceBase::WaitForHelperThreads() -{ - for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) { - pthread_mutex_lock(&((pthread_mutex_t*)mHelperParams[i].mutex)[1]); - } -} - -void GPUReconstructionDeviceBase::RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t i, int32_t t, GPUReconstructionHelpers::helperParam* p), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) -{ - for (int32_t i = 0; i < 
mProcessingSettings.nDeviceHelperThreads; i++) { - mHelperParams[i].done = 0; - mHelperParams[i].error = 0; - mHelperParams[i].function = function; - mHelperParams[i].functionCls = functionCls; - mHelperParams[i].count = count; - pthread_mutex_unlock(&((pthread_mutex_t*)mHelperParams[i].mutex)[0]); - } -} - int32_t GPUReconstructionDeviceBase::InitDevice() { // cpu_set_t mask; @@ -262,10 +131,6 @@ int32_t GPUReconstructionDeviceBase::InitDevice() mProcShadow.mMemoryResProcessors = RegisterMemoryAllocation(&mProcShadow, &GPUProcessorProcessors::SetPointersDeviceProcessor, GPUMemoryResource::MEMORY_PERMANENT | GPUMemoryResource::MEMORY_HOST, "Processors"); AllocateRegisteredMemory(mProcShadow.mMemoryResProcessors); - if (StartHelperThreads()) { - return (1); - } - if (mMaster == nullptr || mProcessingSettings.debugLevel >= 2) { GPUInfo("GPU Tracker initialization successfull"); // Verbosity reduced because GPU backend will print GPUImportant message! } @@ -282,10 +147,6 @@ void* GPUReconstructionDeviceBase::GPUProcessorProcessors::SetPointersDeviceProc int32_t GPUReconstructionDeviceBase::ExitDevice() { - if (StopHelperThreads()) { - return (1); - } - int32_t retVal = ExitDevice_Runtime(); mProcessorsShadow = nullptr; mHostMemoryPool = mHostMemoryBase = mDeviceMemoryPool = mDeviceMemoryBase = mHostMemoryPoolEnd = mDeviceMemoryPoolEnd = mHostMemoryPermanent = mDeviceMemoryPermanent = nullptr; diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h index 215615f558442..1381fd0f76981 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h @@ -17,7 +17,6 @@ #include "GPUReconstructionCPU.h" #include -#include "GPUReconstructionHelpers.h" #include "GPUChain.h" #include @@ -61,24 +60,10 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU size_t GPUMemCpyAlways(bool onGpu, void* dst, const void* src, size_t size, int32_t stream, 
int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override; size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override = 0; - int32_t StartHelperThreads() override; - int32_t StopHelperThreads() override; - void RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) override; - int32_t HelperError(int32_t iThread) const override { return mHelperParams[iThread].error; } - int32_t HelperDone(int32_t iThread) const override { return mHelperParams[iThread].done; } - void WaitForHelperThreads() override; - void ResetHelperThreads(int32_t helpers) override; - void ResetThisHelperThread(GPUReconstructionHelpers::helperParam* par); - int32_t GetGlobalLock(void*& pLock); void ReleaseGlobalLock(void* sem); - static void* helperWrapper_static(void* arg); - void* helperWrapper(GPUReconstructionHelpers::helperParam* par); - - int32_t mDeviceId = -1; // Device ID used by backend - GPUReconstructionHelpers::helperParam* mHelperParams = nullptr; // Control Struct for helper threads - int32_t mNSlaveThreads = 0; // Number of slave threads currently active + int32_t mDeviceId = -1; // Device ID used by backend struct DebugEvents { deviceEvent DebugStart, DebugStop; // Debug timer events diff --git a/GPU/GPUTracking/Base/GPUReconstructionHelpers.h b/GPU/GPUTracking/Base/GPUReconstructionHelpers.h deleted file mode 100644 index c55e81905f32f..0000000000000 --- a/GPU/GPUTracking/Base/GPUReconstructionHelpers.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. 
-// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file GPUReconstructionHelpers.h -/// \author David Rohr - -#ifndef GPURECONSTRUCTIONHELPERS_H -#define GPURECONSTRUCTIONHELPERS_H - -#include - -namespace o2 -{ -namespace gpu -{ -class GPUReconstructionDeviceBase; -class GPUReconstructionHelpers -{ - public: - class helperDelegateBase - { - }; - - struct helperParam { - pthread_t threadId; - GPUReconstructionDeviceBase* cls; - int32_t num; - std::mutex mutex[2]; - int8_t terminate; - helperDelegateBase* functionCls; - int32_t (helperDelegateBase::*function)(int32_t, int32_t, helperParam*); - int32_t phase; - int32_t count; - volatile int32_t done; - volatile int8_t error; - volatile int8_t reset; - }; -}; -} // namespace gpu -} // namespace o2 - -#endif diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 5dd92d41db29b..6acc7fd1dd537 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -104,7 +104,6 @@ set(HDRS_INSTALL Base/GPUConstantMem.h Base/GPUParam.inc Base/GPUParamRTC.h - Base/GPUReconstructionHelpers.h Base/GPUReconstructionIncludes.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index c10793975453d..ca6f2f370300e 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -252,7 +252,6 @@ AddOption(registerStandaloneInputMemory, bool, false, "registerInputMemory", 0, AddOption(ompThreads, int32_t, -1, "omp", 't', "Number of OMP threads to run (-1: all)", min(-1), message("Using %s OMP threads")) 
AddOption(ompKernels, uint8_t, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels") AddOption(ompAutoNThreads, bool, true, "", 0, "Auto-adjust number of OMP threads, decreasing the number for small input data") -AddOption(nDeviceHelperThreads, int32_t, 1, "", 0, "Number of CPU helper threads for CPU processing") AddOption(nStreams, int8_t, 8, "", 0, "Number of GPU streams / command queues") AddOption(nTPCClustererLanes, int8_t, -1, "", 0, "Number of TPC clusterers that can run in parallel (-1 = autoset)") AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster max fragment len to a certain value (-1 = autodetect)") diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 06650f9d9c733..0981fea43810a 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -16,7 +16,6 @@ #define GPUCHAIN_H #include "GPUReconstructionCPU.h" -#include "GPUReconstructionHelpers.h" namespace o2 { @@ -111,12 +110,6 @@ class GPUChain } } inline void StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents = 1) { mRec->StreamWaitForEvents(stream, evList, nEvents); } - template - void RunHelperThreads(T function, GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count); - inline void WaitForHelperThreads() { mRec->WaitForHelperThreads(); } - inline int32_t HelperError(int32_t iThread) const { return mRec->HelperError(iThread); } - inline int32_t HelperDone(int32_t iThread) const { return mRec->HelperDone(iThread); } - inline void ResetHelperThreads(int32_t helpers) { mRec->ResetHelperThreads(helpers); } inline int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1) { return mRec->GPUDebug(state, stream); } // nEvents is forced to 0 if evList == nullptr inline void TransferMemoryResourceToGPU(RecoStep step, GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = 
nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { timeCpy(step, true, &GPUReconstructionCPU::TransferMemoryResourceToGPU, res, stream, ev, evList, nEvents); } @@ -242,12 +235,6 @@ class GPUChain void timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args); }; -template -inline void GPUChain::RunHelperThreads(T function, GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) -{ - mRec->RunHelperThreads((int32_t(GPUReconstructionHelpers::helperDelegateBase::*)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*))function, functionCls, count); -} - template inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args) { diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 6d6d82b518097..d827b095773b1 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -16,7 +16,6 @@ #define GPUCHAINTRACKING_H #include "GPUChain.h" -#include "GPUReconstructionHelpers.h" #include "GPUDataTypes.h" #include #include @@ -68,7 +67,7 @@ struct GPUTPCCFChainContext; struct GPUNewCalibValues; struct GPUTriggerOutputs; -class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelegateBase +class GPUChainTracking : public GPUChain { friend class GPUReconstruction; @@ -314,15 +313,11 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); - std::atomic_flag mLockAtomicOutputBuffer = ATOMIC_FLAG_INIT; std::mutex mMutexUpdateCalib; std::unique_ptr mPipelineFinalizationCtx; GPUChainTrackingFinalContext* mPipelineNotifyCtx = nullptr; std::function mWaitForFinalInputs; - int32_t HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par); - int32_t HelperOutput(int32_t iSlice, int32_t threadId, 
GPUReconstructionHelpers::helperParam* par); - int32_t OutputStream() const { return mRec->NStreams() - 2; } }; } // namespace gpu diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index 35a8c6c455048..174b3757d3307 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -55,9 +55,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices() if (retVal) { SynchronizeGPU(); } - if (retVal >= 2) { - ResetHelperThreads(retVal >= 3); - } return (retVal != 0); } @@ -114,9 +111,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() processorsShadow()->tpcTrackers[iSlice].SetGPUTextureBase(mRec->DeviceMemoryBase()); } - if (!doSliceDataOnGPU) { - RunHelperThreads(&GPUChainTracking::HelperReadEvent, this, NSLICES); - } if (PrepareTextures()) { return (2); } @@ -183,22 +177,12 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? 
nullptr : &mEvents->init}}); streamInit[useStream] = true; - } else if (!doGPU || iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) == 0) { + } else { if (ReadEvent(iSlice, 0)) { GPUError("Error reading event"); error = 1; continue; } - } else { - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Waiting for helper thread %d", iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1); - } - while (HelperDone(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1) < (int32_t)iSlice) { - } - if (HelperError(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1)) { - error = 1; - continue; - } } if (GetProcessingSettings().deterministicGPUReconstruction) { runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}}); @@ -297,9 +281,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() if (doGPU) { ReleaseEvent(mEvents->init); } - if (!doSliceDataOnGPU) { - WaitForHelperThreads(); - } if (!GetProcessingSettings().trackletSelectorInPipeline) { if (GetProcessingSettings().trackletConstructorInPipeline) { @@ -359,7 +340,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() if (param().rec.tpc.globalTracking) { mWriteOutputDone.fill(0); } - RunHelperThreads(&GPUChainTracking::HelperOutput, this, NSLICES); uint32_t tmpSlice = 0; for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { @@ -402,12 +382,12 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() } if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1)); + GPUInfo("Data ready for slice %d", iSlice); } mSliceSelectorReady = iSlice; if (param().rec.tpc.globalTracking) { - for (uint32_t tmpSlice2a = 0; tmpSlice2a <= iSlice; tmpSlice2a += GetProcessingSettings().nDeviceHelperThreads + 1) { + for (uint32_t tmpSlice2a = 0; tmpSlice2a <= iSlice; tmpSlice2a++) { uint32_t tmpSlice2 = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(tmpSlice2a); 
uint32_t sliceLeft, sliceRight; GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice2, sliceLeft, sliceRight); @@ -419,12 +399,9 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() } } } else { - if (iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) == 0) { - WriteOutput(iSlice, 0); - } + WriteOutput(iSlice, 0); } } - WaitForHelperThreads(); } if (!(GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) && param().rec.tpc.globalTracking) { std::vector blocking(NSLICES * mRec->NStreams()); @@ -518,43 +495,9 @@ void GPUChainTracking::WriteOutput(int32_t iSlice, int32_t threadId) if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Running WriteOutput for slice %d on thread %d\n", iSlice, threadId); } - if (GetProcessingSettings().nDeviceHelperThreads) { - while (mLockAtomicOutputBuffer.test_and_set(std::memory_order_acquire)) { - } - } processors()->tpcTrackers[iSlice].WriteOutputPrepare(); - if (GetProcessingSettings().nDeviceHelperThreads) { - mLockAtomicOutputBuffer.clear(); - } processors()->tpcTrackers[iSlice].WriteOutput(); if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Finished WriteOutput for slice %d on thread %d\n", iSlice, threadId); } } - -int32_t GPUChainTracking::HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) { return ReadEvent(iSlice, threadId); } - -int32_t GPUChainTracking::HelperOutput(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) -{ - if (param().rec.tpc.globalTracking) { - uint32_t tmpSlice = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(iSlice); - uint32_t sliceLeft, sliceRight; - GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice, sliceLeft, sliceRight); - - while (mSliceSelectorReady < (int32_t)tmpSlice || mSliceSelectorReady < (int32_t)sliceLeft || mSliceSelectorReady < (int32_t)sliceRight) { - if (par->reset) { - return 1; - } - } - GlobalTracking(tmpSlice, 0); - WriteOutput(tmpSlice, 0); - } else { - while 
(mSliceSelectorReady < iSlice) { - if (par->reset) { - return 1; - } - } - WriteOutput(iSlice, threadId); - } - return 0; -}