From 18682470758c8cde1a5dc8d4dfa79f426050be6a Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 3 Feb 2025 00:52:31 +0100 Subject: [PATCH 1/2] GPU: Remove obsolete code paths --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 14 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 - GPU/GPUTracking/Global/GPUChainTracking.cxx | 20 +- .../Global/GPUChainTrackingMerger.cxx | 73 +++---- .../Global/GPUChainTrackingSliceTracker.cxx | 9 +- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 200 +++++------------- GPU/GPUTracking/Merger/GPUTPCGMMerger.h | 6 - GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx | 6 +- GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx | 29 +-- .../SliceTracker/GPUTPCTracker.cxx | 4 +- .../Standalone/Benchmark/standalone.cxx | 4 +- GPU/GPUTracking/qa/GPUQA.cxx | 5 - 12 files changed, 109 insertions(+), 264 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 5df69c416e858..1496300818fd8 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -282,21 +282,9 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mProcessingSettings.nDeviceHelperThreads = 0; } - if (param().rec.nonConsecutiveIDs) { - param().rec.tpc.disableRefitAttachment = 0xFF; - } - if (!(mRecoSteps.stepsGPUMask & RecoStep::TPCMerging) || !param().rec.tpc.mergerReadFromTrackerDirectly) { - mProcessingSettings.fullMergerOnGPU = false; - } - if (mProcessingSettings.debugLevel > 3 || !IsGPU() || !mProcessingSettings.fullMergerOnGPU || mProcessingSettings.deterministicGPUReconstruction) { + if (mProcessingSettings.debugLevel > 3 || !IsGPU() || mProcessingSettings.deterministicGPUReconstruction) { mProcessingSettings.delayedOutput = false; } - if (!mProcessingSettings.fullMergerOnGPU && (GetRecoStepsGPU() & RecoStep::TPCMerging)) { - param().rec.tpc.looperInterpolationInExtraPass = 0; - if (param().rec.tpc.retryRefit == 1) { - param().rec.tpc.retryRefit = 2; - } - } 
UpdateAutomaticProcessingSettings(); GPUCA_GPUReconstructionUpdateDefaults(); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 76370c17f9f53..c10793975453d 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -149,7 +149,6 @@ AddOptionRTC(mergerInterpolateErrors, uint8_t, 1, "", 0, "Use interpolation inst AddOptionRTC(mergeCE, uint8_t, 1, "", 0, "Merge tracks accross the central electrode") AddOptionRTC(retryRefit, int8_t, 1, "", 0, "Retry refit with seeding errors and without cluster rejection when fit fails (=2 means retry in same kernel, =1 for separate kernel") AddOptionRTC(looperInterpolationInExtraPass, int8_t, -1, "", 0, "Perform looper interpolation in an extra pass") -AddOptionRTC(mergerReadFromTrackerDirectly, int8_t, 1, "", 0, "Forward data directly from tracker to merger on GPU") AddOptionRTC(dropSecondaryLegsInOutput, int8_t, 1, "", 0, "Do not store secondary legs of looping track in TrackTPC") AddOptionRTC(enablePID, int8_t, 1, "", 0, "Enable PID response") AddOptionRTC(PID_useNsigma, int8_t, 1, "", 0, "Use nSigma instead of absolute distance in PID response") @@ -188,7 +187,6 @@ EndConfig() BeginSubConfig(GPUSettingsRec, rec, configStandalone, "REC", 0, "Reconstruction settings", rec) AddOptionRTC(maxTrackQPtB5, float, 1.f / GPUCA_MIN_TRACK_PTB5_DEFAULT, "", 0, "required max Q/Pt (==min Pt) of tracks") -AddOptionRTC(nonConsecutiveIDs, int8_t, false, "", 0, "Non-consecutive cluster IDs as in HLT, disables features that need access to slice data in TPC merger") AddOptionRTC(fwdTPCDigitsAsClusters, uint8_t, 0, "", 0, "Forward TPC digits as clusters (if they pass the ZS threshold)") AddOptionRTC(bz0Pt10MeV, uint8_t, 60, "", 0, "Nominal Pt to set when bz = 0 (in 10 MeV)") AddOptionRTC(fitInProjections, int8_t, -1, "", 0, "Fit in projection, -1 to enable full fit for all but passes but the first one") @@ -261,7 +259,6 @@ 
AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster AddOption(trackletSelectorSlices, int8_t, -1, "", 0, "Number of slices to processes in parallel at max") AddOption(trackletConstructorInPipeline, int8_t, -1, "", 0, "Run tracklet constructor in the pipeline") AddOption(trackletSelectorInPipeline, int8_t, -1, "", 0, "Run tracklet selector in the pipeline") -AddOption(fullMergerOnGPU, bool, true, "", 0, "Perform full TPC track merging on GPU instead of only refit") AddOption(delayedOutput, bool, true, "", 0, "Delay output to be parallel to track fit") AddOption(mergerSortTracks, int8_t, -1, "", 0, "Sort track indizes for GPU track fit") AddOption(alternateBorderSort, int8_t, -1, "", 0, "Alternative implementation for sorting of border tracks") diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 1aa5f9ca0dad8..889e12c258cb4 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -185,12 +185,8 @@ bool GPUChainTracking::ValidateSteps() GPUError("Invalid input, TPC Clusterizer needs TPC raw input"); return false; } - if (param().rec.tpc.mergerReadFromTrackerDirectly && (GetRecoSteps() & GPUDataTypes::RecoStep::TPCMerging) && ((GetRecoStepsInputs() & GPUDataTypes::InOutType::TPCSectorTracks) || (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) || !(GetRecoSteps() & GPUDataTypes::RecoStep::TPCConversion))) { - GPUError("Invalid input / output / step, mergerReadFromTrackerDirectly cannot read/store sectors tracks and needs TPC conversion"); - return false; - } - if (!GetProcessingSettings().fullMergerOnGPU && (param().rec.tpc.mergerReadFromTrackerDirectly || GetProcessingSettings().createO2Output) && (GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging)) { - GPUError("createO2Output and mergerReadFromTrackerDirectly works only in combination with fullMergerOnGPU if the merger is to run on GPU"); + if 
((GetRecoSteps() & GPUDataTypes::RecoStep::TPCMerging) && ((GetRecoStepsInputs() & GPUDataTypes::InOutType::TPCSectorTracks) || (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) || !(GetRecoSteps() & GPUDataTypes::RecoStep::TPCConversion))) { + GPUError("Invalid input / output / step, merger cannot read/store sector tracks and needs TPC conversion"); return false; } bool tpcClustersAvail = (GetRecoStepsInputs() & GPUDataTypes::InOutType::TPCClusters) || (GetRecoSteps() & GPUDataTypes::RecoStep::TPCClusterFinding) || (GetRecoSteps() & GPUDataTypes::RecoStep::TPCDecompression); @@ -265,14 +261,6 @@ bool GPUChainTracking::ValidateSettings() GPUError("Cannot do error interpolation with NWays = 1!"); return false; } - if ((param().rec.tpc.mergerReadFromTrackerDirectly || !param().par.earlyTpcTransform) && param().rec.nonConsecutiveIDs) { - GPUError("incompatible settings for non consecutive ids"); - return false; - } - if (!param().rec.tpc.mergerReadFromTrackerDirectly && GetProcessingSettings().ompKernels) { - GPUError("OMP Kernels require mergerReadFromTrackerDirectly"); - return false; - } if (param().continuousMaxTimeBin > (int32_t)GPUSettings::TPC_MAX_TF_TIME_BIN) { GPUError("configured max time bin exceeds 256 orbits"); return false; } @@ -743,10 +731,6 @@ int32_t GPUChainTracking::RunChain() return 1; } - for (uint32_t i = 0; i < NSLICES; i++) { - // GPUInfo("slice %d clusters %d tracks %d", i, mClusterData[i].NumberOfClusters(), processors()->tpcTrackers[i].Output()->NTracks()); - processors()->tpcMerger.SetSliceData(i, param().rec.tpc.mergerReadFromTrackerDirectly ?
nullptr : processors()->tpcTrackers[i].Output()); - } if (runRecoStep(RecoStep::TPCMerging, &GPUChainTracking::RunTPCTrackingMerger, false)) { return 1; } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index 8dd5140db6952..0831b260f881d 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -24,14 +24,14 @@ using namespace o2::gpu; void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType) { GPUTPCGMMerger& Merger = processors()->tpcMerger; - bool doGPUall = GetRecoStepsGPU() & RecoStep::TPCMerging && GetProcessingSettings().fullMergerOnGPU; - GPUTPCGMMerger& MergerShadow = doGPUall ? processorsShadow()->tpcMerger : Merger; + bool doGPU = GetRecoStepsGPU() & RecoStep::TPCMerging; + GPUTPCGMMerger& MergerShadow = doGPU ? processorsShadow()->tpcMerger : Merger; if (GetProcessingSettings().deterministicGPUReconstruction) { uint32_t nBorderTracks = withinSlice == 1 ? NSLICES : (2 * NSLICES); runKernel({{nBorderTracks, -WarpSize(), 0, deviceType}}, 0); } uint32_t n = withinSlice == -1 ? 
NSLICES / 2 : NSLICES; - if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPUall)) { + if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPU)) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->init); RecordMarker(&mEvents->single, 0); for (uint32_t i = 0; i < n; i++) { @@ -72,7 +72,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice runKernel(GetGridAuto(0, deviceType), i, withinSlice, mergeMode); } } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile, withinSlice, mergeMode); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile, withinSlice, mergeMode); mRec->ReturnVolatileDeviceMemory(); } @@ -89,12 +89,11 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) { mRec->PushNonPersistentMemory(qStr2Tag("TPCMERGE")); bool doGPU = GetRecoStepsGPU() & RecoStep::TPCMerging; - bool doGPUall = doGPU && GetProcessingSettings().fullMergerOnGPU; - GPUReconstruction::krnlDeviceType deviceType = doGPUall ? GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU; - uint32_t numBlocks = (!mRec->IsGPU() || doGPUall) ? BlockCount() : 1; + GPUReconstruction::krnlDeviceType deviceType = doGPU ? GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU; + uint32_t numBlocks = (!mRec->IsGPU() || doGPU) ? BlockCount() : 1; GPUTPCGMMerger& Merger = processors()->tpcMerger; GPUTPCGMMerger& MergerShadow = doGPU ? processorsShadow()->tpcMerger : Merger; - GPUTPCGMMerger& MergerShadowAll = doGPUall ? processorsShadow()->tpcMerger : Merger; + GPUTPCGMMerger& MergerShadowAll = doGPU ? 
processorsShadow()->tpcMerger : Merger; const int32_t outputStream = OutputStream(); if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Running TPC Merger"); @@ -112,7 +111,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) memset(Merger.Memory(), 0, sizeof(*Merger.Memory())); WriteToConstantMemory(RecoStep::TPCMerging, (char*)&processors()->tpcMerger - (char*)processors(), &MergerShadow, sizeof(MergerShadow), 0); - if (doGPUall) { + if (doGPU) { TransferMemoryResourcesToGPU(RecoStep::TPCMerging, &Merger, 0); } @@ -136,14 +135,14 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) if (GetProcessingSettings().deterministicGPUReconstruction) { runKernel({{GPUCA_NSLICES, -WarpSize(), 0, deviceType}}, 1); } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpSliceTracks, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpSliceTracks, *mDebugFile); runKernel(GetGridAuto(0, deviceType), false); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), NSLICES * sizeof(*MergerShadowAll.TmpCounter())); runKernel(GetGridAuto(0, deviceType)); RunTPCTrackingMerger_MergeBorderTracks(1, 0, deviceType); RunTPCTrackingMerger_Resolve(0, 1, deviceType); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergedWithinSlices, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergedWithinSlices, *mDebugFile); runKernel(GetGridAuto(0, deviceType), false); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSLICES * sizeof(*MergerShadowAll.TmpCounter())); @@ -158,7 +157,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridBlk(std::max(2u, numBlocks), 0, deviceType), 0, 1, 1); RunTPCTrackingMerger_MergeBorderTracks(0, -1, deviceType); 
RunTPCTrackingMerger_Resolve(0, 1, deviceType); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergedBetweenSlices, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergedBetweenSlices, *mDebugFile); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSLICES * sizeof(*MergerShadowAll.TmpCounter())); @@ -168,17 +167,17 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel({{1, -WarpSize(), 0, deviceType}}, 1); runKernel({{1, -WarpSize(), 0, deviceType}}, 1); } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile); if (param().rec.tpc.mergeCE) { runKernel(GetGridAuto(0, deviceType), true); RunTPCTrackingMerger_MergeBorderTracks(-1, 1, deviceType); RunTPCTrackingMerger_MergeBorderTracks(-1, 2, deviceType); runKernel(GetGridAuto(0, deviceType)); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile); } int32_t waitForTransfer = 0; - if (doGPUall) { + if (doGPU) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->single); waitForTransfer = 1; } @@ -189,23 +188,21 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridAuto(0, deviceType)); } - uint32_t maxId = param().rec.nonConsecutiveIDs ? 
Merger.Memory()->nOutputTrackClusters : Merger.NMaxClusters(); + uint32_t maxId = Merger.NMaxClusters(); if (maxId > Merger.NMaxClusters()) { throw std::runtime_error("mNMaxClusters too small"); } - if (!param().rec.nonConsecutiveIDs) { - runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.SharedCount(), maxId * sizeof(*MergerShadowAll.SharedCount())); - runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.ClusterAttachment(), maxId * sizeof(*MergerShadowAll.ClusterAttachment())); - runKernel(GetGridAuto(0, deviceType)); - CondWaitEvent(waitForTransfer, &mEvents->single); - runKernel(GetGridAuto(0, deviceType)); - runKernel(GetGridAuto(0, deviceType)); - runKernel(GetGridAuto(0, deviceType)); - } + runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.SharedCount(), maxId * sizeof(*MergerShadowAll.SharedCount())); + runKernel({{numBlocks, -ThreadCount(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.ClusterAttachment(), maxId * sizeof(*MergerShadowAll.ClusterAttachment())); + runKernel(GetGridAuto(0, deviceType)); + CondWaitEvent(waitForTransfer, &mEvents->single); + runKernel(GetGridAuto(0, deviceType)); + runKernel(GetGridAuto(0, deviceType)); + runKernel(GetGridAuto(0, deviceType)); - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile); - if (doGPUall) { + if (doGPU) { CondWaitEvent(waitForTransfer, &mEvents->single); if (waitForTransfer) { ReleaseEvent(mEvents->single); @@ -228,29 +225,23 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) if (param().rec.tpc.looperInterpolationInExtraPass) { runKernel(GetGridAuto(0)); } - if (doGPU && !doGPUall) { - TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, 0); - SynchronizeStream(0); - } 
DoDebugAndDump(RecoStep::TPCMerging, 2048, Merger, &GPUTPCGMMerger::DumpRefit, *mDebugFile); runKernel(GetGridAuto(0, deviceType)); - if (!param().rec.nonConsecutiveIDs) { - runKernel(GetGridAuto(0, deviceType)); - runKernel(GetGridAuto(0, deviceType)); - } + runKernel(GetGridAuto(0, deviceType)); + runKernel(GetGridAuto(0, deviceType)); if (param().rec.tpc.mergeLoopersAfterburner) { - runKernel(doGPUall ? GetGrid(Merger.NOutputTracks(), 0, deviceType) : GetGridAuto(0, deviceType)); + runKernel(doGPU ? GetGrid(Merger.NOutputTracks(), 0, deviceType) : GetGridAuto(0, deviceType)); if (doGPU) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0); SynchronizeStream(0); // TODO: could probably synchronize on an event after runKernel } runKernel(GetGridAuto(0, deviceType)); - runKernel(doGPUall ? GetGrid(Merger.Memory()->nLooperMatchCandidates, 0, deviceType) : GetGridAuto(0, deviceType)); + runKernel(doGPU ? GetGrid(Merger.Memory()->nLooperMatchCandidates, 0, deviceType) : GetGridAuto(0, deviceType)); } - DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile); - if (doGPUall) { + if (doGPU) { RecordMarker(&mEvents->single, 0); auto* waitEvent = &mEvents->single; if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1 || mFractionalQAEnabled) { @@ -302,7 +293,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->single); runKernel(GetGridAuto(0, deviceType)); mRec->ReturnVolatileDeviceMemory(); - SynchronizeEventAndRelease(mEvents->single, doGPUall); + SynchronizeEventAndRelease(mEvents->single, doGPU); if (GetProcessingSettings().clearO2OutputFromGPU) { mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile @@ 
-316,7 +307,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) AllocateRegisteredMemory(Merger.MemoryResOutputO2MC(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2Labels)]); TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, -1, true); runKernel(GetGridAuto(0, GPUReconstruction::krnlDeviceType::CPU)); - } else if (doGPUall) { + } else if (doGPU) { RecordMarker(&mEvents->single, 0); TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2(), outputStream, nullptr, &mEvents->single); TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2Clus(), outputStream); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index ba6ba03fca8a1..35a8c6c455048 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -30,11 +30,8 @@ int32_t GPUChainTracking::GlobalTracking(uint32_t iSlice, int32_t threadId, bool GPUInfo("GPU Tracker running Global Tracking for slice %u on thread %d\n", iSlice, threadId); } - GPUReconstruction::krnlDeviceType deviceType = GetProcessingSettings().fullMergerOnGPU ? 
GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU; - runKernel({GetGridBlk(256, iSlice % mRec->NStreams(), deviceType), {iSlice}}); - if (GetProcessingSettings().fullMergerOnGPU) { - TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, processors()->tpcTrackers[iSlice].MemoryResCommon(), iSlice % mRec->NStreams()); - } + runKernel({GetGridBlk(256, iSlice % mRec->NStreams()), {iSlice}}); + TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, processors()->tpcTrackers[iSlice].MemoryResCommon(), iSlice % mRec->NStreams()); if (synchronizeOutput) { SynchronizeStream(iSlice % mRec->NStreams()); } @@ -450,7 +447,7 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() blocking[tmpSlice * mRec->NStreams() + sliceRight % mRec->NStreams()] = true; } } - GlobalTracking(tmpSlice, 0, !GetProcessingSettings().fullMergerOnGPU); + GlobalTracking(tmpSlice, 0, false); } } for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index fab4469eeb488..60dd18a254904 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -81,7 +81,7 @@ struct MergeLooperParam { #include "GPUMemorySizeScalers.h" GPUTPCGMMerger::GPUTPCGMMerger() - : mTrackLinks(nullptr), mNTotalSliceTracks(0), mNMaxTracks(0), mNMaxSingleSliceTracks(0), mNMaxOutputTrackClusters(0), mNMaxClusters(0), mMemoryResMemory(-1), mNClusters(0), mOutputTracks(nullptr), mSliceTrackInfos(nullptr), mSliceTrackInfoIndex(nullptr), mClusters(nullptr), mClustersXYZ(nullptr), mGlobalClusterIDs(nullptr), mClusterAttachment(nullptr), mOutputTracksTPCO2(nullptr), mOutputClusRefsTPCO2(nullptr), mOutputTracksTPCO2MC(nullptr), mTrackOrderAttach(nullptr), mTrackOrderProcess(nullptr), mBorderMemory(nullptr), mBorderRangeMemory(nullptr), mMemory(nullptr), mRetryRefitIds(nullptr), mLoopData(nullptr) + : mTrackLinks(nullptr), mNTotalSliceTracks(0), 
mNMaxTracks(0), mNMaxSingleSliceTracks(0), mNMaxOutputTrackClusters(0), mNMaxClusters(0), mMemoryResMemory(-1), mNClusters(0), mOutputTracks(nullptr), mSliceTrackInfos(nullptr), mSliceTrackInfoIndex(nullptr), mClusters(nullptr), mClustersXYZ(nullptr), mClusterAttachment(nullptr), mOutputTracksTPCO2(nullptr), mOutputClusRefsTPCO2(nullptr), mOutputTracksTPCO2MC(nullptr), mTrackOrderAttach(nullptr), mTrackOrderProcess(nullptr), mBorderMemory(nullptr), mBorderRangeMemory(nullptr), mMemory(nullptr), mRetryRefitIds(nullptr), mLoopData(nullptr) { //* constructor @@ -95,10 +95,6 @@ GPUTPCGMMerger::GPUTPCGMMerger() mPrevSliceInd[0] = mid; mNextSliceInd[last] = NSLICES / 2; mPrevSliceInd[NSLICES / 2] = last; - - for (int32_t i = 0; i < NSLICES; i++) { - mkSlices[i] = nullptr; - } } // DEBUG CODE @@ -180,13 +176,9 @@ int64_t GPUTPCGMMerger::GetTrackLabelA(const S& trk) const for (int32_t i = 0; i < nClusters; i++) { int32_t id; if constexpr (std::is_same::value) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[sliceTrack->Slice()]; - const GPUTPCHitId& ic = tracker.TrackHits()[sliceTrack->OrigTrack()->FirstHitID() + i]; - id = tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[sliceTrack->Slice()][0]; - } else { - id = sliceTrack->OrigTrack()->OutTrackClusters()[i].GetId(); - } + const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[sliceTrack->Slice()]; + const GPUTPCHitId& ic = tracker.TrackHits()[sliceTrack->OrigTrack()->FirstHitID() + i]; + id = tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[sliceTrack->Slice()][0]; } else { id = mClusters[trk.FirstClusterRef() + i].num; } @@ -251,9 +243,6 @@ void* GPUTPCGMMerger::SetPointersMerger(void* mem) { computePointerWithAlignment(mem, mSliceTrackInfos, mNTotalSliceTracks); 
computePointerWithAlignment(mem, mSliceTrackInfoIndex, NSLICES * 2 + 1); - if (mRec->GetParam().rec.nonConsecutiveIDs) { - computePointerWithAlignment(mem, mGlobalClusterIDs, mNMaxOutputTrackClusters); - } if (mRec->GetProcessingSettings().deterministicGPUReconstruction) { computePointerWithAlignment(mem, mTmpSortMemory, std::max(mNTotalSliceTracks, mNMaxTracks * 2)); } @@ -263,7 +252,7 @@ void* GPUTPCGMMerger::SetPointersMerger(void* mem) computePointerWithAlignment(mem, mBorderRangeMemory, 2 * mNTotalSliceTracks); int32_t nTracks = 0; for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - const int32_t n = mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly ? *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks() : mkSlices[iSlice]->NTracks(); + const int32_t n = *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks(); mBorder[iSlice] = mBorderMemory + 2 * nTracks; mBorder[NSLICES + iSlice] = mBorderMemory + 2 * nTracks + n; mBorderRange[iSlice] = mBorderRangeMemory + 2 * nTracks; @@ -296,14 +285,6 @@ void* GPUTPCGMMerger::SetPointersMemory(void* mem) } void* GPUTPCGMMerger::SetPointersRefitScratch(void* mem) -{ - if (mRec->GetProcessingSettings().fullMergerOnGPU) { - mem = SetPointersRefitScratch2(mem); - } - return mem; -} - -void* GPUTPCGMMerger::SetPointersRefitScratch2(void* mem) { computePointerWithAlignment(mem, mTrackOrderAttach, mNMaxTracks); if (mRec->GetProcessingSettings().mergerSortTracks) { @@ -323,9 +304,6 @@ void* GPUTPCGMMerger::SetPointersOutput(void* mem) computePointerWithAlignment(mem, mClustersXYZ, mNMaxOutputTrackClusters); } computePointerWithAlignment(mem, mClusterAttachment, mNMaxClusters); - if (!mRec->GetProcessingSettings().fullMergerOnGPU) { - mem = SetPointersRefitScratch2(mem); - } return mem; } @@ -367,10 +345,10 @@ void* GPUTPCGMMerger::SetPointersOutputO2Scratch(void* mem) void GPUTPCGMMerger::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersMerger, 
(mRec->GetProcessingSettings().fullMergerOnGPU ? 0 : GPUMemoryResource::MEMORY_HOST) | GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCMerger"); + mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersMerger, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCMerger"); mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersRefitScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCMergerRefitScratch"); - mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutput, (mRec->GetProcessingSettings().fullMergerOnGPU ? (mRec->GetProcessingSettings().createO2Output > 1 ? GPUMemoryResource::MEMORY_SCRATCH : GPUMemoryResource::MEMORY_OUTPUT) : GPUMemoryResource::MEMORY_INOUT) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutput"); - mMemoryResOutputState = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputState, (mRec->GetProcessingSettings().fullMergerOnGPU ? (mRec->GetProcessingSettings().outputSharedClusterMap ? GPUMemoryResource::MEMORY_OUTPUT : GPUMemoryResource::MEMORY_GPU) : GPUMemoryResource::MEMORY_HOST) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputState"); + mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutput, (mRec->GetProcessingSettings().createO2Output > 1 ? GPUMemoryResource::MEMORY_SCRATCH : GPUMemoryResource::MEMORY_OUTPUT) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutput"); + mMemoryResOutputState = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputState, (mRec->GetProcessingSettings().outputSharedClusterMap ? 
GPUMemoryResource::MEMORY_OUTPUT : GPUMemoryResource::MEMORY_GPU) | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputState"); if (mRec->GetProcessingSettings().createO2Output) { mMemoryResOutputO2Scratch = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputO2Scratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputO2Scratch"); mMemoryResOutputO2 = mRec->RegisterMemoryAllocation(this, &GPUTPCGMMerger::SetPointersOutputO2, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM, "TPCMergerOutputO2"); @@ -388,9 +366,9 @@ void GPUTPCGMMerger::SetMaxData(const GPUTrackingInOutPointers& io) mNClusters = 0; mNMaxSingleSliceTracks = 0; for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - uint32_t ntrk = mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly ? *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks() : mkSlices[iSlice]->NTracks(); + uint32_t ntrk = *mRec->GetConstantMem().tpcTrackers[iSlice].NTracks(); mNTotalSliceTracks += ntrk; - mNClusters += mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly ? *mRec->GetConstantMem().tpcTrackers[iSlice].NTrackHits() : mkSlices[iSlice]->NTrackClusters(); + mNClusters += *mRec->GetConstantMem().tpcTrackers[iSlice].NTrackHits(); if (mNMaxSingleSliceTracks < ntrk) { mNMaxSingleSliceTracks = ntrk; } @@ -417,12 +395,12 @@ void GPUTPCGMMerger::SetMaxData(const GPUTrackingInOutPointers& io) int32_t GPUTPCGMMerger::CheckSlices() { for (int32_t i = 0; i < NSLICES; i++) { - if ((Param().rec.tpc.mergerReadFromTrackerDirectly ? 
mRec->GetConstantMem().tpcTrackers[i].CommonMemory()->nLocalTracks : mkSlices[i]->NLocalTracks()) > mNMaxSingleSliceTracks) { + if (mRec->GetConstantMem().tpcTrackers[i].CommonMemory()->nLocalTracks > (int32_t)mNMaxSingleSliceTracks) { throw std::runtime_error("mNMaxSingleSliceTracks too small"); } } - if (!(mRec->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking) && (!Param().rec.nonConsecutiveIDs || Param().rec.tpc.mergerReadFromTrackerDirectly)) { - throw std::runtime_error("Must run also slice tracking if nonConsecutiveIDs = false or mergerReadFromTrackerDirectly"); + if (!(mRec->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking)) { + throw std::runtime_error("Must run also slice tracking"); } return 0; } @@ -469,32 +447,18 @@ GPUd() int32_t GPUTPCGMMerger::RefitSliceTrack(GPUTPCGMSliceTrack& sliceTrack, c for (int32_t i = start; i != end; i += incr) { float x, y, z; int32_t row, flags; - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[slice]; - const GPUTPCHitId& ic = tracker.TrackHits()[inTrack->FirstHitID() + i]; - int32_t clusterIndex = tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()); - row = ic.RowIndex(); - const ClusterNative& cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear[GetConstantMem()->ioPtrs.clustersNative->clusterOffset[slice][0] + clusterIndex]; - flags = cl.getFlags(); - if (Param().par.earlyTpcTransform) { - x = tracker.Data().ClusterData()[clusterIndex].x; - y = tracker.Data().ClusterData()[clusterIndex].y; - z = tracker.Data().ClusterData()[clusterIndex].z - trk.TZOffset(); - } else { - GetConstantMem()->calibObjects.fastTransformHelper->Transform(slice, row, cl.getPad(), cl.getTime(), x, y, z, trk.TZOffset()); - } + const GPUTPCTracker& tracker = GetConstantMem()->tpcTrackers[slice]; + const GPUTPCHitId& ic = tracker.TrackHits()[inTrack->FirstHitID() + i]; + int32_t clusterIndex = 
tracker.Data().ClusterDataIndex(tracker.Data().Row(ic.RowIndex()), ic.HitIndex()); + row = ic.RowIndex(); + const ClusterNative& cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear[GetConstantMem()->ioPtrs.clustersNative->clusterOffset[slice][0] + clusterIndex]; + flags = cl.getFlags(); + if (Param().par.earlyTpcTransform) { + x = tracker.Data().ClusterData()[clusterIndex].x; + y = tracker.Data().ClusterData()[clusterIndex].y; + z = tracker.Data().ClusterData()[clusterIndex].z - trk.TZOffset(); } else { - const GPUTPCSliceOutCluster& clo = inTrack->OutTrackCluster(i); - row = clo.GetRow(); - flags = clo.GetFlags(); - if (Param().par.earlyTpcTransform) { - x = clo.GetX(); - y = clo.GetY(); - z = clo.GetZ() - trk.TZOffset(); - } else { - const ClusterNative& cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear[clo.GetId()]; - GetConstantMem()->calibObjects.fastTransformHelper->Transform(slice, row, cl.getPad(), cl.getTime(), x, y, z, trk.TZOffset()); - } + GetConstantMem()->calibObjects.fastTransformHelper->Transform(slice, row, cl.getPad(), cl.getTime(), x, y, z, trk.TZOffset()); } if (prop.PropagateToXAlpha(x, alpha, true)) { return way == 0; @@ -516,25 +480,16 @@ GPUd() int32_t GPUTPCGMMerger::RefitSliceTrack(GPUTPCGMSliceTrack& sliceTrack, c GPUd() void GPUTPCGMMerger::SetTrackClusterZT(GPUTPCGMSliceTrack& track, int32_t iSlice, const GPUTPCTrack* sliceTr) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; - const GPUTPCHitId& ic1 = trk.TrackHits()[sliceTr->FirstHitID()]; - const GPUTPCHitId& ic2 = trk.TrackHits()[sliceTr->FirstHitID() + sliceTr->NHits() - 1]; - int32_t clusterIndex1 = trk.Data().ClusterDataIndex(trk.Data().Row(ic1.RowIndex()), ic1.HitIndex()); - int32_t clusterIndex2 = trk.Data().ClusterDataIndex(trk.Data().Row(ic2.RowIndex()), ic2.HitIndex()); - if (Param().par.earlyTpcTransform) { - track.SetClusterZT(trk.Data().ClusterData()[clusterIndex1].z, 
trk.Data().ClusterData()[clusterIndex2].z); - } else { - const ClusterNative* cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; - track.SetClusterZT(cl[clusterIndex1].getTime(), cl[clusterIndex2].getTime()); - } + const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; + const GPUTPCHitId& ic1 = trk.TrackHits()[sliceTr->FirstHitID()]; + const GPUTPCHitId& ic2 = trk.TrackHits()[sliceTr->FirstHitID() + sliceTr->NHits() - 1]; + int32_t clusterIndex1 = trk.Data().ClusterDataIndex(trk.Data().Row(ic1.RowIndex()), ic1.HitIndex()); + int32_t clusterIndex2 = trk.Data().ClusterDataIndex(trk.Data().Row(ic2.RowIndex()), ic2.HitIndex()); + if (Param().par.earlyTpcTransform) { + track.SetClusterZT(trk.Data().ClusterData()[clusterIndex1].z, trk.Data().ClusterData()[clusterIndex2].z); } else { - if (Param().par.earlyTpcTransform) { - track.SetClusterZT(sliceTr->OutTrackClusters()->GetZ(), (sliceTr->OutTrackClusters() + sliceTr->NHits() - 1)->GetZ()); - } else { - const ClusterNative* cls = mConstantMem->ioPtrs.clustersNative->clustersLinear; - track.SetClusterZT(cls[sliceTr->OutTrackClusters()->GetId()].getTime(), cls[(sliceTr->OutTrackClusters() + sliceTr->NHits() - 1)->GetId()].getTime()); - } + const ClusterNative* cl = GetConstantMem()->ioPtrs.clustersNative->clustersLinear + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; + track.SetClusterZT(cl[clusterIndex1].getTime(), cl[clusterIndex2].getTime()); } } @@ -548,14 +503,10 @@ GPUd() void GPUTPCGMMerger::UnpackSliceGlobal(int32_t nBlocks, int32_t nThreads, const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; float alpha = Param().Alpha(iSlice); const GPUTPCTrack* sliceTr = mMemory->firstGlobalTracks[iSlice]; - uint32_t nLocalTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? 
trk.CommonMemory()->nLocalTracks : mkSlices[iSlice]->NLocalTracks(); - uint32_t nTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? *trk.NTracks() : mkSlices[iSlice]->NTracks(); + uint32_t nLocalTracks = trk.CommonMemory()->nLocalTracks; + uint32_t nTracks = *trk.NTracks(); for (uint32_t itr = nLocalTracks + iBlock * nThreads + iThread; itr < nTracks; itr += nBlocks * nThreads) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - sliceTr = &trk.Tracks()[itr]; - } else if (itr > nLocalTracks) { - sliceTr = sliceTr->GetNextTrack(); - } + sliceTr = &trk.Tracks()[itr]; int32_t localId = mTrackIDs[(sliceTr->LocalTrackId() >> 24) * mNMaxSingleSliceTracks + (sliceTr->LocalTrackId() & 0xFFFFFF)]; if (localId == -1) { continue; @@ -576,7 +527,7 @@ GPUd() void GPUTPCGMMerger::UnpackSliceGlobal(int32_t nBlocks, int32_t nThreads, GPUd() void GPUTPCGMMerger::UnpackResetIds(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, int32_t iSlice) { const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; - uint32_t nLocalTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? trk.CommonMemory()->nLocalTracks : mkSlices[iSlice]->NLocalTracks(); + uint32_t nLocalTracks = trk.CommonMemory()->nLocalTracks; for (uint32_t i = iBlock * nThreads + iThread; i < nLocalTracks; i += nBlocks * nThreads) { mTrackIDs[iSlice * mNMaxSingleSliceTracks + i] = -1; } @@ -585,17 +536,13 @@ GPUd() void GPUTPCGMMerger::UnpackResetIds(int32_t nBlocks, int32_t nThreads, in GPUd() void GPUTPCGMMerger::RefitSliceTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, int32_t iSlice) { const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[iSlice]; - uint32_t nLocalTracks = Param().rec.tpc.mergerReadFromTrackerDirectly ? 
trk.CommonMemory()->nLocalTracks : mkSlices[iSlice]->NLocalTracks(); + uint32_t nLocalTracks = trk.CommonMemory()->nLocalTracks; float alpha = Param().Alpha(iSlice); - const GPUTPCTrack* sliceTr = Param().rec.tpc.mergerReadFromTrackerDirectly ? nullptr : mkSlices[iSlice]->GetFirstTrack(); + const GPUTPCTrack* sliceTr = nullptr; for (uint32_t itr = iBlock * nThreads + iThread; itr < nLocalTracks; itr += nBlocks * nThreads) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - sliceTr = &trk.Tracks()[itr]; - } else if (itr) { - sliceTr = sliceTr->GetNextTrack(); - } + sliceTr = &trk.Tracks()[itr]; GPUTPCGMSliceTrack track; SetTrackClusterZT(track, iSlice, sliceTr); if (Param().rec.tpc.mergerCovSource == 0) { @@ -626,9 +573,6 @@ GPUd() void GPUTPCGMMerger::RefitSliceTracks(int32_t nBlocks, int32_t nThreads, mTrackIDs[iSlice * mNMaxSingleSliceTracks + sliceTr->LocalTrackId()] = myTrack; mSliceTrackInfos[myTrack] = track; } - if (!Param().rec.tpc.mergerReadFromTrackerDirectly) { - mMemory->firstGlobalTracks[iSlice] = nLocalTracks ? sliceTr->GetNextTrack() : mkSlices[iSlice]->GetFirstTrack(); - } } GPUd() void GPUTPCGMMerger::LinkGlobalTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) @@ -730,7 +674,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<0>(int32_t nBlocks, int32_t nThrea { CADEBUG(GPUInfo("\nMERGING Slices %d %d NTracks %d %d CROSS %d", iSlice1, iSlice2, N1, N2, mergeMode)); GPUTPCGMBorderRange* range1 = mBorderRange[iSlice1]; - GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + (Param().rec.tpc.mergerReadFromTrackerDirectly ? 
*GetConstantMem()->tpcTrackers[iSlice2].NTracks() : mkSlices[iSlice2]->NTracks()); + GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + *GetConstantMem()->tpcTrackers[iSlice2].NTracks(); bool sameSlice = (iSlice1 == iSlice2); for (int32_t itr = iBlock * nThreads + iThread; itr < N1; itr += nThreads * nBlocks) { GPUTPCGMBorderTrack& b = B1[itr]; @@ -774,7 +718,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<1>(int32_t nBlocks, int32_t nThrea { #if !defined(GPUCA_GPUCODE_COMPILEKERNELS) GPUTPCGMBorderRange* range1 = mBorderRange[iSlice1]; - GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + (Param().rec.tpc.mergerReadFromTrackerDirectly ? *GetConstantMem()->tpcTrackers[iSlice2].NTracks() : mkSlices[iSlice2]->NTracks()); + GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + *GetConstantMem()->tpcTrackers[iSlice2].NTracks(); if (iThread == 0) { if (iBlock == 0) { @@ -864,7 +808,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<2>(int32_t nBlocks, int32_t nThrea bool sameSlice = (iSlice1 == iSlice2); GPUTPCGMBorderRange* range1 = mBorderRange[iSlice1]; - GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + (Param().rec.tpc.mergerReadFromTrackerDirectly ? 
*GetConstantMem()->tpcTrackers[iSlice2].NTracks() : mkSlices[iSlice2]->NTracks()); + GPUTPCGMBorderRange* range2 = mBorderRange[iSlice2] + *GetConstantMem()->tpcTrackers[iSlice2].NTracks(); int32_t i2 = 0; for (int32_t i1 = iBlock * nThreads + iThread; i1 < N1; i1 += nThreads * nBlocks) { @@ -1326,10 +1270,6 @@ GPUd() void GPUTPCGMMerger::ResolveMergeSlices(GPUResolveSharedMemory& smem, int GPUd() void GPUTPCGMMerger::MergeCEFill(const GPUTPCGMSliceTrack* track, const GPUTPCGMMergedTrackHit& cls, const GPUTPCGMMergedTrackHitXYZ* clsXYZ, int32_t itr) { - if (Param().rec.nonConsecutiveIDs) { - return; - } - if (Param().rec.tpc.mergerCERowLimit > 0 && CAMath::Abs(track->QPt()) * Param().qptB5Scaler < 0.3f && (cls.row < Param().rec.tpc.mergerCERowLimit || cls.row >= GPUCA_ROW_COUNT - Param().rec.tpc.mergerCERowLimit)) { return; } @@ -1646,16 +1586,10 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread int32_t nTrackHits = t->NClusters(); trackCluster* c2 = trackClusters + nHits + nTrackHits - 1; for (int32_t i = 0; i < nTrackHits; i++, c2--) { - if (Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[t->Slice()]; - const GPUTPCHitId& ic = trk.TrackHits()[t->OrigTrack()->FirstHitID() + i]; - uint32_t id = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[t->Slice()][0]; - *c2 = trackCluster{id, (uint8_t)ic.RowIndex(), t->Slice(), t->Leg()}; - } else { - const GPUTPCSliceOutCluster& c = t->OrigTrack()->OutTrackClusters()[i]; - uint32_t id = Param().rec.nonConsecutiveIDs ? 
((uint32_t)((uint32_t*)&c - (uint32_t*)mkSlices[t->Slice()]->GetFirstTrack())) : c.GetId(); - *c2 = trackCluster{id, c.GetRow(), t->Slice(), t->Leg()}; - } + const GPUTPCTracker& trk = GetConstantMem()->tpcTrackers[t->Slice()]; + const GPUTPCHitId& ic = trk.TrackHits()[t->OrigTrack()->FirstHitID() + i]; + uint32_t id = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + GetConstantMem()->ioPtrs.clustersNative->clusterOffset[t->Slice()][0]; + *c2 = trackCluster{id, (uint8_t)ic.RowIndex(), t->Slice(), t->Leg()}; } nHits += nTrackHits; } @@ -1771,19 +1705,7 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread for (int32_t i = 0; i < nHits; i++) { uint8_t state; - if (Param().rec.nonConsecutiveIDs) { - const GPUTPCSliceOutCluster* c = (const GPUTPCSliceOutCluster*)((const int32_t*)mkSlices[trackClusters[i].slice]->GetFirstTrack() + trackClusters[i].id); - clXYZ[i].x = c->GetX(); - clXYZ[i].y = c->GetY(); - clXYZ[i].z = c->GetZ(); - clXYZ[i].amp = c->GetAmp(); - trackClusters[i].id = c->GetId(); -#ifdef GPUCA_TPC_RAW_PROPAGATE_PAD_ROW_TIME - cl[i] XYZ.pad = c->mPad; - cl[i] XYZ.time = c->mTime; -#endif - state = c->GetFlags(); - } else if (Param().par.earlyTpcTransform) { + if (Param().par.earlyTpcTransform) { const GPUTPCClusterData& c = GetConstantMem()->tpcTrackers[trackClusters[i].slice].ClusterData()[trackClusters[i].id - GetConstantMem()->tpcTrackers[trackClusters[i].slice].Data().ClusterIdOffset()]; clXYZ[i].x = c.x; clXYZ[i].y = c.y; @@ -1800,16 +1722,10 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread } cl[i].state = state & GPUTPCGMMergedTrackHit::clustererAndSharedFlags; // Only allow edge, deconvoluted, and shared flags cl[i].row = trackClusters[i].row; - if (!Param().rec.nonConsecutiveIDs) // We already have global consecutive numbers from the slice tracker, and we need to keep them for late cluster attachment - { - cl[i].num = trackClusters[i].id; - } else { // 
Produce consecutive numbers for shared cluster flagging - cl[i].num = iOutTrackFirstCluster + i; - mGlobalClusterIDs[cl[i].num] = trackClusters[i].id; - } + cl[i].num = trackClusters[i].id; cl[i].slice = trackClusters[i].slice; cl[i].leg = trackClusters[i].leg; - } // nHits + } uint32_t iOutputTrack = CAMath::AtomicAdd(&mMemory->nOutputTracks, 1u); if (iOutputTrack >= mNMaxTracks) { @@ -2052,17 +1968,11 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit2(int32_t nBlocks, int32_t nThr GPUd() void GPUTPCGMMerger::Finalize0(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - if (Param().rec.nonConsecutiveIDs) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { - mClusters[i].num = mGlobalClusterIDs[i]; - } - } else { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { - mTrackSort[mTrackOrderAttach[i]] = i; - } - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { - mClusterAttachment[mClusters[i].num] = 0; // Reset adjacent attachment for attached clusters, set correctly below - } + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { + mTrackSort[mTrackOrderAttach[i]] = i; + } + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { + mClusterAttachment[mClusters[i].num] = 0; // Reset adjacent attachment for attached clusters, set correctly below } } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h index a9b510e1714ba..3e4ae535fb740 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h @@ -98,7 +98,6 @@ class GPUTPCGMMerger : public GPUProcessor void SetMaxData(const GPUTrackingInOutPointers& io); void* SetPointersMerger(void* mem); void* SetPointersRefitScratch(void* mem); - void* 
SetPointersRefitScratch2(void* mem); void* SetPointersOutput(void* mem); void* SetPointersOutputO2(void* mem); void* SetPointersOutputO2Clus(void* mem); @@ -107,8 +106,6 @@ class GPUTPCGMMerger : public GPUProcessor void* SetPointersOutputState(void* mem); void* SetPointersMemory(void* mem); - void SetSliceData(int32_t index, const GPUTPCSliceOutput* sliceData) { mkSlices[index] = sliceData; } - GPUhdi() int32_t NOutputTracks() const { return mMemory->nOutputTracks; } GPUhdi() const GPUTPCGMMergedTrack* OutputTracks() const { return mOutputTracks; } GPUhdi() GPUTPCGMMergedTrack* OutputTracks() { return mOutputTracks; } @@ -246,8 +243,6 @@ class GPUTPCGMMerger : public GPUProcessor int32_t mNextSliceInd[NSLICES]; int32_t mPrevSliceInd[NSLICES]; - const GPUTPCSliceOutput* mkSlices[NSLICES]; //* array of input slice tracks - int32_t* mTrackLinks; int32_t* mTrackCCRoots; // root of the connected component of this track @@ -273,7 +268,6 @@ class GPUTPCGMMerger : public GPUProcessor int32_t* mSliceTrackInfoIndex; GPUTPCGMMergedTrackHit* mClusters; GPUTPCGMMergedTrackHitXYZ* mClustersXYZ; - int32_t* mGlobalClusterIDs; GPUAtomic(uint32_t) * mClusterAttachment; o2::tpc::TrackTPC* mOutputTracksTPCO2; uint32_t* mOutputClusRefsTPCO2; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index a59af7529a97d..0463966c582a5 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -67,7 +67,7 @@ void GPUTPCGMMerger::DumpMergeRanges(std::ostream& out, int32_t withinSlice, int GPUTPCGMBorderTrack *b1, *b2; int32_t jSlice; MergeBorderTracksSetup(n1, n2, b1, b2, jSlice, i, withinSlice, mergeMode); - const int32_t nTrk = Param().rec.tpc.mergerReadFromTrackerDirectly ? 
*mRec->GetConstantMem().tpcTrackers[jSlice].NTracks() : mkSlices[jSlice]->NTracks(); + const int32_t nTrk = *mRec->GetConstantMem().tpcTrackers[jSlice].NTracks(); const gputpcgmmergertypes::GPUTPCGMBorderRange* range1 = BorderRange(i); const gputpcgmmergertypes::GPUTPCGMBorderRange* range2 = BorderRange(jSlice) + nTrk; out << "\nBorder Tracks : i " << i << " withinSlice " << withinSlice << " mergeMode " << mergeMode << "\n"; @@ -174,7 +174,7 @@ void GPUTPCGMMerger::DumpFitPrepare(std::ostream& out) const } out << "\n"; } - uint32_t maxId = Param().rec.nonConsecutiveIDs ? mMemory->nOutputTrackClusters : mNMaxClusters; + uint32_t maxId = mNMaxClusters; uint32_t j = 0; for (uint32_t i = 0; i < maxId; i++) { if ((mClusterAttachment[i] & attachFlagMask) != 0) { @@ -225,7 +225,7 @@ void GPUTPCGMMerger::DumpFinal(std::ostream& out) const } out << "\n"; } - uint32_t maxId = Param().rec.nonConsecutiveIDs ? mMemory->nOutputTrackClusters : mNMaxClusters; + uint32_t maxId = mNMaxClusters; uint32_t j = 0; for (uint32_t i = 0; i < maxId; i++) { if ((mClusterAttachment[i] & attachFlagMask) != 0) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx b/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx index 3c774b13ce5b1..6c8641517b80d 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMSliceTrack.cxx @@ -95,26 +95,15 @@ GPUd() void GPUTPCGMSliceTrack::SetParam2(const GPUTPCGMTrackParam& trk) GPUd() bool GPUTPCGMSliceTrack::FilterErrors(const GPUTPCGMMerger* merger, int32_t iSlice, float maxSinPhi, float sinPhiMargin) { float lastX; - if (merger->Param().par.earlyTpcTransform && !merger->Param().rec.tpc.mergerReadFromTrackerDirectly) { - lastX = mOrigTrack->OutTrackCluster(mOrigTrack->NHits() - 1).GetX(); // TODO: Why is this needed, Row2X should work, but looses some tracks - } else { - //float lastX = merger->Param().tpcGeometry.Row2X(mOrigTrack->Cluster(mOrigTrack->NClusters() - 1).GetRow()); // TODO: again, why does this reduce 
efficiency? - float y, z; - const GPUTPCSliceOutCluster* clo; - int32_t row, index; - if (merger->Param().rec.tpc.mergerReadFromTrackerDirectly) { - const GPUTPCTracker& trk = merger->GetConstantMem()->tpcTrackers[iSlice]; - const GPUTPCHitId& ic = trk.TrackHits()[mOrigTrack->FirstHitID() + mOrigTrack->NHits() - 1]; - index = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + merger->GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; - row = ic.RowIndex(); - } else { - clo = &mOrigTrack->OutTrackCluster(mOrigTrack->NHits() - 1); - index = clo->GetId(); - row = clo->GetRow(); - } - const ClusterNative& cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[index]; - GPUTPCConvertImpl::convert(*merger->GetConstantMem(), iSlice, row, cl.getPad(), cl.getTime(), lastX, y, z); - } + // float lastX = merger->Param().tpcGeometry.Row2X(mOrigTrack->Cluster(mOrigTrack->NClusters() - 1).GetRow()); // TODO: Why is this needed to be set below, Row2X should work, but looses some tracks + float y, z; + int32_t row, index; + const GPUTPCTracker& trk = merger->GetConstantMem()->tpcTrackers[iSlice]; + const GPUTPCHitId& ic = trk.TrackHits()[mOrigTrack->FirstHitID() + mOrigTrack->NHits() - 1]; + index = trk.Data().ClusterDataIndex(trk.Data().Row(ic.RowIndex()), ic.HitIndex()) + merger->GetConstantMem()->ioPtrs.clustersNative->clusterOffset[iSlice][0]; + row = ic.RowIndex(); + const ClusterNative& cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[index]; + GPUTPCConvertImpl::convert(*merger->GetConstantMem(), iSlice, row, cl.getPad(), cl.getTime(), lastX, y, z); const int32_t N = 3; diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx index c038146cf8497..d5a941b333c6e 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx @@ -64,7 +64,7 @@ void GPUTPCTracker::InitializeProcessor() bool 
GPUTPCTracker::SliceDataOnGPU() { - return (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) && (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) && mRec->GetParam().rec.tpc.mergerReadFromTrackerDirectly && (mRec->GetConstantMem().ioPtrs.clustersNative || mRec->GetConstantMem().ioPtrs.tpcZS || mRec->GetConstantMem().ioPtrs.tpcPackedDigits); + return (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) && (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) && (mRec->GetConstantMem().ioPtrs.clustersNative || mRec->GetConstantMem().ioPtrs.tpcZS || mRec->GetConstantMem().ioPtrs.tpcPackedDigits); } void* GPUTPCTracker::SetPointersDataInput(void* mem) { return mData.SetPointersInput(mem, mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging, SliceDataOnGPU()); } @@ -117,7 +117,7 @@ void GPUTPCTracker::RegisterMemoryAllocation() mMemoryResCommon = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersCommon, GPUMemoryResource::MEMORY_PERMANENT, "TPCTrackerCommon"); mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataRows, GPUMemoryResource::MEMORY_PERMANENT, "TPCSliceRows"); - uint32_t type = mRec->GetProcessingSettings().fullMergerOnGPU ? 
GPUMemoryResource::MEMORY_SCRATCH : GPUMemoryResource::MEMORY_OUTPUT; + uint32_t type = GPUMemoryResource::MEMORY_SCRATCH; if (mRec->GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { // For individual scheme, we allocate tracklets separately, and change the type for the following allocations to custom type |= GPUMemoryResource::MEMORY_CUSTOM; mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type, "TPCTrackerTracklets"); diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 4bfcc312e27e7..e6017788144e0 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -164,7 +164,7 @@ int32_t ReadConfiguration(int argc, char** argv) } #endif #ifndef GPUCA_TPC_GEOMETRY_O2 - configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0; +#error Why was configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0 needed? 
configStandalone.proc.ompKernels = false; configStandalone.proc.createO2Output = 0; if (configStandalone.rundEdx == -1) { @@ -412,7 +412,7 @@ int32_t SetupReconstruction() } steps.outputs.clear(); - steps.outputs.setBits(GPUDataTypes::InOutType::TPCSectorTracks, steps.steps.isSet(GPUDataTypes::RecoStep::TPCSliceTracking) && !recSet.tpc.mergerReadFromTrackerDirectly); + steps.outputs.setBits(GPUDataTypes::InOutType::TPCSectorTracks, false); steps.outputs.setBits(GPUDataTypes::InOutType::TPCMergedTracks, steps.steps.isSet(GPUDataTypes::RecoStep::TPCMerging)); steps.outputs.setBits(GPUDataTypes::InOutType::TPCCompressedClusters, steps.steps.isSet(GPUDataTypes::RecoStep::TPCCompression)); steps.outputs.setBits(GPUDataTypes::InOutType::TRDTracks, steps.steps.isSet(GPUDataTypes::RecoStep::TRDTracking)); diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 2aa0611b33779..70a093c7f1de7 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -909,11 +909,6 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx bool mcAvail = mcPresent() || tracksExtMC; - if (mcAvail && !tracksExtMC && mTracking->GetParam().rec.nonConsecutiveIDs) { - GPUError("QA incompatible to non-consecutive MC labels"); - return; - } - if (mcAvail) { // Assign Track MC Labels timer.Start(); From 91b094539e2e18848ae407fd54efa8d025ffabb6 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 4 Feb 2025 01:44:02 +0100 Subject: [PATCH 2/2] GPU: Remove support for host helper threads (no longer used) --- Common/Topologies/o2prototype_topology.xml | 2 +- GPU/GPUTracking/Base/GPUReconstruction.cxx | 3 - GPU/GPUTracking/Base/GPUReconstructionCPU.h | 10 +- .../Base/GPUReconstructionDeviceBase.cxx | 139 ------------------ .../Base/GPUReconstructionDeviceBase.h | 17 +-- .../Base/GPUReconstructionHelpers.h | 50 ------- GPU/GPUTracking/CMakeLists.txt | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 - GPU/GPUTracking/Global/GPUChain.h | 13 -- 
GPU/GPUTracking/Global/GPUChainTracking.h | 7 +- .../Global/GPUChainTrackingSliceTracker.cxx | 65 +------- 11 files changed, 8 insertions(+), 300 deletions(-) delete mode 100644 GPU/GPUTracking/Base/GPUReconstructionHelpers.h diff --git a/Common/Topologies/o2prototype_topology.xml b/Common/Topologies/o2prototype_topology.xml index 240b8d87d469a..8d53c9eb0127a 100644 --- a/Common/Topologies/o2prototype_topology.xml +++ b/Common/Topologies/o2prototype_topology.xml @@ -74,7 +74,7 @@ The following parameters need adjustment when extending the FLP-EPN configuratio - $ALICEO2_INSTALL_DIR/bin/aliceHLTWrapper Tracker_%collectionIndex%_%taskIndex% 1 --dds --poll-period 100 --input type=pull,size=5000,method=connect,property=EPNReceiverOutputAddress,count=1 --output type=push,size=500,method=bind,property=TrackingOutputAddress,min-port=48000 --library libAliHLTTPC.so --component TPCCATracker --run 167808 --parameter '-GlobalTracking -allowGPU -GPUHelperThreads 4 -loglevel=0x7c' + $ALICEO2_INSTALL_DIR/bin/aliceHLTWrapper Tracker_%collectionIndex%_%taskIndex% 1 --dds --poll-period 100 --input type=pull,size=5000,method=connect,property=EPNReceiverOutputAddress,count=1 --output type=push,size=500,method=bind,property=TrackingOutputAddress,min-port=48000 --library libAliHLTTPC.so --component TPCCATracker --run 167808 --parameter '-GlobalTracking -allowGPU -loglevel=0x7c' EPNReceiverOutputAddress diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 1496300818fd8..270f092a1fd29 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -278,9 +278,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() if (!(mRecoSteps.stepsGPUMask & GPUDataTypes::RecoStep::TPCMerging)) { mProcessingSettings.mergerSortTracks = false; } - if (!IsGPU()) { - mProcessingSettings.nDeviceHelperThreads = 0; - } if (mProcessingSettings.debugLevel > 3 || !IsGPU() || 
mProcessingSettings.deterministicGPUReconstruction) { mProcessingSettings.delayedOutput = false; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 8cc753731d074..27959382e7b67 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -16,7 +16,6 @@ #define GPURECONSTRUCTIONICPU_H #include "GPUReconstruction.h" -#include "GPUReconstructionHelpers.h" #include "GPUConstantMem.h" #include #include "utils/timer.h" @@ -117,13 +116,6 @@ class GPUReconstructionCPU : public GPUReconstructionKernelsPtr(), res->PtrDevice()); } size_t TransferMemoryResourceToHost(GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { return TransferMemoryInternal(res, stream, ev, evList, nEvents, false, res->PtrDevice(), res->Ptr()); } @@ -294,7 +286,7 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num) static int32_t id = getNextTimerId(); timerMeta* timer = getTimerById(id); if (timer == nullptr) { - int32_t max = std::max({getOMPMaxThreads(), mProcessingSettings.nDeviceHelperThreads + 1, mProcessingSettings.nStreams}); + int32_t max = std::max({getOMPMaxThreads(), mProcessingSettings.nStreams}); timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep); } if (num == -1) { diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx index 3522095622ad4..91715fab4f668 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx @@ -41,57 +41,6 @@ GPUReconstructionDeviceBase::GPUReconstructionDeviceBase(const GPUSettingsDevice GPUReconstructionDeviceBase::~GPUReconstructionDeviceBase() = default; -void* GPUReconstructionDeviceBase::helperWrapper_static(void* arg) -{ - GPUReconstructionHelpers::helperParam* par = (GPUReconstructionHelpers::helperParam*)arg; - 
GPUReconstructionDeviceBase* cls = par->cls; - return cls->helperWrapper(par); -} - -void* GPUReconstructionDeviceBase::helperWrapper(GPUReconstructionHelpers::helperParam* par) -{ - if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("\tHelper thread %d starting", par->num); - } - - // cpu_set_t mask; //TODO add option - // CPU_ZERO(&mask); - // CPU_SET(par->num * 2 + 2, &mask); - // sched_setaffinity(0, sizeof(mask), &mask); - - par->mutex[0].lock(); - while (par->terminate == false) { - for (int32_t i = par->num + 1; i < par->count; i += mProcessingSettings.nDeviceHelperThreads + 1) { - // if (mProcessingSettings.debugLevel >= 3) GPUInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->num, i, par->phase); - if ((par->functionCls->*par->function)(i, par->num + 1, par)) { - par->error = 1; - } - if (par->reset) { - break; - } - par->done = i + 1; - // if (mProcessingSettings.debugLevel >= 3) GPUInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->num, i, par->phase); - } - ResetThisHelperThread(par); - par->mutex[0].lock(); - } - if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("\tHelper thread %d terminating", par->num); - } - par->mutex[1].unlock(); - pthread_exit(nullptr); - return (nullptr); -} - -void GPUReconstructionDeviceBase::ResetThisHelperThread(GPUReconstructionHelpers::helperParam* par) -{ - if (par->reset) { - GPUImportant("GPU Helper Thread %d reseting", par->num); - } - par->reset = false; - par->mutex[1].unlock(); -} - int32_t GPUReconstructionDeviceBase::GetGlobalLock(void*& pLock) { #ifdef _WIN32 @@ -138,86 +87,6 @@ void GPUReconstructionDeviceBase::ReleaseGlobalLock(void* sem) #endif } -void GPUReconstructionDeviceBase::ResetHelperThreads(int32_t helpers) -{ - GPUImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d (%d))", mProcessingSettings.nDeviceHelperThreads, mNSlaveThreads); - SynchronizeGPU(); - for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) { - 
mHelperParams[i].reset = true; - if (helpers || i >= mProcessingSettings.nDeviceHelperThreads) { - pthread_mutex_lock(&((pthread_mutex_t*)mHelperParams[i].mutex)[1]); - } - } - GPUImportant("GPU Tracker helper threads have ben reset"); -} - -int32_t GPUReconstructionDeviceBase::StartHelperThreads() -{ - int32_t nThreads = mProcessingSettings.nDeviceHelperThreads; - if (nThreads) { - mHelperParams = new GPUReconstructionHelpers::helperParam[nThreads]; - if (mHelperParams == nullptr) { - GPUError("Memory allocation error"); - ExitDevice(); - return (1); - } - for (int32_t i = 0; i < nThreads; i++) { - mHelperParams[i].cls = this; - mHelperParams[i].terminate = false; - mHelperParams[i].reset = false; - mHelperParams[i].num = i; - for (int32_t j = 0; j < 2; j++) { - mHelperParams[i].mutex[j].lock(); - } - - if (pthread_create(&mHelperParams[i].threadId, nullptr, helperWrapper_static, &mHelperParams[i])) { - GPUError("Error starting slave thread"); - ExitDevice(); - return (1); - } - } - } - mNSlaveThreads = nThreads; - return (0); -} - -int32_t GPUReconstructionDeviceBase::StopHelperThreads() -{ - if (mNSlaveThreads) { - for (int32_t i = 0; i < mNSlaveThreads; i++) { - mHelperParams[i].terminate = true; - mHelperParams[i].mutex[0].unlock(); - mHelperParams[i].mutex[1].lock(); - if (pthread_join(mHelperParams[i].threadId, nullptr)) { - GPUError("Error waiting for thread to terminate"); - return (1); - } - } - delete[] mHelperParams; - } - mNSlaveThreads = 0; - return (0); -} - -void GPUReconstructionDeviceBase::WaitForHelperThreads() -{ - for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) { - pthread_mutex_lock(&((pthread_mutex_t*)mHelperParams[i].mutex)[1]); - } -} - -void GPUReconstructionDeviceBase::RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t i, int32_t t, GPUReconstructionHelpers::helperParam* p), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) -{ - for (int32_t i = 0; i < 
mProcessingSettings.nDeviceHelperThreads; i++) { - mHelperParams[i].done = 0; - mHelperParams[i].error = 0; - mHelperParams[i].function = function; - mHelperParams[i].functionCls = functionCls; - mHelperParams[i].count = count; - pthread_mutex_unlock(&((pthread_mutex_t*)mHelperParams[i].mutex)[0]); - } -} - int32_t GPUReconstructionDeviceBase::InitDevice() { // cpu_set_t mask; @@ -262,10 +131,6 @@ int32_t GPUReconstructionDeviceBase::InitDevice() mProcShadow.mMemoryResProcessors = RegisterMemoryAllocation(&mProcShadow, &GPUProcessorProcessors::SetPointersDeviceProcessor, GPUMemoryResource::MEMORY_PERMANENT | GPUMemoryResource::MEMORY_HOST, "Processors"); AllocateRegisteredMemory(mProcShadow.mMemoryResProcessors); - if (StartHelperThreads()) { - return (1); - } - if (mMaster == nullptr || mProcessingSettings.debugLevel >= 2) { GPUInfo("GPU Tracker initialization successfull"); // Verbosity reduced because GPU backend will print GPUImportant message! } @@ -282,10 +147,6 @@ void* GPUReconstructionDeviceBase::GPUProcessorProcessors::SetPointersDeviceProc int32_t GPUReconstructionDeviceBase::ExitDevice() { - if (StopHelperThreads()) { - return (1); - } - int32_t retVal = ExitDevice_Runtime(); mProcessorsShadow = nullptr; mHostMemoryPool = mHostMemoryBase = mDeviceMemoryPool = mDeviceMemoryBase = mHostMemoryPoolEnd = mDeviceMemoryPoolEnd = mHostMemoryPermanent = mDeviceMemoryPermanent = nullptr; diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h index 215615f558442..1381fd0f76981 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h @@ -17,7 +17,6 @@ #include "GPUReconstructionCPU.h" #include -#include "GPUReconstructionHelpers.h" #include "GPUChain.h" #include @@ -61,24 +60,10 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU size_t GPUMemCpyAlways(bool onGpu, void* dst, const void* src, size_t size, int32_t stream, 
int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override; size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override = 0; - int32_t StartHelperThreads() override; - int32_t StopHelperThreads() override; - void RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) override; - int32_t HelperError(int32_t iThread) const override { return mHelperParams[iThread].error; } - int32_t HelperDone(int32_t iThread) const override { return mHelperParams[iThread].done; } - void WaitForHelperThreads() override; - void ResetHelperThreads(int32_t helpers) override; - void ResetThisHelperThread(GPUReconstructionHelpers::helperParam* par); - int32_t GetGlobalLock(void*& pLock); void ReleaseGlobalLock(void* sem); - static void* helperWrapper_static(void* arg); - void* helperWrapper(GPUReconstructionHelpers::helperParam* par); - - int32_t mDeviceId = -1; // Device ID used by backend - GPUReconstructionHelpers::helperParam* mHelperParams = nullptr; // Control Struct for helper threads - int32_t mNSlaveThreads = 0; // Number of slave threads currently active + int32_t mDeviceId = -1; // Device ID used by backend struct DebugEvents { deviceEvent DebugStart, DebugStop; // Debug timer events diff --git a/GPU/GPUTracking/Base/GPUReconstructionHelpers.h b/GPU/GPUTracking/Base/GPUReconstructionHelpers.h deleted file mode 100644 index c55e81905f32f..0000000000000 --- a/GPU/GPUTracking/Base/GPUReconstructionHelpers.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. 
-// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file GPUReconstructionHelpers.h -/// \author David Rohr - -#ifndef GPURECONSTRUCTIONHELPERS_H -#define GPURECONSTRUCTIONHELPERS_H - -#include - -namespace o2 -{ -namespace gpu -{ -class GPUReconstructionDeviceBase; -class GPUReconstructionHelpers -{ - public: - class helperDelegateBase - { - }; - - struct helperParam { - pthread_t threadId; - GPUReconstructionDeviceBase* cls; - int32_t num; - std::mutex mutex[2]; - int8_t terminate; - helperDelegateBase* functionCls; - int32_t (helperDelegateBase::*function)(int32_t, int32_t, helperParam*); - int32_t phase; - int32_t count; - volatile int32_t done; - volatile int8_t error; - volatile int8_t reset; - }; -}; -} // namespace gpu -} // namespace o2 - -#endif diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 5dd92d41db29b..6acc7fd1dd537 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -104,7 +104,6 @@ set(HDRS_INSTALL Base/GPUConstantMem.h Base/GPUParam.inc Base/GPUParamRTC.h - Base/GPUReconstructionHelpers.h Base/GPUReconstructionIncludes.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index c10793975453d..ca6f2f370300e 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -252,7 +252,6 @@ AddOption(registerStandaloneInputMemory, bool, false, "registerInputMemory", 0, AddOption(ompThreads, int32_t, -1, "omp", 't', "Number of OMP threads to run (-1: all)", min(-1), message("Using %s OMP threads")) 
AddOption(ompKernels, uint8_t, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels") AddOption(ompAutoNThreads, bool, true, "", 0, "Auto-adjust number of OMP threads, decreasing the number for small input data") -AddOption(nDeviceHelperThreads, int32_t, 1, "", 0, "Number of CPU helper threads for CPU processing") AddOption(nStreams, int8_t, 8, "", 0, "Number of GPU streams / command queues") AddOption(nTPCClustererLanes, int8_t, -1, "", 0, "Number of TPC clusterers that can run in parallel (-1 = autoset)") AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster max fragment len to a certain value (-1 = autodetect)") diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 06650f9d9c733..0981fea43810a 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -16,7 +16,6 @@ #define GPUCHAIN_H #include "GPUReconstructionCPU.h" -#include "GPUReconstructionHelpers.h" namespace o2 { @@ -111,12 +110,6 @@ class GPUChain } } inline void StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents = 1) { mRec->StreamWaitForEvents(stream, evList, nEvents); } - template - void RunHelperThreads(T function, GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count); - inline void WaitForHelperThreads() { mRec->WaitForHelperThreads(); } - inline int32_t HelperError(int32_t iThread) const { return mRec->HelperError(iThread); } - inline int32_t HelperDone(int32_t iThread) const { return mRec->HelperDone(iThread); } - inline void ResetHelperThreads(int32_t helpers) { mRec->ResetHelperThreads(helpers); } inline int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1) { return mRec->GPUDebug(state, stream); } // nEvents is forced to 0 if evList == nullptr inline void TransferMemoryResourceToGPU(RecoStep step, GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = 
nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { timeCpy(step, true, &GPUReconstructionCPU::TransferMemoryResourceToGPU, res, stream, ev, evList, nEvents); } @@ -242,12 +235,6 @@ class GPUChain void timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args); }; -template -inline void GPUChain::RunHelperThreads(T function, GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) -{ - mRec->RunHelperThreads((int32_t(GPUReconstructionHelpers::helperDelegateBase::*)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*))function, functionCls, count); -} - template inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args) { diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 6d6d82b518097..d827b095773b1 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -16,7 +16,6 @@ #define GPUCHAINTRACKING_H #include "GPUChain.h" -#include "GPUReconstructionHelpers.h" #include "GPUDataTypes.h" #include #include @@ -68,7 +67,7 @@ struct GPUTPCCFChainContext; struct GPUNewCalibValues; struct GPUTriggerOutputs; -class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelegateBase +class GPUChainTracking : public GPUChain { friend class GPUReconstruction; @@ -314,15 +313,11 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); - std::atomic_flag mLockAtomicOutputBuffer = ATOMIC_FLAG_INIT; std::mutex mMutexUpdateCalib; std::unique_ptr mPipelineFinalizationCtx; GPUChainTrackingFinalContext* mPipelineNotifyCtx = nullptr; std::function mWaitForFinalInputs; - int32_t HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par); - int32_t HelperOutput(int32_t iSlice, int32_t threadId, 
GPUReconstructionHelpers::helperParam* par); - int32_t OutputStream() const { return mRec->NStreams() - 2; } }; } // namespace gpu diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index 35a8c6c455048..174b3757d3307 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -55,9 +55,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices() if (retVal) { SynchronizeGPU(); } - if (retVal >= 2) { - ResetHelperThreads(retVal >= 3); - } return (retVal != 0); } @@ -114,9 +111,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() processorsShadow()->tpcTrackers[iSlice].SetGPUTextureBase(mRec->DeviceMemoryBase()); } - if (!doSliceDataOnGPU) { - RunHelperThreads(&GPUChainTracking::HelperReadEvent, this, NSLICES); - } if (PrepareTextures()) { return (2); } @@ -183,22 +177,12 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? 
nullptr : &mEvents->init}}); streamInit[useStream] = true; - } else if (!doGPU || iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) == 0) { + } else { if (ReadEvent(iSlice, 0)) { GPUError("Error reading event"); error = 1; continue; } - } else { - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Waiting for helper thread %d", iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1); - } - while (HelperDone(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1) < (int32_t)iSlice) { - } - if (HelperError(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1)) { - error = 1; - continue; - } } if (GetProcessingSettings().deterministicGPUReconstruction) { runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}}); @@ -297,9 +281,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() if (doGPU) { ReleaseEvent(mEvents->init); } - if (!doSliceDataOnGPU) { - WaitForHelperThreads(); - } if (!GetProcessingSettings().trackletSelectorInPipeline) { if (GetProcessingSettings().trackletConstructorInPipeline) { @@ -359,7 +340,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() if (param().rec.tpc.globalTracking) { mWriteOutputDone.fill(0); } - RunHelperThreads(&GPUChainTracking::HelperOutput, this, NSLICES); uint32_t tmpSlice = 0; for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { @@ -402,12 +382,12 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() } if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1)); + GPUInfo("Data ready for slice %d", iSlice); } mSliceSelectorReady = iSlice; if (param().rec.tpc.globalTracking) { - for (uint32_t tmpSlice2a = 0; tmpSlice2a <= iSlice; tmpSlice2a += GetProcessingSettings().nDeviceHelperThreads + 1) { + for (uint32_t tmpSlice2a = 0; tmpSlice2a <= iSlice; tmpSlice2a++) { uint32_t tmpSlice2 = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(tmpSlice2a); 
uint32_t sliceLeft, sliceRight; GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice2, sliceLeft, sliceRight); @@ -419,12 +399,9 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() } } } else { - if (iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) == 0) { - WriteOutput(iSlice, 0); - } + WriteOutput(iSlice, 0); } } - WaitForHelperThreads(); } if (!(GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) && param().rec.tpc.globalTracking) { std::vector blocking(NSLICES * mRec->NStreams()); @@ -518,43 +495,9 @@ void GPUChainTracking::WriteOutput(int32_t iSlice, int32_t threadId) if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Running WriteOutput for slice %d on thread %d\n", iSlice, threadId); } - if (GetProcessingSettings().nDeviceHelperThreads) { - while (mLockAtomicOutputBuffer.test_and_set(std::memory_order_acquire)) { - } - } processors()->tpcTrackers[iSlice].WriteOutputPrepare(); - if (GetProcessingSettings().nDeviceHelperThreads) { - mLockAtomicOutputBuffer.clear(); - } processors()->tpcTrackers[iSlice].WriteOutput(); if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Finished WriteOutput for slice %d on thread %d\n", iSlice, threadId); } } - -int32_t GPUChainTracking::HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) { return ReadEvent(iSlice, threadId); } - -int32_t GPUChainTracking::HelperOutput(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) -{ - if (param().rec.tpc.globalTracking) { - uint32_t tmpSlice = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(iSlice); - uint32_t sliceLeft, sliceRight; - GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice, sliceLeft, sliceRight); - - while (mSliceSelectorReady < (int32_t)tmpSlice || mSliceSelectorReady < (int32_t)sliceLeft || mSliceSelectorReady < (int32_t)sliceRight) { - if (par->reset) { - return 1; - } - } - GlobalTracking(tmpSlice, 0); - WriteOutput(tmpSlice, 0); - } else { - while 
(mSliceSelectorReady < iSlice) { - if (par->reset) { - return 1; - } - } - WriteOutput(iSlice, threadId); - } - return 0; -}