2 changes: 0 additions & 2 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -88,8 +88,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
int32_t ExitDevice() override;
int32_t GetThread();

virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }

// Pointers to tracker classes
GPUProcessorProcessors mProcShadow; // Host copy of tracker objects that will be used on the GPU
GPUConstantMem*& mProcessorsShadow = mProcShadow.mProcessorsProc;
21 changes: 0 additions & 21 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -470,27 +470,6 @@ void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEve

void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList<cl_event>())); }

int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event)
{
if (GetProcessingSettings().stuckProtection) {
cl_int tmp = 0;
for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection / 50; i++) {
usleep(50);
clGetEventInfo(event.get<cl_event>(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr);
if (tmp == CL_COMPLETE) {
break;
}
}
if (tmp != CL_COMPLETE) {
mGPUStuck = 1;
GPUErrorReturn("GPU Stuck, future processing in this component is disabled, skipping event (GPU Event State %d)", (int32_t)tmp);
}
} else {
clFinish(mInternals->command_queue[stream]);
}
return 0;
}

void GPUReconstructionOCL::SynchronizeGPU()
{
for (int32_t i = 0; i < mNStreams; i++) {
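For reference, the removed DoStuckProtection relied on polling the OpenCL event status instead of blocking in clFinish. A minimal standalone sketch of that polling pattern, with placeholder names (queue, ev, timeoutUs) and not part of this diff:

#include <CL/cl.h>
#include <unistd.h>

// Poll the event status in 50 us steps instead of blocking; give up once roughly
// timeoutUs microseconds have elapsed. Returns true if the event completed in time,
// false if the device appears stuck.
static bool waitOrTimeout(cl_command_queue queue, cl_event ev, int32_t timeoutUs)
{
  if (timeoutUs <= 0) {
    clFinish(queue); // protection disabled: block until the queue drains
    return true;
  }
  cl_int status = 0;
  for (int32_t i = 0; i <= timeoutUs / 50; i++) {
    usleep(50);
    clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, nullptr);
    if (status == CL_COMPLETE) {
      break;
    }
  }
  return status == CL_COMPLETE;
}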
1 change: 0 additions & 1 deletion GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -43,7 +43,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface
virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override;

void SynchronizeGPU() override;
int32_t DoStuckProtection(int32_t stream, deviceEvent event) override;
int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override;
void SynchronizeStream(int32_t stream) override;
void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) override;
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -301,7 +301,6 @@ BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Proces
AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)")
AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances")
AddOption(stuckProtection, int32_t, 0, "", 0, "Timeout in us, When AMD GPU is stuck, just continue processing and skip tracking, do not crash or stall the chain")
AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidates for single input track during propagation")
AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking")
AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))")
@@ -383,7 +382,6 @@ AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run th
AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder")
AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug")
AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
AddSubConfig(GPUSettingsProcessingRTC, rtc)
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Global/GPUChain.h
@@ -224,8 +224,6 @@ class GPUChain

inline GPUChain* GetNextChainInQueue() { return mRec->GetNextChainInQueue(); }

virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }

template <class T, class S, typename... Args>
bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::*func, Args&&... args)
{
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -475,7 +475,7 @@ int32_t GPUChainTracking::ForceInitQA()
qa.reset(new GPUQA(this));
}
if (!GetQA()->IsInitialized()) {
return GetQA()->InitQA();
return GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? -GetProcessingSettings().runQA : GPUQA::tasksAutomatic);
}
return 0;
}
@@ -690,7 +690,7 @@ int32_t GPUChainTracking::RunChain()
}
const bool needQA = GPUQA::QAAvailable() && (GetProcessingSettings().runQA || (GetProcessingSettings().eventDisplay && (mIOPtrs.nMCInfosTPC || GetProcessingSettings().runMC)));
if (needQA && GetQA()->IsInitialized() == false) {
if (GetQA()->InitQA(GetProcessingSettings().runQA ? -GetProcessingSettings().runQA : -1)) {
if (GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? -GetProcessingSettings().runQA : GPUQA::tasksAutomatic)) {
return 1;
}
}
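Both call sites above now derive the InitQA argument the same way: a non-positive runQA value is passed negated, while a positive value selects GPUQA::tasksAutomatic. An illustrative helper capturing that mapping (hypothetical name qaTasksFromRunQA; not part of this diff):

// Hypothetical helper mirroring the ternary now used in ForceInitQA() and RunChain().
static int32_t qaTasksFromRunQA(int32_t runQA)
{
  // runQA <= 0: pass the negated value as the explicit task selection (0 stays 0).
  // runQA  > 0: fall back to the automatic task set.
  return runQA <= 0 ? -runQA : GPUQA::tasksAutomatic;
}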
1 change: 1 addition & 0 deletions GPU/GPUTracking/Global/GPUChainTracking.h
@@ -306,6 +306,7 @@ class GPUChainTracking : public GPUChain
void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
bool NeedTPCClustersOnGPU();
void WriteReducedClusters();
void SortClusters(bool buildNativeGPU, bool propagateMCLabels, o2::tpc::ClusterNativeAccess* clusterAccess, o2::tpc::ClusterNative* clusters);
template <int32_t I>
int32_t RunTRDTrackingInternal();
uint32_t StreamForSector(uint32_t sector) const;
92 changes: 71 additions & 21 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -57,6 +57,8 @@
#include "utils/VcShim.h"
#include "utils/strtag.h"
#include <fstream>
#include <numeric>
#include <vector>

using namespace o2::gpu;
using namespace o2::tpc;
@@ -762,14 +764,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
ClusterNative* tmpNativeClusters = nullptr;
std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;

// setup MC Labels
bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
const bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
const bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4);

auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;

bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output

mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
if (buildNativeGPU) {
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
@@ -1281,21 +1282,20 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}

ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr;
if (propagateMCLabels) {
// TODO: write to buffer directly
if (propagateMCLabels) { // TODO: write to buffer directly
o2::dataformats::MCTruthContainer<o2::MCCompLabel> mcLabels;
std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
if (!GetProcessingSettings().tpcWriteClustersAfterRejection && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)] && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->useExternal()) {
if (!mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator) {
auto& labelOutputControl = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)];
if (!GetProcessingSettings().tpcWriteClustersAfterRejection && !sortClusters && labelOutputControl && labelOutputControl->useExternal()) {
if (!labelOutputControl->allocator) {
throw std::runtime_error("Cluster MC Label buffer missing");
}
ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator(0));
ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(labelOutputControl->allocator(0));
buffer = {&container->first, &container->second};
} else {
mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
buffer.first = mIOMem.clusterNativeMCBuffer.get();
buffer.second = mIOMem.clusterNativeMCView.get();
buffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()};
}

assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
@@ -1350,15 +1350,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if (doGPU && synchronizeCalibUpdate) {
SynchronizeStream(0);
}
if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) {
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]);
}
}
if (buildNativeGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
}
if (sortClusters) {
SortClusters(buildNativeGPU, propagateMCLabels, tmpNativeAccess, tmpNativeClusters);
}
mRec->MemoryScalers()->nTPCHits = nClsTotal;
mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST"));
@@ -1374,3 +1367,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
#endif
return 0;
}

void GPUChainTracking::SortClusters(bool buildNativeGPU, bool propagateMCLabels, ClusterNativeAccess* clusterAccess, ClusterNative* clusters)
{
if (propagateMCLabels) {
std::vector<uint32_t> clsOrder(clusterAccess->nClustersTotal);
std::iota(clsOrder.begin(), clsOrder.end(), 0);
std::vector<ClusterNative> tmpClusters;
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
const uint32_t offset = clusterAccess->clusterOffset[i][j];
std::sort(&clsOrder[offset], &clsOrder[offset + clusterAccess->nClusters[i][j]], [&clusters](const uint32_t a, const uint32_t b) {
return clusters[a] < clusters[b];
});
tmpClusters.resize(clusterAccess->nClusters[i][j]);
memcpy(tmpClusters.data(), &clusters[offset], clusterAccess->nClusters[i][j] * sizeof(tmpClusters[0]));
for (uint32_t k = 0; k < tmpClusters.size(); k++) {
clusters[offset + k] = tmpClusters[clsOrder[offset + k] - offset];
}
}
}
tmpClusters.clear();

std::pair<o2::dataformats::ConstMCLabelContainer*, o2::dataformats::ConstMCLabelContainerView*> labelBuffer;
GPUOutputControl* labelOutput = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)];
std::unique_ptr<ConstMCLabelContainerView> tmpUniqueContainerView;
std::unique_ptr<ConstMCLabelContainer> tmpUniqueContainerBuffer;
if (labelOutput && labelOutput->allocator) {
ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* labelContainer = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(labelOutput->allocator(0));
labelBuffer = {&labelContainer->first, &labelContainer->second};
} else {
tmpUniqueContainerView = std::move(mIOMem.clusterNativeMCView);
tmpUniqueContainerBuffer = std::move(mIOMem.clusterNativeMCBuffer);
mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
labelBuffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()};
}

o2::dataformats::MCLabelContainer tmpContainer;
for (uint32_t i = 0; i < clusterAccess->nClustersTotal; i++) {
for (const auto& element : clusterAccess->clustersMCTruth->getLabels(clsOrder[i])) {
tmpContainer.addElement(i, element);
}
}
tmpContainer.flatten_to(*labelBuffer.first);
*labelBuffer.second = *labelBuffer.first;
clusterAccess->clustersMCTruth = labelBuffer.second;
} else {
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
std::sort(&clusters[clusterAccess->clusterOffset[i][j]], &clusters[clusterAccess->clusterOffset[i][j] + clusterAccess->nClusters[i][j]]);
}
}
}
if (buildNativeGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)clusters, clusterAccess->nClustersTotal * sizeof(clusters[0]), -1, true);
}
}
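The MC-label branch of SortClusters above sorts an index permutation per (sector, row) range, applies it to the clusters, and then rebuilds the label container through the same permutation. A minimal, self-contained sketch of that pattern with generic names (items, labels, sortRangeWithLabels; not part of this diff):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Sort items[offset, offset + n) and keep the per-item labels attached by routing
// both through the same index permutation, as SortClusters does for each row.
template <class T, class Label>
void sortRangeWithLabels(std::vector<T>& items, std::vector<Label>& labels, uint32_t offset, uint32_t n)
{
  std::vector<uint32_t> order(items.size());
  std::iota(order.begin(), order.end(), 0); // order[k] = original global index
  std::sort(order.begin() + offset, order.begin() + offset + n,
            [&items](uint32_t a, uint32_t b) { return items[a] < items[b]; });
  std::vector<T> tmpItems(items.begin() + offset, items.begin() + offset + n);
  std::vector<Label> tmpLabels(labels.begin() + offset, labels.begin() + offset + n);
  for (uint32_t k = 0; k < n; k++) {
    items[offset + k] = tmpItems[order[offset + k] - offset];   // reordered payload
    labels[offset + k] = tmpLabels[order[offset + k] - offset]; // labels follow the same mapping
  }
}

In the merged code the labels live in an MCTruthContainer rather than a plain vector, so the second step is done once over all rows by copying getLabels(clsOrder[i]) into a fresh container and flattening it into the output label buffer.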
3 changes: 0 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
@@ -149,9 +149,6 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
GPUTPCTracker& trk = processors()->tpcTrackers[iSector];
GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSector] : trk;
int32_t useStream = StreamForSector(iSector);
if (GetProcessingSettings().amdMI100SerializationWorkaround) {
SynchronizeStream(useStream); // TODO: Remove this workaround once fixed on MI100
}

if (GetProcessingSettings().debugLevel >= 3) {
GPUInfo("Creating Sector Data (Sector %d)", iSector);
3 changes: 3 additions & 0 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -197,6 +197,9 @@ int32_t ReadConfiguration(int argc, char** argv)
printf("Can only produce QA pdf output when input files are specified!\n");
return 1;
}
if (configStandalone.QA.enableLocalOutput && !configStandalone.QA.inputHistogramsOnly && configStandalone.QA.output == "" && configStandalone.QA.plotsDir != "") {
configStandalone.QA.output = configStandalone.QA.plotsDir + "/output.root";
}
if (configStandalone.QA.inputHistogramsOnly) {
configStandalone.rundEdx = false;
configStandalone.noEvents = true;