18 changes: 11 additions & 7 deletions GPU/GPUTracking/DataCompression/GPUTPCClusterRejection.h
@@ -16,12 +16,19 @@
#define GPUTPCCLUSTERREJECTION_H

#include "GPUTPCGMMergerTypes.h"
#include "GPUCommonMath.h"

namespace o2::gpu
{
struct GPUTPCClusterRejection {
template <class T, class S>
GPUdi() static bool IsTrackRejected(const T& trk, const S& param)
{
return CAMath::Abs(trk.GetParam().GetQPt() * param.qptB5Scaler) > param.rec.tpc.rejectQPtB5 || trk.MergedLooper();
}

template <bool C, class T = void, class S = void>
static constexpr inline bool GetProtectionStatus(int32_t attach, bool& physics, bool& protect, T* counts = nullptr, S* mev200 = nullptr)
GPUdi() static constexpr bool GetRejectionStatus(int32_t attach, bool& physics, T* counts = nullptr, S* mev200 = nullptr)
{
(void)counts; // FIXME: Avoid incorrect -Wunused-but-set-parameter warning
(void)mev200;
@@ -39,7 +46,6 @@ struct GPUTPCClusterRejection {
}
retVal = true;
} else if (attach & gputpcgmmergertypes::attachTube) {
protect = true;
if constexpr (C) {
if (*mev200) {
counts->nTube200++;
@@ -49,7 +55,6 @@
}
retVal = false;
} else if ((attach & gputpcgmmergertypes::attachGood) == 0) {
protect = true;
if constexpr (C) {
counts->nRejected++;
}
@@ -60,16 +65,15 @@
}

if (attach & gputpcgmmergertypes::attachProtect) {
protect = true;
retVal = false;
}
return retVal;
}

static constexpr inline bool GetIsRejected(int32_t attach)
GPUdi() static constexpr bool GetIsRejected(int32_t attach)
{
bool physics = false, protect = false;
return GetProtectionStatus<false>(attach, physics, protect);
bool physics = false;
return GetRejectionStatus<false>(attach, physics);
}
};
} // namespace o2::gpu
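Note on the refactored helpers above: IsTrackRejected centralizes the q/pT and merged-looper cut, while GetRejectionStatus/GetIsRejected now report rejection without the removed protect out-parameter. A minimal usage sketch, not part of this PR; the templated track type and the wrapper function are illustrative assumptions:

#include "GPUTPCClusterRejection.h"

using namespace o2::gpu;

// Hypothetical wrapper combining both rejection paths, roughly mirroring what
// GPUTPCCompression::rejectCluster does further down in this PR.
template <class TTrack>
GPUd() bool clusterIsDropped(int32_t attach, const TTrack& trk, const GPUParam& param)
{
  if (GPUTPCClusterRejection::GetIsRejected(attach)) {
    return true; // rejected based on the attachment flags alone
  }
  // Clusters attached to a high-q/pT (low-pT) or merged-looper track are dropped as well.
  return GPUTPCClusterRejection::IsTrackRejected(trk, param);
}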
2 changes: 1 addition & 1 deletion GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
@@ -111,7 +111,7 @@ void GPUTPCCompression::RegisterMemoryAllocation()
if (gatherMode == 3) {
mMemoryResOutputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCCompression::SetPointersOutputGPU, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCCompressionOutputGPU");
}
uint32_t stackScratch = (gatherMode != 3) ? GPUMemoryResource::MEMORY_STACK : 0;
uint32_t stackScratch = (gatherMode != 3) ? GPUMemoryResource::MEMORY_STACK : 0; // TODO: Can we use stacked memory also with gather mode 3?
if (gatherMode < 2) {
mRec->RegisterMemoryAllocation(this, &GPUTPCCompression::SetPointersOutput, GPUMemoryResource::MEMORY_OUTPUT | stackScratch, "TPCCompressionOutput");
}
2 changes: 1 addition & 1 deletion GPU/GPUTracking/DataCompression/GPUTPCCompression.h
@@ -60,6 +60,7 @@ class GPUTPCCompression : public GPUProcessor
#ifndef GPUCA_GPUCODE
void DumpCompressedClusters(std::ostream& out);
#endif
GPUd() bool rejectCluster(int32_t idx, const GPUParam& param, const GPUTrackingInOutPointers& ioPtrs) const;

protected:
struct memory {
@@ -89,7 +90,6 @@ class GPUTPCCompression : public GPUProcessor
void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);
template <class T>
GPUd() static void truncateSignificantBits(T& val, uint32_t nBits, uint32_t max);
GPUd() bool rejectCluster(int32_t idx, GPUParam& param, const GPUTrackingInOutPointers& ioPtrs);

int16_t mMemoryResOutputHost = -1;
int16_t mMemoryResOutputGPU = -1;
GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
@@ -39,7 +39,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0at
if (!trk.OK()) {
continue;
}
bool rejectTrk = CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper();
bool rejectTrk = GPUTPCClusterRejection::IsTrackRejected(trk, param);
uint32_t nClustersStored = 0;
CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;
uint8_t lastRow = 0, lastSector = 0;
@@ -185,7 +185,7 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::opera
return mClsPtr[a].qTot < mClsPtr[b].qTot;
}

GPUd() bool GPUTPCCompression::rejectCluster(int32_t idx, GPUParam& GPUrestrict() param, const GPUTrackingInOutPointers& GPUrestrict() ioPtrs)
GPUd() bool GPUTPCCompression::rejectCluster(int32_t idx, const GPUParam& GPUrestrict() param, const GPUTrackingInOutPointers& GPUrestrict() ioPtrs) const
{
if (mClusterStatus[idx]) {
return true;
@@ -206,7 +206,7 @@ GPUd() bool GPUTPCCompression::rejectCluster(int32_t idx, GPUParam& GPUrestrict(
}
int32_t id = attach & gputpcgmmergertypes::attachTrackMask;
auto& trk = ioPtrs.mergedTracks[id];
if (CAMath::Abs(trk.GetParam().GetQPt() * param.qptB5Scaler) > param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) {
if (GPUTPCClusterRejection::IsTrackRejected(trk, param)) {
return true;
}
}
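With rejectCluster now const-qualified and taking GPUParam by const reference (see the signature change above), it can be called from read-only contexts. A hypothetical caller, purely to illustrate the new signature; the helper name and loop are not from the PR:

#include "GPUTPCCompression.h"

using namespace o2::gpu;

// Hypothetical helper: counts how many of the first nClusters clusters would be rejected,
// needing only const access to the compressor, the parameters, and the I/O pointers.
GPUd() int32_t countRejectedClusters(const GPUTPCCompression& compressor, const GPUParam& param,
                                     const GPUTrackingInOutPointers& ioPtrs, int32_t nClusters)
{
  int32_t nRejected = 0;
  for (int32_t i = 0; i < nClusters; i++) {
    if (compressor.rejectCluster(i, param, ioPtrs)) {
      nRejected++;
    }
  }
  return nRejected;
}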
1 change: 1 addition & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -596,6 +596,7 @@ AddOption(stripDumpedEvents, bool, false, "", 0, "Remove redundant inputs (e.g.
AddOption(printSettings, int32_t, 0, "", 0, "Print all settings", def(1))
AddOption(testSyncAsync, bool, false, "syncAsync", 0, "Test first synchronous and then asynchronous processing")
AddOption(testSync, bool, false, "sync", 0, "Test settings for synchronous phase")
AddOption(testSyncAsyncQcInSync, bool, false, "syncAsyncSyncQC", 0, "Run QC in sync phase of testSyncAsync")
AddOption(timeFrameTime, bool, false, "tfTime", 0, "Print some debug information about time frame processing time")
AddOption(controlProfiler, bool, false, "", 0, "Issues GPU profiler stop and start commands to profile only the relevant processing part")
AddOption(preloadEvents, bool, false, "", 0, "Preload events into host memory before start processing")
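The new testSyncAsyncQcInSync option controls whether QC runs in the synchronous or the asynchronous pass of testSyncAsync. A small self-contained sketch of that gating; the struct and the example values are hypothetical, the logic mirrors the standalone.cxx changes below:

#include <cstdio>

// Hypothetical stand-in for the relevant GPUSettingsStandalone fields.
struct ConfigSketch {
  bool testSyncAsync = true;         // run a synchronous pass followed by an asynchronous pass
  bool testSyncAsyncQcInSync = true; // new option (command-line name "syncAsyncSyncQC")
  bool runQA = true;
};

int main()
{
  ConfigSketch cfg;
  // Same gating as in SetupReconstruction(): with the flag set, QC stays in the sync pass;
  // otherwise it is deferred to the async pass.
  bool runSyncQA = cfg.runQA && (!cfg.testSyncAsync || cfg.testSyncAsyncQcInSync);
  bool runAsyncQA = cfg.runQA && !cfg.testSyncAsyncQcInSync;
  printf("QC in sync phase: %d, QC in async phase: %d\n", runSyncQA, runAsyncQA);
  return 0;
}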
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -587,7 +587,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
return ForwardTPCDigits();
}
#ifdef GPUCA_TPC_GEOMETRY_O2
[[maybe_unused]] int32_t tpcTimeBinCut = mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin;
[[maybe_unused]] int32_t tpcTimeBinCut = (mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut) ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin; // TODO: Implement time bin cut filtering
mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST"));
const auto& threadContext = GetThreadContext();
const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
18 changes: 13 additions & 5 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -72,6 +72,7 @@ extern GPUSettingsStandalone configStandalone;
}

GPUReconstruction *rec, *recAsync, *recPipeline;
uint32_t syncAsyncDecodedClusters = 0;
GPUChainTracking *chainTracking, *chainTrackingAsync, *chainTrackingPipeline;
GPUChainITS *chainITS, *chainITSAsync, *chainITSPipeline;
std::string eventsDir;
@@ -430,7 +431,7 @@ int32_t SetupReconstruction()
}
}

bool runAsyncQA = procSet.runQA;
bool runAsyncQA = procSet.runQA && !configStandalone.testSyncAsyncQcInSync;
if (configStandalone.testSyncAsync || configStandalone.testSync) {
// Set settings for synchronous
if (configStandalone.rundEdx == -1) {
@@ -439,7 +440,9 @@
recSet.useMatLUT = false;
if (configStandalone.testSyncAsync) {
procSet.eventDisplay = nullptr;
procSet.runQA = false;
if (!configStandalone.testSyncAsyncQcInSync) {
procSet.runQA = false;
}
}
}
if (configStandalone.proc.rtc.optSpecialCode == -1) {
Expand Down Expand Up @@ -664,12 +667,12 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
}

if (tmpRetVal == 0 && configStandalone.testSyncAsync) {
if (configStandalone.testSyncAsync) {
printf("Running asynchronous phase\n");
}

vecpod<char> compressedTmpMem(chainTracking->mIOPtrs.tpcCompressedClusters->totalDataSize);
memcpy(compressedTmpMem.data(), (const void*)chainTracking->mIOPtrs.tpcCompressedClusters, chainTracking->mIOPtrs.tpcCompressedClusters->totalDataSize);
o2::tpc::CompressedClusters tmp(*chainTracking->mIOPtrs.tpcCompressedClusters);
syncAsyncDecodedClusters = tmp.nAttachedClusters + tmp.nUnattachedClusters;
printf("Running asynchronous phase from %'u compressed clusters\n", syncAsyncDecodedClusters);

chainTrackingAsync->mIOPtrs = ioPtrs;
chainTrackingAsync->mIOPtrs.tpcCompressedClusters = (o2::tpc::CompressedClustersFlat*)compressedTmpMem.data();
@@ -937,6 +940,11 @@ int32_t main(int argc, char** argv)
printf("%s (Measured %s time - Extrapolated from %d clusters to %d)\n", stat, configStandalone.proc.debugLevel ? "kernel" : "wall", (int32_t)nClusters, (int32_t)nClsPerTF);
}
}
if (configStandalone.testSyncAsync && chainTracking->mIOPtrs.clustersNative && chainTrackingAsync->mIOPtrs.clustersNative) {
uint32_t rejected = chainTracking->mIOPtrs.clustersNative->nClustersTotal - syncAsyncDecodedClusters;
float rejectionPercentage = (rejected) * 100.f / chainTracking->mIOPtrs.clustersNative->nClustersTotal;
printf("Cluster Rejection: Sync: %'u, Compressed %'u, Async %'u, Rejected %'u (%7.2f%%)\n", chainTracking->mIOPtrs.clustersNative->nClustersTotal, syncAsyncDecodedClusters, chainTrackingAsync->mIOPtrs.clustersNative->nClustersTotal, rejected, rejectionPercentage);
}

if (configStandalone.preloadEvents && configStandalone.proc.doublePipeline) {
break;
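The printed cluster-rejection statistics reduce to simple counting. A standalone illustration of the arithmetic with made-up numbers (all values hypothetical):

#include <cstdint>
#include <cstdio>

int main()
{
  // Hypothetical counts: clusters found in the synchronous pass and clusters surviving
  // compression (attached + unattached), as accumulated in syncAsyncDecodedClusters above.
  uint32_t nClustersSync = 1000000;
  uint32_t nDecodedClusters = 930000;
  uint32_t rejected = nClustersSync - nDecodedClusters;
  float rejectionPercentage = rejected * 100.f / nClustersSync;
  printf("Rejected %u of %u clusters (%.2f%%)\n", rejected, nClustersSync, rejectionPercentage);
  return 0;
}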