AliceO2Group · davidrohr · Oct 5, 2025 · Sep 24, 2025 · Oct 4, 2025 · Oct 4, 2025
@@ -35,15 +35,15 @@ struct CorrectionMapsLoaderGloOpts;
 namespace reco_workflow
 {
 /// define input and output types of the workflow
-enum struct InputType { PassThrough,      // No processing, just pass through available inputs to the writers, defined by the OutputType
-                        Digitizer,        // directly read digits from channel {TPC:DIGITS}
-                        Digits,           // read digits from file
-                        ClustersHardware, // read hardware clusters in raw page format from file
-                        Clusters,         // read native clusters from file
-                        CompClusters,     // read compressed cluster container
-                        CompClustersCTF,  // compressed clusters from CTF, as flat format
-                        CompClustersFlat, // compressed clusters in flat format, used as input for the entropy encoder
-                        EncodedClusters,  // read encoded clusters
+enum struct InputType { PassThrough,               // No processing, just pass through available inputs to the writers, defined by the OutputType
+                        Digitizer,                 // directly read digits from channel {TPC:DIGITS}
+                        Digits,                    // read digits from file
+                        ClustersHardware,          // read hardware clusters in raw page format from file
+                        Clusters,                  // read native clusters from file
+                        CompClustersRoot,          // read compressed clusters in ROOT format
+                        CompClustersFlat,          // read compressed clusters in flat format (e.g. from CTF)
+                        CompClustersFlatForEncode, // compressed clusters in flat format, used only as input for the entropy encoder (GPU reco is not run)
+                        EncodedClusters,           // read encoded clusters
                         ZSRaw,
 };
 
@@ -59,7 +59,8 @@ enum struct OutputType { Digits,
                          ClustersHardware,
                          Clusters,
                          Tracks,
-                         CompClusters,
+                         CompClustersRoot,
+                         CompClustersFlat,
                          EncodedClusters,
                          DisableWriter,
                          SendClustersPerSector,

@@ -80,16 +80,17 @@ const std::unordered_map<std::string, InputType> InputMap{
   {"clustershardware", InputType::ClustersHardware},
   {"clusters", InputType::Clusters},
   {"zsraw", InputType::ZSRaw},
-  {"compressed-clusters", InputType::CompClusters},
-  {"compressed-clusters-ctf", InputType::CompClustersCTF},
-  {"compressed-clusters-flat", InputType::CompClustersFlat}};
+  {"compressed-clusters-root", InputType::CompClustersRoot},
+  {"compressed-clusters-flat", InputType::CompClustersFlat},
+  {"compressed-clusters-flat-for-encode", InputType::CompClustersFlatForEncode}};
 
 const std::unordered_map<std::string, OutputType> OutputMap{
   {"digits", OutputType::Digits},
   {"clustershardware", OutputType::ClustersHardware},
   {"clusters", OutputType::Clusters},
   {"tracks", OutputType::Tracks},
-  {"compressed-clusters", OutputType::CompClusters},
+  {"compressed-clusters-root", OutputType::CompClustersRoot},
+  {"compressed-clusters-flat", OutputType::CompClustersFlat},
   {"encoded-clusters", OutputType::EncodedClusters},
   {"disable-writer", OutputType::DisableWriter},
   {"send-clusters-per-sector", OutputType::SendClustersPerSector},
@@ -118,18 +119,23 @@ framework::WorkflowSpec getWorkflow(CompletionPolicyData* policyData, std::vecto
     return std::find(outputTypes.begin(), outputTypes.end(), type) != outputTypes.end();
   };
 
-  if (filteredInp && !(inputType == InputType::PassThrough && isEnabled(OutputType::Tracks) && isEnabled(OutputType::Clusters) && isEnabled(OutputType::SendClustersPerSector))) {
-    throw std::invalid_argument("filtered-input option must be provided only with pass-through input and clusters,tracks,send-clusters-per-sector output");
+  if (filteredInp && !(inputType == InputType::PassThrough)) {
+    throw std::invalid_argument("filtered-input option must be provided only with pass-through input");
   }
 
-  bool decompressTPC = inputType == InputType::CompClustersCTF || inputType == InputType::CompClusters;
+  bool decompressTPC = inputType == InputType::CompClustersFlat || inputType == InputType::CompClustersRoot;
   // Disable not applicable settings depending on TPC input, no need to disable manually
   if (decompressTPC && (isEnabled(OutputType::Clusters) || isEnabled(OutputType::Tracks))) {
     caClusterer = false;
     zsOnTheFly = false;
     propagateMC = false;
   }
-  if (inputType == InputType::ZSRaw || inputType == InputType::CompClustersFlat) {
+  if (inputType == InputType::CompClustersFlatForEncode || inputType == InputType::CompClustersRoot || inputType == InputType::CompClustersFlat) {
+    caClusterer = false;
+    zsOnTheFly = false;
+    propagateMC = false;
+  }
+  if (inputType == InputType::ZSRaw) {
     caClusterer = true;
     zsOnTheFly = false;
     propagateMC = false;
@@ -225,7 +231,7 @@ framework::WorkflowSpec getWorkflow(CompletionPolicyData* policyData, std::vecto
       if (sclOpts.requestCTPLumi) { // need CTP digits (lumi) reader
         specs.emplace_back(o2::ctp::getDigitsReaderSpec(false));
       }
-    } else if (inputType == InputType::CompClusters) {
+    } else if (inputType == InputType::CompClustersRoot) {
       // TODO: need to check if we want to store the MC labels alongside with compressed clusters
       // for the moment reading of labels is disabled (last parameter is false)
       // TODO: make a different publisher spec for only one output spec, for now using the
@@ -248,8 +254,9 @@ framework::WorkflowSpec getWorkflow(CompletionPolicyData* policyData, std::vecto
   // output matrix
   // Note: the ClusterHardware format is probably a deprecated legacy format and also the
   // ClusterDecoderRawSpec
-  bool produceCompClusters = isEnabled(OutputType::CompClusters);
-  bool runGPUReco = (produceTracks || produceCompClusters || (isEnabled(OutputType::Clusters) && caClusterer) || inputType == InputType::CompClustersCTF) && inputType != InputType::CompClustersFlat;
+  bool produceCompClustersRoot = isEnabled(OutputType::CompClustersRoot);
+  bool produceCompClustersFlat = isEnabled(OutputType::CompClustersFlat);
+  bool runGPUReco = (produceTracks || produceCompClustersRoot || produceCompClustersFlat || (isEnabled(OutputType::Clusters) && caClusterer) || inputType == InputType::CompClustersFlat) && inputType != InputType::CompClustersFlatForEncode;
   bool runHWDecoder = !caClusterer && (runGPUReco || isEnabled(OutputType::Clusters));
   bool runClusterer = !caClusterer && (runHWDecoder || isEnabled(OutputType::ClustersHardware));
   bool zsDecoder = inputType == InputType::ZSRaw;
@@ -460,13 +467,13 @@ framework::WorkflowSpec getWorkflow(CompletionPolicyData* policyData, std::vecto
     cfg.enableMShape = sclOpts.enableMShapeCorrection;
     cfg.enableCTPLumi = sclOpts.requestCTPLumi;
     cfg.decompressTPC = decompressTPC;
-    cfg.decompressTPCFromROOT = decompressTPC && inputType == InputType::CompClusters;
+    cfg.decompressTPCFromROOT = decompressTPC && inputType == InputType::CompClustersRoot;
     cfg.caClusterer = caClusterer;
     cfg.zsDecoder = zsDecoder;
     cfg.zsOnTheFly = zsOnTheFly;
     cfg.outputTracks = produceTracks;
-    cfg.outputCompClusters = produceCompClusters;
-    cfg.outputCompClustersFlat = runClusterEncoder;
+    cfg.outputCompClustersRoot = produceCompClustersRoot;
+    cfg.outputCompClustersFlat = produceCompClustersFlat || runClusterEncoder;
     cfg.outputCAClusters = isEnabled(OutputType::Clusters) && (caClusterer || decompressTPC);
     cfg.outputQA = isEnabled(OutputType::QA);
     cfg.outputSharedClusterMap = (isEnabled(OutputType::Clusters) || inputType == InputType::Clusters) && isEnabled(OutputType::Tracks) && !isEnabled(OutputType::NoSharedClusterMap);
@@ -500,7 +507,7 @@ framework::WorkflowSpec getWorkflow(CompletionPolicyData* policyData, std::vecto
   //
   // selected by output type 'encoded-clusters'
   if (runClusterEncoder) {
-    specs.emplace_back(o2::tpc::getEntropyEncoderSpec(!runGPUReco && inputType != InputType::CompClustersFlat, selIR));
+    specs.emplace_back(o2::tpc::getEntropyEncoderSpec(!runGPUReco && inputType != InputType::CompClustersFlatForEncode, selIR));
   }
 
   //////////////////////////////////////////////////////////////////////////////////////////////
@@ -547,7 +554,7 @@ framework::WorkflowSpec getWorkflow(CompletionPolicyData* policyData, std::vecto
   // a writer process for compressed clusters container
   //
-  // selected by output type 'compressed-clusters'
+  // selected by output type 'compressed-clusters-root'
-  if (produceCompClusters && !isEnabled(OutputType::DisableWriter)) {
+  if (produceCompClustersRoot && !isEnabled(OutputType::DisableWriter)) {
     // defining the track writer process using the generic RootTreeWriter and generator tool
     //
     // defaults

@@ -57,8 +57,8 @@ void customize(std::vector<o2::framework::ConfigParamSpec>& workflowOptions)
   using namespace o2::framework;
 
   std::vector<ConfigParamSpec> options{
-    {"input-type", VariantType::String, "digits", {"digitizer, digits, zsraw, clustershw, clusters, compressed-clusters, compressed-clusters-ctf, pass-through"}},
-    {"output-type", VariantType::String, "tracks", {"digits, zsraw, clustershw, clusters, tracks, compressed-clusters, encoded-clusters, disable-writer, send-clusters-per-sector, qa, no-shared-cluster-map, tpc-triggers"}},
+    {"input-type", VariantType::String, "digits", {"digitizer, digits, zsraw, clustershw, clusters, compressed-clusters-root, compressed-clusters-flat, compressed-clusters-flat-for-encode, pass-through"}},
+    {"output-type", VariantType::String, "tracks", {"digits, zsraw, clustershw, clusters, tracks, compressed-clusters-root, compressed-clusters-flat, encoded-clusters, disable-writer, send-clusters-per-sector, qa, no-shared-cluster-map, tpc-triggers"}},
     {"disable-root-input", o2::framework::VariantType::Bool, false, {"disable root-files input reader"}},
     {"no-ca-clusterer", VariantType::Bool, false, {"Use HardwareClusterer instead of clusterer of GPUCATracking"}},
     {"disable-mc", VariantType::Bool, false, {"disable sending of MC information"}},
@@ -155,8 +155,6 @@ WorkflowSpec defineDataProcessing(ConfigContext const& cfgc)
     gDispatchTrigger = o2::framework::ConcreteDataTypeMatcher{"TPC", "DIGITS"};
   } else if (inputType == "clustershw") {
     gDispatchTrigger = o2::framework::ConcreteDataTypeMatcher{"TPC", "CLUSTERHW"};
-  } else if (inputType == "clustersnative") {
-    gDispatchTrigger = o2::framework::ConcreteDataTypeMatcher{"TPC", "CLUSTERNATIVE"};
   } else if (inputType == "zsraw") {
     gDispatchTrigger = o2::framework::ConcreteDataTypeMatcher{"TPC", "RAWDATA"};
   }

@@ -89,6 +89,7 @@ class GPUTPCCompression : public GPUProcessor
   void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);
   template <class T>
   GPUd() static void truncateSignificantBits(T& val, uint32_t nBits, uint32_t max);
+  GPUd() bool rejectCluster(int32_t idx, GPUParam& param, const GPUTrackingInOutPointers& ioPtrs);
 
   int16_t mMemoryResOutputHost = -1;
   int16_t mMemoryResOutputGPU = -1;

@@ -183,6 +183,31 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::opera
   return mClsPtr[a].qTot < mClsPtr[b].qTot;
 }
 
+GPUd() bool GPUTPCCompression::rejectCluster(int32_t idx, GPUParam& GPUrestrict() param, const GPUTrackingInOutPointers& GPUrestrict() ioPtrs)
+{ // Returns true if the cluster with global index idx must be dropped, false if it is to be kept.
+  if (mClusterStatus[idx]) { // non-zero status entry: reject (presumably the cluster was already handled elsewhere - TODO confirm)
+    return true;
+  }
+  int32_t attach = ioPtrs.mergedTrackHitAttachment[idx]; // attachment word of this cluster from the track merger
+  bool unattached = attach == 0;                         // a zero attachment word is treated as "not attached to any track"
+
+  if (unattached) {
+    if (param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) { // from strategy B upwards all unattached clusters are dropped
+      return true;
+    }
+  } else if (param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) { // attached clusters are only tested from strategy A upwards
+    if (GPUTPCClusterRejection::GetIsRejected(attach)) { // attachment flags themselves mark the cluster as rejected
+      return true;
+    }
+    int32_t id = attach & gputpcgmmergertypes::attachTrackMask; // mask out the flag bits to obtain the merged-track index
+    auto& trk = ioPtrs.mergedTracks[id];
+    if (CAMath::Abs(trk.GetParam().GetQPt() * param.qptB5Scaler) > param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) { // reject if scaled |q/pt| exceeds the configured cut or the track is flagged as merged looper
+      return true;
+    }
+  }
+  return false; // no rejection criterion matched
+}
+
 template <>
 GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
 {
@@ -208,33 +233,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
     const uint32_t nn = CAMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);
     for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {
       const int32_t idx = idOffset + i;
-      int32_t storeCluster = 0;
-      do {
-        if (i >= clusters->nClusters[iSector][iRow]) {
-          break;
-        }
-        if (compressor.mClusterStatus[idx]) {
-          break;
-        }
-        int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];
-        bool unattached = attach == 0;
-
-        if (unattached) {
-          if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {
-            break;
-          }
-        } else if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {
-          if (GPUTPCClusterRejection::GetIsRejected(attach)) {
-            break;
-          }
-          int32_t id = attach & gputpcgmmergertypes::attachTrackMask;
-          auto& trk = ioPtrs.mergedTracks[id];
-          if (CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) {
-            break;
-          }
-        }
-        storeCluster = 1;
-      } while (false);
+      int32_t storeCluster = i < clusters->nClusters[iSector][iRow] && !compressor.rejectCluster(idx, param, ioPtrs);
 
       GPUbarrier();
       int32_t myIndex = work_group_scan_inclusive_add(storeCluster);

@@ -245,6 +245,7 @@ struct GPUTrackingInOutPointers {
   uint32_t nOutputClusRefsTPCO2 = 0;
   const o2::MCCompLabel* outputTracksTPCO2MC = nullptr;
   const o2::tpc::CompressedClustersFlat* tpcCompressedClusters = nullptr;
+  const o2::tpc::ClusterNativeAccess* clustersNativeReduced = nullptr;
 
   // TPC links
   int32_t* tpcLinkITS = nullptr;

@@ -361,6 +361,7 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, uint32_t, 51000, "", 0, "Maximum n
 AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding")
 AddOption(tpcApplyCFCutsAtDecoding, bool, false, "", 0, "Apply cluster cuts from clusterization during decoding of compressed clusters")
 AddOption(tpcApplyClusterFilterOnCPU, uint8_t, 0, "", 0, "Apply custom cluster filter of GPUTPCClusterFilter class, 0: off, 1: debug, 2: PbPb23")
+AddOption(tpcWriteClustersAfterRejection, bool, false, "", 0, "Apply TPC rejection strategy before writing clusters")
 AddOption(oclPlatformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select, -2 query all platforms (also incompatible))")
 AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code")
 AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")

@@ -273,6 +273,10 @@ bool GPUChainTracking::ValidateSettings()
     GPUError("Clusterizer and merger Sanity checks only supported when not running on GPU");
     return false;
   }
+  if (GetProcessingSettings().tpcWriteClustersAfterRejection && (mRec->IsGPU() || param().rec.tpc.compressionTypeMask || !(GetRecoSteps() & GPUDataTypes::RecoStep::TPCCompression))) {
+    GPUError("tpcWriteClustersAfterRejection requires compressionTypeMask = 0, no GPU usage, and compression enabled");
+    return false;
+  }
   if (GetProcessingSettings().doublePipeline) {
     if (!GetRecoStepsOutputs().isOnlySet(GPUDataTypes::InOutType::TPCMergedTracks, GPUDataTypes::InOutType::TPCCompressedClusters, GPUDataTypes::InOutType::TPCClusters)) {
       GPUError("Invalid outputs for double pipeline mode 0x%x", (uint32_t)GetRecoStepsOutputs());
@@ -543,6 +547,10 @@ void GPUChainTracking::ClearIOPointers()
   std::memset((void*)&mIOPtrs, 0, sizeof(mIOPtrs));
   mIOMem.~InOutMemory();
   new (&mIOMem) InOutMemory;
+  mClusterNativeAccessReduced.reset(nullptr);
+  if (mClusterNativeAccess.get()) {
+    memset((void*)mClusterNativeAccess.get(), 0, sizeof(*mClusterNativeAccess));
+  }
 }
 
 void GPUChainTracking::AllocateIOMemory()

@@ -265,7 +265,7 @@ class GPUChainTracking : public GPUChain
   std::unique_ptr<o2::trd::GeometryFlat> mTRDGeometryU;              // TRD Geometry
 
   // Ptrs to internal buffers
-  std::unique_ptr<o2::tpc::ClusterNativeAccess> mClusterNativeAccess;
+  std::unique_ptr<o2::tpc::ClusterNativeAccess> mClusterNativeAccess, mClusterNativeAccessReduced;
   std::array<GPUOutputControl*, GPUTrackingOutputs::count()> mSubOutputControls = {nullptr};
   std::unique_ptr<GPUTriggerOutputs> mTriggerBuffer;
 
@@ -305,6 +305,7 @@ class GPUChainTracking : public GPUChain
   void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
   bool NeedTPCClustersOnGPU();
+  void WriteReducedClusters();
   template <int32_t I>
   int32_t RunTRDTrackingInternal();
   uint32_t StreamForSector(uint32_t sector) const;