davidrohr · alibuild · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
@@ -270,15 +270,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
   }
   if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
     mProcessingSettings.nTPCClustererLanes = 1;
-    if (mProcessingSettings.trackletConstructorInPipeline < 0) {
-      mProcessingSettings.trackletConstructorInPipeline = 1;
-    }
-    if (mProcessingSettings.trackletSelectorInPipeline < 0) {
-      mProcessingSettings.trackletSelectorInPipeline = 1;
-    }
-    if (mProcessingSettings.trackletSelectorSectors < 0) {
-      mProcessingSettings.trackletSelectorSectors = 1;
-    }
   }
   if (mProcessingSettings.createO2Output > 1 && mProcessingSettings.runQA && mProcessingSettings.qcRunFraction == 100.f) {
     mProcessingSettings.createO2Output = 1;
@@ -296,9 +287,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
 
   UpdateAutomaticProcessingSettings();
   GPUCA_GPUReconstructionUpdateDefaults();
-  if (!mProcessingSettings.trackletConstructorInPipeline) {
-    mProcessingSettings.trackletSelectorInPipeline = false;
-  }
   if (!mProcessingSettings.rtc.enable) {
     mProcessingSettings.rtc.optConstexpr = false;
   }

@@ -31,7 +31,6 @@
 #include "GPUOutputControl.h"
 #include "GPUMemoryResource.h"
 #include "GPUConstantMem.h"
-#include "GPUTPCSectorOutput.h"
 #include "GPULogging.h"
 
 namespace o2::its

@@ -18,7 +18,6 @@
 #include "GPUChain.h"
 
 #include "GPUTPCClusterData.h"
-#include "GPUTPCSectorOutput.h"
 #include "GPUTPCSectorOutCluster.h"
 #include "GPUTPCGMMergedTrack.h"
 #include "GPUTPCGMMergedTrackHit.h"
@@ -66,28 +65,25 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
   if (x.nThreads != 1) {
     throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
   }
-  uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
-  for (uint32_t k = 0; k < num; k++) {
-    int32_t nThreads = getNKernelHostThreads(false);
-    if (nThreads > 1) {
-      if (mProcessingSettings.debugLevel >= 5) {
-        printf("Running %d Threads\n", nThreads);
-      }
-      tbb::this_task_arena::isolate([&] {
-        mThreading->activeThreads->execute([&] {
-          tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
-            typename T::GPUSharedMemory smem;
-            for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
-              T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-            }
-          });
+  int32_t nThreads = getNKernelHostThreads(false);
+  if (nThreads > 1) {
+    if (mProcessingSettings.debugLevel >= 5) {
+      printf("Running %d Threads\n", nThreads);
+    }
+    tbb::this_task_arena::isolate([&] {
+      mThreading->activeThreads->execute([&] {
+        tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
+          typename T::GPUSharedMemory smem;
+          for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
+            T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
+          }
         });
       });
-    } else {
-      for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
-        typename T::GPUSharedMemory smem;
-        T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-      }
+    });
+  } else {
+    for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
+      typename T::GPUSharedMemory smem;
+      T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
     }
   }
 }

@@ -49,7 +49,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
 
  public:
   ~GPUReconstructionCPU() override;
-  static constexpr krnlRunRange krnlRunRangeNone{0, -1};
+  static constexpr krnlRunRange krnlRunRangeNone{0};
   static constexpr krnlEvent krnlEventNone = krnlEvent{nullptr, nullptr, 0};
 
   template <class S, int32_t I = 0, typename... Args>
@@ -77,7 +77,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
 
   GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) {}
 
-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types)                                                                                                                     \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types)                                                                                                                   \
   inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument<GPUCA_M_KRNL_TEMPLATE(x_class)>, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
   {                                                                                                                                                                                          \
     if (cpuFallback) {                                                                                                                                                                       \
@@ -161,7 +161,8 @@ inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
     throw std::runtime_error("GPUCA_MAX_THREADS exceeded");
   }
   if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Running kernel %s (Stream %d, Range %d/%d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.start, setup.y.num, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
+    GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)"
+                                                                                                                                                                                          : mDeviceName.c_str());
   }
   if (nThreads == 0 || nBlocks == 0) {
     return;

@@ -16,7 +16,6 @@
 #include "GPUReconstructionIncludes.h"
 
 #include "GPUTPCTracker.h"
-#include "GPUTPCSectorOutput.h"
 
 using namespace o2::gpu;
 

@@ -30,15 +30,6 @@
 #include <fstream>
 
 #define GPUCA_GPUReconstructionUpdateDefaults()                                             \
-  if (mProcessingSettings.trackletConstructorInPipeline < 0) {                              \
-    mProcessingSettings.trackletConstructorInPipeline = GPUCA_CONSTRUCTOR_IN_PIPELINE;      \
-  }                                                                                         \
-  if (mProcessingSettings.trackletSelectorInPipeline < 0) {                                 \
-    mProcessingSettings.trackletSelectorInPipeline = GPUCA_SELECTOR_IN_PIPELINE;            \
-  }                                                                                         \
-  if (mProcessingSettings.trackletSelectorSectors < 0) {                                    \
-    mProcessingSettings.trackletSelectorSectors = GPUCA_TRACKLET_SELECTOR_SECTOR_COUNT;     \
-  }                                                                                         \
   if (mProcessingSettings.alternateBorderSort < 0) {                                        \
     mProcessingSettings.alternateBorderSort = GPUCA_ALTERNATE_BORDER_SORT;                  \
   }                                                                                         \

@@ -35,113 +35,59 @@
 #ifndef GPUCA_KRNL_REG
 #define GPUCA_KRNL_REG(...)
 #endif
-#define GPUCA_KRNL_REG_INTERNAL_PROP(...) GPUCA_M_STRIP(__VA_ARGS__)
 #ifndef GPUCA_KRNL_CUSTOM
 #define GPUCA_KRNL_CUSTOM(...)
 #endif
-#define GPUCA_KRNL_CUSTOM_INTERNAL_PROP(...)
-#define GPUCA_ATTRRES_REG(XX, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, XX))(num) GPUCA_ATTRRES2(XX, __VA_ARGS__)
-#define GPUCA_ATTRRES2_REG(XX, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, XX))(num) GPUCA_ATTRRES3(XX, __VA_ARGS__)
-#define GPUCA_ATTRRES_CUSTOM(XX, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, XX))(args) GPUCA_ATTRRES2(XX, __VA_ARGS__)
-#define GPUCA_ATTRRES2_CUSTOM(XX, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, XX))(args) GPUCA_ATTRRES3(XX, __VA_ARGS__)
-#define GPUCA_ATTRRES_NONE(XX, ...)
-#define GPUCA_ATTRRES2_NONE(XX, ...)
-#define GPUCA_ATTRRES_(XX, ...)
-#define GPUCA_ATTRRES2_(XX, ...)
-#define GPUCA_ATTRRES3(XX) // 3 attributes not supported
-#define GPUCA_ATTRRES2(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES2_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
-#define GPUCA_ATTRRES(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
-// GPU Kernel entry point for single sector
-#define GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t iSector_internal GPUCA_M_STRIP(x_arguments))
-#ifdef GPUCA_KRNL_DEFONLY
-#define GPUCA_KRNLGPU_SINGLE(...) GPUCA_KRNLGPU_SINGLE_DEF(__VA_ARGS__);
-#else
-#define GPUCA_KRNLGPU_SINGLE(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
-  { \
-    GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
-    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[iSector_internal] GPUCA_M_STRIP(x_forward)); \
-  }
-#endif
+#define GPUCA_KRNL_REG_EXTRREG(...) GPUCA_M_STRIP(__VA_ARGS__)
+#define GPUCA_KRNL_CUSTOM_EXTRREG(MODE, ...) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__)
+#define GPUCA_KRNL_NONE_EXTRREG(MODE, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_REG(MODE, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, MODE))(num) GPUCA_ATTRRES_XREG (MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_CUSTOM(MODE, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, MODE))(args) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_NONE(MODE, none, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_(MODE, ...)
+#define GPUCA_ATTRRES_XNONE(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_XCUSTOM(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_XREG(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
+
+// GPU Kernel entry point
+#define GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, ...) \
+  GPUg() void GPUCA_ATTRRES(, GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments))
 
-// GPU Kernel entry point for multiple sector
-#define GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)(GPUCA_CONSMEM_PTR int32_t firstSector, int32_t nSectorCount GPUCA_M_STRIP(x_arguments))
 #ifdef GPUCA_KRNL_DEFONLY
-#define GPUCA_KRNLGPU_MULTI(...) GPUCA_KRNLGPU_MULTI_DEF(__VA_ARGS__);
+#define GPUCA_KRNLGPU(...) GPUCA_KRNLGPU_DEF(__VA_ARGS__);
 #else
-#define GPUCA_KRNLGPU_MULTI(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
+#define GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, ...) \
+  GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
   { \
-    const int32_t iSector_internal = nSectorCount * (get_group_id(0) + (get_num_groups(0) % nSectorCount != 0 && nSectorCount * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); \
-    const int32_t nSectorBlockOffset = get_num_groups(0) * iSector_internal / nSectorCount; \
-    const int32_t sectorBlockId = get_group_id(0) - nSectorBlockOffset; \
-    const int32_t sectorGridDim = get_num_groups(0) * (iSector_internal + 1) / nSectorCount - get_num_groups(0) * (iSector_internal) / nSectorCount; \
     GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
-    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(sectorGridDim, get_local_size(0), sectorBlockId, get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[firstSector + iSector_internal] GPUCA_M_STRIP(x_forward)); \
+    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[_iSector_internal] GPUCA_M_STRIP(x_forward)); \
   }
 #endif
 
-// GPU Host wrapper pre- and post-parts
-#define GPUCA_KRNL_PRE(x_class, ...) \
+// GPU Host wrappers for kernel
+#define GPUCA_KRNL_HOST(x_class, ...) \
+  GPUCA_KRNLGPU(x_class, __VA_ARGS__) \
   template <> class GPUCA_KRNL_BACKEND_CLASS::backendInternal<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
    public: \
     template <typename T, typename... Args> \
     static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \
     { \
       auto& x = _xyz.x; \
-      auto& y = _xyz.y;
-
-#define GPUCA_KRNL_POST() \
+      auto& y = _xyz.y; \
+      GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \
     } \
   };
 
-// GPU Host wrappers for single kernel, multi-sector, or auto-detection
-#define GPUCA_KRNL_single(...) \
-  GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  if (y.num > 1) { \
-    throw std::runtime_error("Kernel called with invalid number of sectors"); \
-  } else { \
-    GPUCA_KRNL_CALL_single(__VA_ARGS__) \
-  } \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_multi(...) \
-  GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_(...) GPUCA_KRNL_single(__VA_ARGS__)
-#define GPUCA_KRNL_simple(...) GPUCA_KRNL_single(__VA_ARGS__)
-#define GPUCA_KRNL_both(...) \
-  GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
-  GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  if (y.num <= 1) { \
-    GPUCA_KRNL_CALL_single(__VA_ARGS__) \
-  } else { \
-    GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
-  } \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_LOAD_(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
-#define GPUCA_KRNL_LOAD_simple(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
-#define GPUCA_KRNL_LOAD_both(...) \
-  GPUCA_KRNL_LOAD_single(__VA_ARGS__) \
-  GPUCA_KRNL_LOAD_multi(__VA_ARGS__)
-
 #define GPUCA_KRNL_PROP(x_class, x_attributes) \
   template <> gpu_reconstruction_kernels::krnlProperties GPUCA_KRNL_BACKEND_CLASS::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>() { \
-    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_INTERNAL_PROP,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes)))}; \
+    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_EXTRREG, GPUCA_M_STRIP(x_attributes))}; \
     return ret.nThreads > 0 ? ret : gpu_reconstruction_kernels::krnlProperties{(int32_t)mThreadCount}; \
   }
 
-// Generate GPU kernel and host wrapper
-#define GPUCA_KRNL_WRAP(x_func, x_class, x_attributes, ...) GPUCA_M_CAT(x_func, GPUCA_M_STRIP_FIRST(x_attributes))(x_class, x_attributes, __VA_ARGS__)
 #endif // GPUCA_GPUCODE
 
-#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (GPUCA_M_STRIP(x_attributes), REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class)))), __VA_ARGS__)
+#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)
 
 #endif // O2_GPU_GPURECONSTRUCTIONKERNELMACROS_H
 // clang-format on
@@ -41,11 +41,8 @@ struct krnlExec {
 };
 struct krnlRunRange {
   constexpr krnlRunRange() = default;
-  constexpr krnlRunRange(uint32_t a) : start(a), num(0) {}
-  constexpr krnlRunRange(uint32_t s, int32_t n) : start(s), num(n) {}
-
-  uint32_t start = 0;
-  int32_t num = 0;
+  constexpr krnlRunRange(uint32_t v) : index(v) {}
+  uint32_t index = 0;
 };
 struct krnlEvent {
   constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {}
@@ -63,7 +60,7 @@ struct krnlProperties {
 };
 
 struct krnlSetup {
-  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0, -1}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
+  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
   krnlExec x;
   krnlRunRange y;
   krnlEvent z;
@@ -98,7 +95,7 @@ class GPUReconstructionKernels : public T
   template <class S, int32_t I = 0, typename... Args>
   using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs<S, I, Args...>;
 
-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types)                                                                                \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types)                                                                              \
   virtual void runKernelImpl(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args)                                          \
   {                                                                                                                                                     \
     T::template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args);                                                                                 \