From c9c881887de08e577102288a589e042e0416a6b8 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Thu, 14 Aug 2025 13:46:19 +0200 Subject: [PATCH] ITS: GPU: prepare for lazy loading of data Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 79 ++- .../GPU/ITStrackingGPU/TrackerTraitsGPU.h | 2 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 55 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 150 +++++- .../ITS/tracking/GPU/cuda/CMakeLists.txt | 48 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 470 ++++++++++-------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 151 +++--- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 445 +++++------------ 8 files changed, 708 insertions(+), 692 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 5c10b01412b4e..27b987fb9a84a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -13,13 +13,14 @@ #ifndef TRACKINGITSGPU_INCLUDE_TIMEFRAMEGPU_H #define TRACKINGITSGPU_INCLUDE_TIMEFRAMEGPU_H +#include +#include + #include "ITStracking/BoundedAllocator.h" #include "ITStracking/TimeFrame.h" #include "ITStracking/Configuration.h" #include "ITStrackingGPU/Utils.h" -#include - namespace o2::its::gpu { @@ -28,7 +29,7 @@ class TimeFrameGPU : public TimeFrame { public: TimeFrameGPU(); - ~TimeFrameGPU(); + ~TimeFrameGPU() = default; /// Most relevant operations void registerHostMemory(const int); @@ -37,18 +38,25 @@ class TimeFrameGPU : public TimeFrame void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); void loadIndexTableUtils(const int); - void loadTrackingFrameInfoDevice(const int); - void loadUnsortedClustersDevice(const int); - void loadClustersDevice(const int); - void loadClustersIndexTables(const int iteration); - void createUsedClustersDevice(const int); + void loadTrackingFrameInfoDevice(const int, const int); + void createTrackingFrameInfoDeviceArray(const int); + void loadUnsortedClustersDevice(const int, const int); + void createUnsortedClustersDeviceArray(const int); + void loadClustersDevice(const int, const int); + void createClustersDeviceArray(const int); + void loadClustersIndexTables(const int, const int); + void createClustersIndexTablesArray(const int iteration); + void createUsedClustersDevice(const int, const int); + void createUsedClustersDeviceArray(const int); void loadUsedClustersDevice(); - void loadROframeClustersDevice(const int); + void loadROFrameClustersDevice(const int, const int); + void createROFrameClustersDeviceArray(const int); void loadMultiplicityCutMask(const int); void loadVertices(const int); /// - void createTrackletsLUTDevice(const int); + void createTrackletsLUTDevice(const int, const int); + void createTrackletsLUTDeviceArray(const int); void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); @@ -57,11 +65,14 @@ class TimeFrameGPU : public TimeFrame void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(bounded_vector&); - void createTrackletsBuffers(); + void createTrackletsBuffers(const int); + void createTrackletsBuffersArray(const int); void createCellsBuffers(const int); + void createCellsBuffersArray(const int); void createCellsDevice(); - void createCellsLUTDevice(); - void createNeighboursIndexTablesDevice(); + void 
createCellsLUTDevice(const int); + void createCellsLUTDeviceArray(const int); + void createNeighboursIndexTablesDevice(const int); void createNeighboursDevice(const unsigned int layer); void createNeighboursLUTDevice(const int, const unsigned int); void createTrackITSExtDevice(bounded_vector&); @@ -70,10 +81,17 @@ class TimeFrameGPU : public TimeFrame void downloadNeighboursLUTDevice(bounded_vector&, const int); void downloadCellsDevice(); void downloadCellsLUTDevice(); + + /// synchronization auto& getStream(const size_t stream) { return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } void syncStream(const size_t stream); - void syncStreams(); + void syncStreams(const bool = true); + void waitEvent(const int, const int); + void recordEvent(const int); + void recordEvents(const int = 0, const int = nLayers); + + /// cleanup virtual void wipe() final; /// interface @@ -102,19 +120,19 @@ class TimeFrameGPU : public TimeFrame const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } std::vector getClusterSizes(); const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } - const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } - Tracklet** getDeviceArrayTracklets() { return mTrackletsDevice.data(); } + const int** getDeviceROFrameClusters() const { return mROFramesClustersDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } - CellSeed** getDeviceArrayCells() { return mCellsDevice.data(); } + CellSeed** getDeviceArrayCells() { return mCellsDeviceArray; } CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; } o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; } - void setDevicePropagator(const o2::base::PropagatorImpl*) override; + void setDevicePropagator(const o2::base::PropagatorImpl* p) final { this->mPropagatorDevice = p; } // Host-specific getters gsl::span getNTracklets() { return mNTracklets; } @@ -126,7 +144,7 @@ class TimeFrameGPU : public TimeFrame // Host-available device getters gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } - gsl::span getDeviceTracklet() { return mTrackletsDevice; } + gsl::span getDeviceTracklets() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } // Overridden getters @@ -137,7 +155,6 @@ class TimeFrameGPU : public TimeFrame private: void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations on specific stream void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream - bool mHostRegistered = false; TimeFrameGPUParameters mGpuParams; // Host-available device buffer sizes @@ -161,19 +178,21 @@ class TimeFrameGPU : public TimeFrame const Cluster** mUnsortedClustersDeviceArray; const int** mClustersIndexTablesDeviceArray; const unsigned char** mUsedClustersDeviceArray; - const int** mROFrameClustersDeviceArray; + 
const int** mROFramesClustersDeviceArray; std::array mTrackletsDevice; std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; - int** mCellsLUTDeviceArray; - int** mNeighboursCellDeviceArray; - int** mNeighboursCellLUTDeviceArray; - int** mTrackletsLUTDeviceArray; + Tracklet** mTrackletsDeviceArray{nullptr}; + int** mCellsLUTDeviceArray{nullptr}; + int** mNeighboursCellDeviceArray{nullptr}; + int** mNeighboursCellLUTDeviceArray{nullptr}; + int** mTrackletsLUTDeviceArray{nullptr}; std::array mCellsDevice; - std::array mNeighboursIndexTablesDevice; - CellSeed* mTrackSeedsDevice; + CellSeed** mCellsDeviceArray; + std::array mNeighboursIndexTablesDevice; + CellSeed* mTrackSeedsDevice{nullptr}; std::array mCellSeedsDevice; o2::track::TrackParCovF** mCellSeedsDeviceArray; std::array mCellSeedsChi2Device; @@ -188,6 +207,12 @@ class TimeFrameGPU : public TimeFrame // State Streams mGpuStreams; + std::bitset mPinnedUnsortedClusters{0}; + std::bitset mPinnedClusters{0}; + std::bitset mPinnedClustersIndexTables{0}; + std::bitset mPinnedUsedClusters{0}; + std::bitset mPinnedROFramesClusters{0}; + std::bitset mPinnedTrackingFrameInfo{0}; // Temporary buffer for storing output tracks from GPU tracking bounded_vector mTrackITSExt; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h index 1654f8cc8cf94..d5c3e8ac74925 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h @@ -24,7 +24,7 @@ class TrackerTraitsGPU final : public TrackerTraits { public: TrackerTraitsGPU() = default; - ~TrackerTraitsGPU() override = default; + ~TrackerTraitsGPU() final = default; void adoptTimeFrame(TimeFrame* tf) final; void initialiseTimeFrame(const int iteration) final; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index a7bf4c70bc5c2..23b23d2b3f3ab 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -13,6 +13,10 @@ #ifndef ITSTRACKINGGPU_TRACKINGKERNELS_H_ #define ITSTRACKINGGPU_TRACKINGKERNELS_H_ +#include + +#include "ITStracking/BoundedAllocator.h" +#include "ITStrackingGPU/Utils.h" #include "DetectorsBase/Propagator.h" #include "GPUCommonDef.h" @@ -25,43 +29,43 @@ namespace gpu #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler -GPUdi() int4 getEmptyBinsRect() +GPUdii() int4 getEmptyBinsRect() { return int4{0, 0, 0, 0}; } -GPUd() bool fitTrack(TrackITSExt& track, - int start, - int end, - int step, - float chi2clcut, - float chi2ndfcut, - float maxQoverPt, - int nCl, - float Bz, - TrackingFrameInfo** tfInfos, - const o2::base::Propagator* prop, - o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorImpl::MatCorrType::USEMatCorrNONE); +GPUdii() bool fitTrack(TrackITSExt& track, + int start, + int end, + int step, + float chi2clcut, + float chi2ndfcut, + float maxQoverPt, + int nCl, + float Bz, + TrackingFrameInfo** tfInfos, + const o2::base::Propagator* prop, + o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorImpl::MatCorrType::USEMatCorrNONE); template -GPUg() void fitTrackSeedsKernel( - CellSeed* trackSeeds, - const TrackingFrameInfo** foundTrackingFrameInfo, - o2::its::TrackITSExt* tracks, - const float* minPts, 
- const unsigned int nSeeds, - const float Bz, - const int startLevel, - float maxChi2ClusterAttachment, - float maxChi2NDF, - const o2::base::Propagator* propagator, - const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT); +GPUg() void fitTrackSeedsKernel(CellSeed* trackSeeds, + const TrackingFrameInfo** foundTrackingFrameInfo, + o2::its::TrackITSExt* tracks, + const float* minPts, + const unsigned int nSeeds, + const float Bz, + const int startLevel, + float maxChi2ClusterAttachment, + float maxChi2NDF, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT); #endif } // namespace gpu template void countTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -94,6 +98,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, template void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index cd860c47ebd9c..15fe6f05f7850 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -17,9 +17,12 @@ #define ITSTRACKINGGPU_UTILS_H_ #include +#include +#include #include "GPUCommonDef.h" #include "GPUCommonHelpers.h" +#include "GPUCommonLogger.h" #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda @@ -27,19 +30,21 @@ #define THRUST_NAMESPACE thrust::hip #endif +#ifdef ITS_GPU_LOG +#define GPULog(...) LOGP(info, __VA_ARGS__) +#else +#define GPULog(...) +#endif + namespace o2::its { template -struct gpuPair { - T1 first; - T2 second; -}; +using gpuPair = std::pair; namespace gpu { -// Poor man implementation of a span-like struct. It is very limited. 
template struct gpuSpan { using value_type = T; @@ -83,11 +88,6 @@ struct gpuSpan { unsigned int _size; }; -enum class Task { - Tracker = 0, - Vertexer = 1 -}; - // Abstract stream class class Stream { @@ -96,22 +96,27 @@ class Stream using Handle = hipStream_t; static constexpr Handle DefaultStream = 0; static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; + using Event = hipEvent_t; #elif defined(__CUDACC__) using Handle = cudaStream_t; static constexpr Handle DefaultStream = 0; static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; + using Event = cudaEvent_t; #else using Handle = void*; static constexpr Handle DefaultStream = nullptr; static constexpr unsigned int DefaultFlag = 0; + using Event = void*; #endif Stream(unsigned int flags = DefaultFlag) { #if defined(__HIPCC__) GPUChkErrS(hipStreamCreateWithFlags(&mHandle, flags)); + GPUChkErrS(hipEventCreateWithFlags(&mEvent, hipEventDisableTiming)); #elif defined(__CUDACC__) GPUChkErrS(cudaStreamCreateWithFlags(&mHandle, flags)); + GPUChkErrS(cudaEventCreateWithFlags(&mEvent, cudaEventDisableTiming)); #endif } @@ -121,49 +126,162 @@ class Stream if (mHandle != DefaultStream) { #if defined(__HIPCC__) GPUChkErrS(hipStreamDestroy(mHandle)); + GPUChkErrS(hipEventDestroy(mEvent)); #elif defined(__CUDACC__) GPUChkErrS(cudaStreamDestroy(mHandle)); + GPUChkErrS(cudaEventDestroy(mEvent)); #endif } } operator bool() const { return mHandle != DefaultStream; } const Handle& get() { return mHandle; } + const Handle& getStream() { return mHandle; } + const Event& getEvent() { return mEvent; } void sync() const { #if defined(__HIPCC__) GPUChkErrS(hipStreamSynchronize(mHandle)); #elif defined(__CUDACC__) GPUChkErrS(cudaStreamSynchronize(mHandle)); +#endif + } + void record() + { +#if defined(__HIPCC__) + GPUChkErrS(hipEventRecord(mEvent, mHandle)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaEventRecord(mEvent, mHandle)); #endif } private: Handle mHandle{DefaultStream}; + Event mEvent{nullptr}; }; -static_assert(sizeof(Stream) == sizeof(void*), "Stream type must match pointer type!"); // Abstract vector for streams. -// Handles specifically wrap around. 
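// Indexing no longer wraps around, so callers must keep i < size().
// Each stream now owns one event, which enables cross-stream ordering during the
// lazy per-layer loads; a minimal usage sketch ('consumer' is an illustrative index):
//   streams[layer].record();             // mark the end of this layer's H2D copies
//   streams.waitEvent(consumer, layer);  // stream 'consumer' waits on that event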
class Streams { public: size_t size() const noexcept { return mStreams.size(); } void resize(size_t n) { mStreams.resize(n); } void clear() { mStreams.clear(); } - auto& operator[](size_t i) { return mStreams[i % mStreams.size()]; } + auto& operator[](size_t i) { return mStreams[i]; } void push_back(const Stream& stream) { mStreams.push_back(stream); } - void sync() + void sync(bool device = true) { - for (auto& s : mStreams) { - s.sync(); + if (device) { +#if defined(__HIPCC__) + GPUChkErrS(hipDeviceSynchronize()); +#elif defined(__CUDACC__) + GPUChkErrS(cudaDeviceSynchronize()); +#endif + } else { + for (auto& s : mStreams) { + s.sync(); + } } } + void waitEvent(size_t iStream, size_t iEvent) + { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamWaitEvent(mStreams[iStream].get(), mStreams[iEvent].getEvent())); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamWaitEvent(mStreams[iStream].get(), mStreams[iEvent].getEvent())); +#endif + } private: std::vector mStreams; }; +#ifdef ITS_MEASURE_GPU_TIME +class GPUTimer +{ + public: + GPUTimer(const std::string& name) + : mName(name) + { + mStreams.emplace_back(Stream::DefaultStream); + startTimers(); + } + GPUTimer(Streams& streams, const std::string& name) + : mName(name) + { + for (size_t i{0}; i < streams.size(); ++i) { + mStreams.push_back(streams[i].get()); + } + startTimers(); + } + GPUTimer(Streams& streams, const std::string& name, size_t end, size_t start = 0) + : mName(name) + { + for (size_t sta{start}; sta < end; ++sta) { + mStreams.push_back(streams[sta].get()); + } + startTimers(); + } + GPUTimer(Stream& stream, const std::string& name, const int id = 0) + : mName(name) + { + mStreams.push_back(stream.get()); + mName += ":id" + std::to_string(id); + startTimers(); + } + ~GPUTimer() + { + for (size_t i{0}; i < mStreams.size(); ++i) { + float ms = 0.0f; +#if defined(__HIPCC__) + GPUChkErrS(hipEventRecord(mStops[i], mStreams[i])); + GPUChkErrS(hipEventSynchronize(mStops[i])); + GPUChkErrS(hipEventElapsedTime(&ms, mStarts[i], mStops[i])); + GPUChkErrS(hipEventDestroy(mStarts[i])); + GPUChkErrS(hipEventDestroy(mStops[i])); +#elif defined(__CUDACC__) + GPUChkErrS(cudaEventRecord(mStops[i], mStreams[i])); + GPUChkErrS(cudaEventSynchronize(mStops[i])); + GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[i], mStops[i])); + GPUChkErrS(cudaEventDestroy(mStarts[i])); + GPUChkErrS(cudaEventDestroy(mStops[i])); +#endif + LOGP(info, "Elapsed time for {}:{} {} ms", mName, i, ms); + } + } + + void startTimers() + { + mStarts.resize(mStreams.size()); + mStops.resize(mStreams.size()); + for (size_t i{0}; i < mStreams.size(); ++i) { +#if defined(__HIPCC__) + GPUChkErrS(hipEventCreate(&mStarts[i])); + GPUChkErrS(hipEventCreate(&mStops[i])); + GPUChkErrS(hipEventRecord(mStarts[i], mStreams[i])); +#elif defined(__CUDACC__) + GPUChkErrS(cudaEventCreate(&mStarts[i])); + GPUChkErrS(cudaEventCreate(&mStops[i])); + GPUChkErrS(cudaEventRecord(mStarts[i], mStreams[i])); +#endif + } + } + + private: + std::string mName; + std::vector mStarts, mStops; + std::vector mStreams; +}; +#else // ITS_MEASURE_GPU_TIME not defined +class GPUTimer +{ + public: + template + GPUTimer(Args&&...) 
+ { + } +}; +#endif } // namespace gpu } // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 9769930504f29..1f6a046a81350 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -11,29 +11,29 @@ # CUDA if(CUDA_ENABLED) -find_package(CUDAToolkit) -message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) -# add_compile_definitions(ITS_MEASURE_GPU_TIME) -o2_add_library(ITStrackingCUDA - SOURCES ClusterLinesGPU.cu - TrackerTraitsGPU.cxx - TimeFrameGPU.cu - TracerGPU.cu - TrackingKernels.cu - VertexingKernels.cu - VertexerTraitsGPU.cxx - PUBLIC_INCLUDE_DIRECTORIES ../ - PUBLIC_LINK_LIBRARIES O2::ITStracking - O2::SimConfig - O2::SimulationDataFormat - O2::ReconstructionDataFormats - O2::GPUCommon - PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider - TARGETVARNAME targetName) - -set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -target_compile_definitions(${targetName} PRIVATE $) -set_target_cuda_arch(${targetName}) + find_package(CUDAToolkit) + message(STATUS "Building ITS CUDA tracker") + # add_compile_options(-O0 -g -lineinfo -fPIC -DGPU_FORCE_DEVICE_ASSERTS=ON) + # add_compile_definitions(ITS_MEASURE_GPU_TIME) + # add_compile_definitions(ITS_GPU_LOG) + o2_add_library(ITStrackingCUDA + SOURCES ClusterLinesGPU.cu + TrackerTraitsGPU.cxx + TimeFrameGPU.cu + TracerGPU.cu + TrackingKernels.cu + VertexingKernels.cu + VertexerTraitsGPU.cxx + PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_LINK_LIBRARIES O2::ITStracking + O2::SimConfig + O2::SimulationDataFormat + O2::ReconstructionDataFormats + O2::GPUCommon + PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider + TARGETVARNAME targetName) + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions(${targetName} PRIVATE $) + set_target_cuda_arch(${targetName}) endif() diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 72a1f98d1b78b..4f3b52d56a793 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -9,20 +9,16 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
/// + #include -#include -#include +#include +#include + +#include "ITStrackingGPU/TimeFrameGPU.h" #include "ITStracking/Constants.h" #include "ITStracking/BoundedAllocator.h" - #include "ITStrackingGPU/Utils.h" -#include "ITStrackingGPU/TimeFrameGPU.h" -#include "ITStrackingGPU/TracerGPU.h" - -#include -#include -#include #include "GPUCommonDef.h" #include "GPUCommonMath.h" @@ -32,85 +28,12 @@ namespace o2::its::gpu { -#ifdef ITS_MEASURE_GPU_TIME -class GPUTimer -{ - public: - GPUTimer(Streams& streams, const std::string& name) - : mName(name) - { - for (size_t i{0}; i < streams.size(); ++i) { - mStreams.push_back(streams[i].get()); - } - startTimers(); - } - GPUTimer(Streams& streams, const std::string& name, size_t end, size_t start = 0) - : mName(name) - { - for (size_t sta{start}; sta < end; ++sta) { - mStreams.push_back(streams[sta].get()); - } - startTimers(); - } - GPUTimer(Stream& stream, const std::string& name) - : mName(name) - { - mStreams.push_back(stream.get()); - startTimers(); - } - ~GPUTimer() - { - for (size_t i{0}; i < mStreams.size(); ++i) { - GPUChkErrS(cudaEventRecord(mStops[i], mStreams[i])); - GPUChkErrS(cudaEventSynchronize(mStops[i])); - float ms = 0.0f; - GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[i], mStops[i])); - LOGP(info, "Elapsed time for {}:{} {} ms", mName, i, ms); - GPUChkErrS(cudaEventDestroy(mStarts[i])); - GPUChkErrS(cudaEventDestroy(mStops[i])); - } - } - - void startTimers() - { - mStarts.resize(mStreams.size()); - mStops.resize(mStreams.size()); - for (size_t i{0}; i < mStreams.size(); ++i) { - GPUChkErrS(cudaEventCreate(&mStarts[i])); - GPUChkErrS(cudaEventCreate(&mStops[i])); - GPUChkErrS(cudaEventRecord(mStarts[i], mStreams[i])); - } - } - - private: - std::string mName; - std::vector mStarts, mStops; - std::vector mStreams; -}; - -#define GPULog(...) LOGP(info, __VA_ARGS__) -#else // ITS_MEASURE_GPU_TIME not defined -class GPUTimer -{ - public: - template - GPUTimer(Args&&...) - { - } -}; - -#define GPULog(...) 
-#endif - template TimeFrameGPU::TimeFrameGPU() { this->mIsGPU = true; } -template -TimeFrameGPU::~TimeFrameGPU() = default; - template void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator) { @@ -133,16 +56,10 @@ void TimeFrameGPU::allocMem(void** ptr, size_t size, bool extAllocator) } } -template -void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl* propagator) -{ - this->mPropagatorDevice = propagator; -} - template void TimeFrameGPU::loadIndexTableUtils(const int iteration) { - GPUTimer timer(mGpuStreams[0], "loading indextable utils"); + GPUTimer timer("loading indextable utils"); if (!iteration) { GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB); allocMem(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), this->getExtAllocator()); @@ -152,114 +69,174 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) } template -void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) +void TimeFrameGPU::createUnsortedClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading unsorted clusters"); - for (int iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); - } - mGpuStreams.sync(); + GPUTimer timer("creating unsorted clusters array"); allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpy(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); + mPinnedUnsortedClusters.set(nLayers); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + mPinnedUnsortedClusters.set(iLayer); + } + } +} + +template +void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading unsorted clusters", layer); + GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[layer].size(), layer, this->mUnsortedClusters[layer].size() * sizeof(Cluster) / constants::MB); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[layer]), this->mUnsortedClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[layer], this->mUnsortedClusters[layer].data(), this->mUnsortedClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + 
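// Patch only this layer's slot in the device pointer array, on the same stream,
// so the table of per-layer buffers fills in as layers are loaded lazily.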
GPUChkErrS(cudaMemcpyAsync(&mUnsortedClustersDeviceArray[layer], &mUnsortedClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } } template -void TimeFrameGPU::loadClustersDevice(const int iteration) +void TimeFrameGPU::createClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); + GPUTimer timer("creating sorted clusters array"); + allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + mPinnedClusters.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); + mPinnedClusters.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpy(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); } } template -void TimeFrameGPU::loadClustersIndexTables(const int iteration) +void TimeFrameGPU::loadClustersDevice(const int iteration, const int layer) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); + GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer); + GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[layer].size(), layer, this->mClusters[layer].size() * sizeof(Cluster) / constants::MB); + allocMemAsync(reinterpret_cast(&mClustersDevice[layer]), this->mClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[layer], this->mClusters[layer].data(), this->mClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mClustersDeviceArray[layer], &mClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + } +} + +template +void TimeFrameGPU::createClustersIndexTablesArray(const int iteration) +{ + if (!iteration) { + GPUTimer timer("creating clustersindextable array"); + allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable)); + mPinnedClustersIndexTables.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), 
this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); + GPUChkErrS(cudaHostRegister(this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + mPinnedClustersIndexTables.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } template -void TimeFrameGPU::createUsedClustersDevice(const int iteration) +void TimeFrameGPU::loadClustersIndexTables(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer); + GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", layer, this->mIndexTables[layer].size(), this->mIndexTables[layer].size() * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[layer]), this->mIndexTables[layer].size() * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[layer], this->mIndexTables[layer].data(), this->mIndexTables[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mClustersIndexTablesDeviceArray[layer], &mClustersIndexTablesDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + } +} + +template +void TimeFrameGPU::createUsedClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "creating used clusters flags"); + GPUTimer timer("creating used clusters flags"); + allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaHostRegisterPortable)); + mPinnedUsedClusters.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer].get())); + GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaHostRegisterPortable)); + mPinnedUsedClusters.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } +} + +template +void TimeFrameGPU::createUsedClustersDevice(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "creating used clusters flags", layer); + 
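// First iteration only: allocate and zero the per-layer flags on the device; the
// host-side flags are uploaded separately via loadUsedClustersDevice().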
GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[layer].size(), layer, this->mUsedClusters[layer].size() * sizeof(unsigned char) / constants::MB); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[layer]), this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[layer], 0, this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mUsedClustersDeviceArray[layer], &mUsedClustersDevice[layer], sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } } template void TimeFrameGPU::loadUsedClustersDevice() { - GPUTimer timer(mGpuStreams[0], "loading used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUTimer timer(mGpuStreams[iLayer], "loading used clusters flags", iLayer); + GPULog("gpu-transfer: loading {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } } template -void TimeFrameGPU::loadROframeClustersDevice(const int iteration) +void TimeFrameGPU::createROFrameClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading ROframe clusters"); + GPUTimer timer("creating ROFrame clusters array"); + allocMem(reinterpret_cast(&mROFramesClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable)); + mPinnedROFramesClusters.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + mPinnedROFramesClusters.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); - GPUChkErrS(cudaMemcpy(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } template -void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) +void TimeFrameGPU::loadROFrameClustersDevice(const int iteration, const 
int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading ROframe clusters", layer); + GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[layer].size(), layer, this->mROFramesClusters[layer].size() * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[layer]), this->mROFramesClusters[layer].size() * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[layer], this->mROFramesClusters[layer].data(), this->mROFramesClusters[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mROFramesClustersDeviceArray[layer], &mROFramesClustersDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + } +} + +template +void TimeFrameGPU::createTrackingFrameInfoDeviceArray(const int iteration) { - GPUTimer timer(mGpuStreams[0], "loading trackingframeinfo"); if (!iteration) { + GPUTimer timer("creating trackingframeinfo array"); + allocMem(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); + mPinnedTrackingFrameInfo.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); + mPinnedTrackingFrameInfo.set(iLayer); } - mGpuStreams.sync(); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } +} + +template +void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading trackingframeinfo", layer); + GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[layer].size(), layer, this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[layer]), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[layer], this->mTrackingFrameInfo[layer].data(), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + 
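// The host buffer was already pinned in createTrackingFrameInfoDeviceArray(), so
// this copy runs asynchronously and can overlap with the other per-layer streams.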
GPUChkErrS(cudaMemcpyAsync(&mTrackingFrameInfoDeviceArray[layer], &mTrackingFrameInfoDevice[layer], sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } } @@ -267,8 +244,8 @@ template void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { if (!iteration || iteration == 3) { // we need to re-load the swapped mult-mask in upc iteration - GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask"); - GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / constants::MB); + GPUTimer timer("loading multiplicity cut mask"); + GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(uint8_t) / constants::MB); if (!iteration) { // only allocate on first call allocMem(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->getExtAllocator()); } @@ -280,7 +257,7 @@ template void TimeFrameGPU::loadVertices(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading seeding vertices"); + GPUTimer timer("loading seeding vertices"); GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB); allocMem(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->getExtAllocator()); GPUChkErrS(cudaMemcpy(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice)); @@ -291,35 +268,47 @@ void TimeFrameGPU::loadVertices(const int iteration) } template -void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) +void TimeFrameGPU::createTrackletsLUTDeviceArray(const int iteration) { - GPUTimer timer(mGpuStreams[0], "creating tracklets LUTs"); - for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - const int ncls = this->mClusters[iLayer].size() + 1; - if (!iteration) { - GPULog("gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, iLayer, ncls * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), ncls * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - } - GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, ncls * sizeof(int), mGpuStreams[iLayer].get())); + if (!iteration) { + allocMem(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), this->getExtAllocator()); } +} + +template +void TimeFrameGPU::createTrackletsLUTDevice(const int iteration, const int layer) +{ + GPUTimer timer(mGpuStreams[layer], "creating tracklets LUTs", layer); + const int ncls = this->mClusters[layer].size() + 1; if (!iteration) { - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPULog("gpu-allocation: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, layer, ncls * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[layer]), ncls * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(&mTrackletsLUTDeviceArray[layer], 
&mTrackletsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } + GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[layer], 0, ncls * sizeof(int), mGpuStreams[layer].get())); } template -void TimeFrameGPU::createTrackletsBuffers() +void TimeFrameGPU::createTrackletsBuffersArray(const int iteration) { - for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - GPUTimer timer(mGpuStreams[iLayer], "creating tracklet buffers"); - mNTracklets[iLayer] = 0; - GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get())); - GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator()); + if (!iteration) { + GPUTimer timer("creating tracklet buffers array"); + allocMem(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), this->getExtAllocator()); } } +template +void TimeFrameGPU::createTrackletsBuffers(const int layer) +{ + GPUTimer timer(mGpuStreams[layer], "creating tracklet buffers", layer); + mNTracklets[layer] = 0; + GPUChkErrS(cudaMemcpyAsync(&mNTracklets[layer], mTrackletsLUTDevice[layer] + this->mClusters[layer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); + mGpuStreams[layer].sync(); // ensure number of tracklets is correct + GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer], layer, mNTracklets[layer] * sizeof(Tracklet) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); +} + template void TimeFrameGPU::loadTrackletsDevice() { @@ -334,26 +323,22 @@ void TimeFrameGPU::loadTrackletsDevice() template void TimeFrameGPU::loadTrackletsLUTDevice() { - GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 2); + GPUTimer timer("loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { GPULog("gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {:.2f} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / constants::MB); - GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + mGpuStreams.sync(); + GPUChkErrS(cudaMemcpy(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); } template -void TimeFrameGPU::createNeighboursIndexTablesDevice() +void 
TimeFrameGPU::createNeighboursIndexTablesDevice(const int layer) { - GPUTimer timer(mGpuStreams[0], "creating cells neighbours"); - // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. - for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get())); - } + GPUTimer timer(mGpuStreams[layer], "creating cells neighbours", layer); + GPULog("gpu-transfer: reserving neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[layer] + 1, layer, (mNCells[layer] + 1) * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[layer]), (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[layer], 0, (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer].get())); } template @@ -379,16 +364,32 @@ void TimeFrameGPU::loadCellsDevice() } template -void TimeFrameGPU::createCellsLUTDevice() +void TimeFrameGPU::createCellsLUTDeviceArray(const int iteration) { - GPUTimer timer(mGpuStreams, "creating cells LUTs", nLayers - 2); - for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get())); + if (!iteration) { + GPUTimer timer("creating cells LUTs array"); + allocMem(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), this->getExtAllocator()); + } +} + +template +void TimeFrameGPU::createCellsLUTDevice(const int layer) +{ + GPUTimer timer(mGpuStreams[layer], "creating cells LUTs", layer); + GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer] + 1, layer, (mNTracklets[layer] + 1) * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[layer]), (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[layer], 0, (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mCellsLUTDeviceArray[layer], &mCellsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); +} + +template +void TimeFrameGPU::createCellsBuffersArray(const int iteration) +{ + if (!iteration) { + GPUTimer timer("creating cells buffers array"); + allocMem(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mCellsDeviceArray, mCellsDevice.data(), mCellsDevice.size() * sizeof(CellSeed*), cudaMemcpyHostToDevice)); } - allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - 
GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -397,8 +398,10 @@ void TimeFrameGPU::createCellsBuffers(const int layer) GPUTimer timer(mGpuStreams[layer], "creating cells buffers"); mNCells[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); + mGpuStreams[layer].sync(); // ensure number of cells is correct GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / constants::MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } template @@ -415,43 +418,45 @@ void TimeFrameGPU::loadCellsLUTDevice() template void TimeFrameGPU::loadRoadsDevice() { - GPUTimer timer(mGpuStreams[0], "loading roads device"); + GPUTimer timer("loading roads device"); GPULog("gpu-transfer: loading {} roads, for {:.2f} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / constants::MB); - allocMemAsync(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice)); } template void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seeds) { - GPUTimer timer(mGpuStreams[0], "loading track seeds"); + GPUTimer timer("loading track seeds"); GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice)); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int layer) { - GPUTimer timer(mGpuStreams[layer], "reserving neighbours"); + GPUTimer timer(mGpuStreams[layer], "reserving neighbours", layer); + this->mNNeighbours[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); - GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); + mGpuStreams[layer].sync(); // ensure number of neighbours is correct + GPULog("gpu-allocation: reserving {} 
neighbours (pairs), for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair) / constants::MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer], this->getExtAllocator()); GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer].get())); - GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); + GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair) / constants::MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); } template void TimeFrameGPU::createTrackITSExtDevice(bounded_vector& seeds) { - GPUTimer timer(mGpuStreams[0], "reserving tracks"); + GPUTimer timer("reserving tracks"); mTrackITSExt = bounded_vector(seeds.size(), {}, this->getMemoryPool().get()); GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); + allocMem(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->getExtAllocator()); + GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt))); GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); } @@ -480,26 +485,25 @@ void TimeFrameGPU::downloadCellsLUTDevice() template void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector>>& neighbours, const int layer) { - GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours from layer {}", layer)); + GPUTimer timer(mGpuStreams[layer], "downloading neighbours from layer", layer); GPULog("gpu-transfer: downloading {} neighbours, for {:.2f} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair) / constants::MB); - // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? 
:) - GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); } template void TimeFrameGPU::downloadNeighboursLUTDevice(bounded_vector& lut, const int layer) { - GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours LUT from layer {}", layer)); + GPUTimer timer(mGpuStreams[layer], "downloading neighbours LUT from layer", layer); GPULog("gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", lut.size(), layer, lut.size() * sizeof(int) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); } template void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& seeds) { - GPUTimer timer(mGpuStreams[0], "downloading tracks"); + GPUTimer timer("downloading tracks"); GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost)); GPUChkErrS(cudaHostUnregister(mTrackITSExt.data())); GPUChkErrS(cudaHostUnregister(seeds.data())); } @@ -507,16 +511,36 @@ void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { - GPUTimer timer(mGpuStreams[0], "unregistering host memory"); + GPUTimer timer("unregistering host memory"); GPULog("unregistering host memory"); + + auto checkedUnregisterEntry = [](auto& bits, auto& vec, int layer) { + if (bits.test(layer)) { + GPUChkErrS(cudaHostUnregister(vec[layer].data())); + bits.reset(layer); + } + }; + auto checkedUnregisterArray = [](auto& bits, auto& vec) { + if (bits.test(nLayers)) { + GPUChkErrS(cudaHostUnregister(vec.data())); + bits.reset(nLayers); + } + }; + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPUChkErrS(cudaHostUnregister(this->mUnsortedClusters[iLayer].data())); - GPUChkErrS(cudaHostUnregister(this->mClusters[iLayer].data())); - GPUChkErrS(cudaHostUnregister(this->mTrackingFrameInfo[iLayer].data())); + checkedUnregisterEntry(mPinnedUsedClusters, this->mUsedClusters, iLayer); + checkedUnregisterEntry(mPinnedUnsortedClusters, this->mUnsortedClusters, iLayer); + checkedUnregisterEntry(mPinnedClusters, this->mClusters, iLayer); + checkedUnregisterEntry(mPinnedClustersIndexTables, this->mIndexTables, iLayer); + checkedUnregisterEntry(mPinnedTrackingFrameInfo, this->mTrackingFrameInfo, iLayer); + checkedUnregisterEntry(mPinnedROFramesClusters, this->mROFramesClusters, iLayer); } - GPUChkErrS(cudaHostUnregister(mTrackingFrameInfoDevice.data())); - GPUChkErrS(cudaHostUnregister(mUnsortedClustersDevice.data())); - GPUChkErrS(cudaHostUnregister(mClustersDevice.data())); + checkedUnregisterArray(mPinnedUsedClusters, mUsedClustersDevice); + checkedUnregisterArray(mPinnedUnsortedClusters, 
mUnsortedClustersDevice); + checkedUnregisterArray(mPinnedClusters, mClustersDevice); + checkedUnregisterArray(mPinnedClustersIndexTables, mClustersIndexTablesDevice); + checkedUnregisterArray(mPinnedTrackingFrameInfo, mTrackingFrameInfoDevice); + checkedUnregisterArray(mPinnedROFramesClusters, mROFramesClustersDevice); } template @@ -537,9 +561,29 @@ void TimeFrameGPU::syncStream(const size_t stream) } template -void TimeFrameGPU::syncStreams() +void TimeFrameGPU::syncStreams(const bool device) { - mGpuStreams.sync(); + mGpuStreams.sync(device); +} + +template +void TimeFrameGPU::waitEvent(const int stream, const int event) +{ + mGpuStreams.waitEvent(stream, event); +} + +template +void TimeFrameGPU::recordEvent(const int event) +{ + mGpuStreams[event].record(); +} + +template +void TimeFrameGPU::recordEvents(const int start, const int end) +{ + for (int i{start}; i < end; ++i) { + recordEvent(i); + } } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index bef271a1b0129..a6dfc041e4c71 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -11,6 +11,7 @@ /// #include +#include #include #include "DataFormatsITS/TrackITS.h" @@ -27,15 +28,21 @@ template void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) { mTimeFrameGPU->initialise(iteration, this->mTrkParams[iteration], nLayers); - mTimeFrameGPU->loadClustersDevice(iteration); - mTimeFrameGPU->loadUnsortedClustersDevice(iteration); - mTimeFrameGPU->loadClustersIndexTables(iteration); - mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); - mTimeFrameGPU->loadMultiplicityCutMask(iteration); + + // on default stream mTimeFrameGPU->loadVertices(iteration); - mTimeFrameGPU->loadROframeClustersDevice(iteration); - mTimeFrameGPU->createUsedClustersDevice(iteration); mTimeFrameGPU->loadIndexTableUtils(iteration); + mTimeFrameGPU->loadMultiplicityCutMask(iteration); + mTimeFrameGPU->createUsedClustersDeviceArray(iteration); + mTimeFrameGPU->createClustersDeviceArray(iteration); + mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration); + mTimeFrameGPU->createClustersIndexTablesArray(iteration); + mTimeFrameGPU->createTrackingFrameInfoDeviceArray(iteration); + mTimeFrameGPU->createROFrameClustersDeviceArray(iteration); + mTimeFrameGPU->createTrackletsLUTDeviceArray(iteration); + mTimeFrameGPU->createTrackletsBuffersArray(iteration); + mTimeFrameGPU->createCellsBuffersArray(iteration); + mTimeFrameGPU->createCellsLUTDeviceArray(iteration); } template @@ -53,40 +60,23 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i int startROF{this->mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * this->mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{o2::gpu::CAMath::Min(this->mTrkParams[iteration].nROFsPerIterations > 0 ? 
(iROFslice + 1) * this->mTrkParams[iteration].nROFsPerIterations + this->mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())}; - mTimeFrameGPU->createTrackletsLUTDevice(iteration); - countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), - mTimeFrameGPU->getDeviceMultCutMask(), - startROF, - endROF, - mTimeFrameGPU->getNrof(), - this->mTrkParams[iteration].DeltaROF, - iVertex, - mTimeFrameGPU->getDeviceVertices(), - mTimeFrameGPU->getDeviceROFramesPV(), - mTimeFrameGPU->getPrimaryVerticesNum(), - mTimeFrameGPU->getDeviceArrayClusters(), - mTimeFrameGPU->getClusterSizes(), - mTimeFrameGPU->getDeviceROframeClusters(), - mTimeFrameGPU->getDeviceArrayUsedClusters(), - mTimeFrameGPU->getDeviceArrayClustersIndexTables(), - mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums - iteration, - this->mTrkParams[iteration].NSigmaCut, - mTimeFrameGPU->getPhiCuts(), - this->mTrkParams[iteration].PVres, - mTimeFrameGPU->getMinRs(), - mTimeFrameGPU->getMaxRs(), - mTimeFrameGPU->getPositionResolutions(), - this->mTrkParams[iteration].LayerRadii, - mTimeFrameGPU->getMSangles(), - mTimeFrameGPU->getExternalAllocator(), - conf.nBlocksLayerTracklets[iteration], - conf.nThreadsLayerTracklets[iteration], - mTimeFrameGPU->getStreams()); - mTimeFrameGPU->createTrackletsBuffers(); - computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + for (int iLayer = 0; iLayer < nLayers; ++iLayer) { + // TODO lazy loading of essential data on separate streams + mTimeFrameGPU->createUsedClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadClustersIndexTables(iteration, iLayer); + mTimeFrameGPU->loadROFrameClustersDevice(iteration, iLayer); + mTimeFrameGPU->recordEvent(iLayer); + } + + // processing starts here + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) { + mTimeFrameGPU->createTrackletsLUTDevice(iteration, iLayer); + mTimeFrameGPU->waitEvent(iLayer, iLayer + 1); // wait stream until all data is available + countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), + iLayer, startROF, endROF, mTimeFrameGPU->getNrof(), @@ -97,14 +87,11 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPrimaryVerticesNum(), mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getClusterSizes(), - mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceROFrameClusters(), mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), - mTimeFrameGPU->getDeviceArrayTracklets(), - mTimeFrameGPU->getDeviceTracklet(), - mTimeFrameGPU->getNTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getDeviceTrackletsLUTs(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums iteration, this->mTrkParams[iteration].NSigmaCut, mTimeFrameGPU->getPhiCuts(), @@ -118,24 +105,69 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); + mTimeFrameGPU->createTrackletsBuffers(iLayer); + if (mTimeFrameGPU->getNTracklets()[iLayer] == 0) { + return; + } + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + 
mTimeFrameGPU->getDeviceMultCutMask(), + iLayer, + startROF, + endROF, + mTimeFrameGPU->getNrof(), + this->mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROFrameClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceTracklets(), + mTimeFrameGPU->getNTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), + iteration, + this->mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + this->mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + this->mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + mTimeFrameGPU->getExternalAllocator(), + conf.nBlocksLayerTracklets[iteration], + conf.nThreadsLayerTracklets[iteration], + mTimeFrameGPU->getStreams()); + } } template void TrackerTraitsGPU::computeLayerCells(const int iteration) { - mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->syncStream(0); - for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - mTimeFrameGPU->syncStream(iLayer + 1); + for (int iLayer = 0; iLayer < nLayers; ++iLayer) { + // TODO lazy loading of essential data on separate streams + mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration, iLayer); + mTimeFrameGPU->recordEvent(iLayer); + } + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { // if there are no tracklets skip entirely const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) { mTimeFrameGPU->getNCells()[iLayer] = 0; - continue; + return; } + + mTimeFrameGPU->createCellsLUTDevice(iLayer); + mTimeFrameGPU->waitEvent(iLayer, iLayer + 1); // wait stream until all data is available + mTimeFrameGPU->waitEvent(iLayer, iLayer + 2); // wait stream until all data is available countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), @@ -156,6 +188,9 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); mTimeFrameGPU->createCellsBuffers(iLayer); + if (mTimeFrameGPU->getNCells()[iLayer] == 0) { + return; + } computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), @@ -180,21 +215,16 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) template void TrackerTraitsGPU::findCellsNeighbours(const int iteration) { - mTimeFrameGPU->createNeighboursIndexTablesDevice(); const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->syncStream(0); - for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - // ensure that celling is done for iLayer and iLayer+1 is done - mTimeFrameGPU->syncStream(iLayer + 1); - + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].NeighboursPerRoad(); ++iLayer) { const int 
currentLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer])}; const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; if (!nextLayerCellsNum || !currentLayerCellsNum) { mTimeFrameGPU->getNNeighbours()[iLayer] = 0; continue; } - + mTimeFrameGPU->createNeighboursIndexTablesDevice(iLayer); mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. @@ -214,6 +244,9 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) conf.nThreadsFindNeighbours[iteration], mTimeFrameGPU->getStream(iLayer)); mTimeFrameGPU->createNeighboursDevice(iLayer); + if (mTimeFrameGPU->getNNeighbours()[iLayer] == 0) { + continue; + } computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -236,8 +269,8 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) mTimeFrameGPU->getStream(iLayer), mTimeFrameGPU->getExternalAllocator()); } - mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed -}; + mTimeFrameGPU->syncStreams(false); +} template void TrackerTraitsGPU::findRoads(const int iteration) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 401d98ad63560..9b3df193abe34 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -23,16 +22,17 @@ #include #include #include -#include #include "ITStracking/Constants.h" +#include "ITStracking/Definitions.h" #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" #include "ITStracking/ExternalAllocator.h" +#include "ITStracking/Tracklet.h" +#include "ITStracking/Cluster.h" +#include "ITStracking/Cell.h" #include "DataFormatsITS/TrackITS.h" -#include "ReconstructionDataFormats/Vertex.h" -#include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" #include "ITStrackingGPU/Utils.h" @@ -43,8 +43,6 @@ using namespace o2::track; namespace o2::its { -using Vertex = o2::dataformats::Vertex>; - namespace gpu { @@ -92,9 +90,9 @@ struct TypedAllocator { ExternalAllocator* mInternalAllocator; }; -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const o2::its::IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +GPUdii() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils& utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) { const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; const float phiRangeMin = (maxdeltaphi > o2::constants::math::PI) ? 
0.f : currentCluster.phi - maxdeltaphi; @@ -112,18 +110,18 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; } -GPUd() bool fitTrack(TrackITSExt& track, - int start, - int end, - int step, - float chi2clcut, - float chi2ndfcut, - float maxQoverPt, - int nCl, - float bz, - const TrackingFrameInfo** tfInfos, - const o2::base::Propagator* prop, - o2::base::PropagatorF::MatCorrType matCorrType) +GPUdii() bool fitTrack(TrackITSExt& track, + int start, + int end, + int step, + float chi2clcut, + float chi2ndfcut, + float maxQoverPt, + int nCl, + float bz, + const TrackingFrameInfo** tfInfos, + const o2::base::Propagator* prop, + o2::base::PropagatorF::MatCorrType matCorrType) { for (int iLayer{start}; iLayer != end; iLayer += step) { if (track.getClusterIndex(iLayer) == constants::UnusedIndex) { @@ -164,10 +162,10 @@ GPUd() bool fitTrack(TrackITSExt& track, return o2::gpu::CAMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); } -GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, - const Cluster& cluster2, - const TrackingFrameInfo& tf3, - const float bz) +GPUdii() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, + const Cluster& cluster2, + const TrackingFrameInfo& tf3, + const float bz) { const float ca = o2::gpu::CAMath::Cos(tf3.alphaTrackingFrame), sa = o2::gpu::CAMath::Sin(tf3.alphaTrackingFrame); const float x1 = cluster1.xCoordinate * ca + cluster1.yCoordinate * sa; @@ -368,7 +366,7 @@ GPUg() void fitTrackSeedsKernel( } } -template // Version for new tracker to supersede the old one +template GPUg() void computeLayerCellNeighboursKernel( CellSeed** cellSeedArray, int* neighboursLUT, @@ -519,7 +517,7 @@ GPUg() void computeLayerCellsKernel( } } -template +template GPUg() void computeLayerTrackletsMultiROFKernel( const IndexTableUtils* utils, const uint8_t* multMask, @@ -578,6 +576,11 @@ GPUg() void computeLayerTrackletsMultiROFKernel( if (usedClusters[layerIndex][currentCluster.clusterId]) { continue; } + if constexpr (!initRun) { + if (trackletsLUT[layerIndex][currentSortedIndex] == trackletsLUT[layerIndex][currentSortedIndex + 1]) { + continue; + } + } const float inverseR0{1.f / currentCluster.radius}; for (int iV{startVtx}; iV < endVtx; ++iV) { @@ -642,7 +645,6 @@ GPUg() void computeLayerTrackletsMultiROFKernel( } } -template GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, int* trackletsLookUpTable, const int nTracklets) @@ -741,133 +743,6 @@ GPUg() void processNeighboursKernel(const int layer, } } -///////////////////////////////////////// -// Debug Kernels -///////////////////////////////////////// - -template -GPUd() void pPointer(T* ptr) -{ - printf("[%p]\t", ptr); -} - -template -GPUg() void printPointersKernel(std::tuple args) -{ - auto print_all = [&](auto... 
ptrs) { - (pPointer(ptrs), ...); - }; - std::apply(print_all, args); -} - -template -struct trackletSortEmptyFunctor { - GPUhd() bool operator()(const T& lhs, const T& rhs) const - { - return lhs.firstClusterIndex > rhs.firstClusterIndex; - } -}; - -template -struct trackletSortIndexFunctor { - GPUhd() bool operator()(const T& lhs, const T& rhs) const - { - return lhs.firstClusterIndex < rhs.firstClusterIndex || (lhs.firstClusterIndex == rhs.firstClusterIndex && lhs.secondClusterIndex < rhs.secondClusterIndex); - } -}; - -GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < size; ++i) { - if (!(i % len)) { - printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size); - } - printf("%d\t", v[i]); - } - printf("\n"); - } -} - -GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < rowLength; ++i) { - if (!(i % len)) { - printf("\n row %d: ===> %d/%d\t", row, i, (int)rowLength); - } - printf("%d\t", mat[row][i]); - } - printf("\n"); - } -} - -GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned int size, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < size; ++i) { - if (!(i % len)) { - printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size); - } - printf("%p\t", (void*)v[i]); - } - printf("\n"); - } -} - -GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf("vertices: \n"); - for (int i{0}; i < size; ++i) { - printf("\tx=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); - } - } -} - -GPUg() void printNeighbours(const gpuPair* neighbours, - const int* nNeighboursIndexTable, - const unsigned int nCells, - const unsigned int tId = 0) -{ - for (unsigned int iNeighbour{0}; iNeighbour < nNeighboursIndexTable[nCells]; ++iNeighbour) { - if (threadIdx.x == tId) { - printf("%d -> %d\n", neighbours[iNeighbour].first, neighbours[iNeighbour].second); - } - } -} - -GPUg() void printTrackletsLUTPerROF(const int layerId, - const int** ROFClusters, - int** luts, - const int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (auto rofId{0}; rofId < 2304; ++rofId) { - int nClus = ROFClusters[layerId][rofId + 1] - ROFClusters[layerId][rofId]; - if (!nClus) { - continue; - } - printf("rof: %d (%d) ==> ", rofId, nClus); - - for (int iC{0}; iC < nClus; ++iC) { - int nT = luts[layerId][ROFClusters[layerId][rofId] + iC]; - printf("%d\t", nT); - } - printf("\n"); - } - } -} - -GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0) -{ - for (unsigned int iCell{0}; iCell < nCells; ++iCell) { - if (threadIdx.x == tId) { - seed[iCell].printCell(); - } - } -} - GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { if (alloc) { @@ -885,45 +760,12 @@ GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullp GPUChkErrS(cudaFreeAsync(p, stream)); } } - -template -GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - 
GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); -} - -template -GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) -{ - cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); -} - -template -GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); - GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); -} - -template -GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, o2::its::ExternalAllocator* alloc = nullptr) -{ - cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); -} } // namespace gpu template void countTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -953,41 +795,41 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads, gpu::Streams& streams) { - for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>( - utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - nullptr, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); - } + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + layer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + nullptr, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[layer], + resolutionPV, + minRs[layer + 1], + maxRs[layer + 1], + resolutions[layer], + radii[layer + 1] - radii[layer], + mulScatAng[layer]); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(streams[layer].get()); + thrust::exclusive_scan(nosync_policy, trackletsLUTsHost[layer], trackletsLUTsHost[layer] + nClusters[layer] + 1, trackletsLUTsHost[layer]); } template void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -1020,66 +862,45 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads, gpu::Streams& streams) { - for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>( - utils, - multMask, - iLayer, - 
startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - tracklets, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); - if (nTracklets[iLayer]) { - Tracklet *tracklets_in = spanTracklets[iLayer], *tracklets_out{nullptr}; - size_t n = nTracklets[iLayer]; - size_t sort_temp_bytes = 0; - GPUChkErrS(cub::DeviceMergeSort::SortKeys(nullptr, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); - void* sort_temp_storage = nullptr; - gpu::allocateMemory(&sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); - GPUChkErrS(cub::DeviceMergeSort::SortKeys(sort_temp_storage, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); - gpu::allocateMemory(reinterpret_cast(&tracklets_out), n * sizeof(Tracklet), streams[iLayer].get(), alloc); - size_t unique_temp_bytes = 0; - int* num_selected = nullptr; - gpu::allocateMemory(reinterpret_cast(&num_selected), sizeof(int), streams[iLayer].get(), alloc); - GPUChkErrS(cub::DeviceSelect::Unique(nullptr, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); - void* unique_temp_storage = nullptr; - gpu::allocateMemory(&unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); - GPUChkErrS(cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); - GPUChkErrS(cudaMemcpyAsync(tracklets_in, tracklets_out, n * sizeof(Tracklet), cudaMemcpyDeviceToDevice, streams[iLayer].get())); - GPUChkErrS(cudaMemcpyAsync(&nTracklets[iLayer], num_selected, sizeof(int), cudaMemcpyDeviceToHost, streams[iLayer].get())); - streams[iLayer].sync(); - gpu::deallocateMemory(tracklets_out, n * sizeof(Tracklet), streams[iLayer].get(), alloc); - gpu::deallocateMemory(sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); - gpu::deallocateMemory(unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); - gpu::deallocateMemory(num_selected, sizeof(int), streams[iLayer].get(), alloc); - } - if (iLayer > 0) { - GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get())); - gpu::compileTrackletsLookupTableKernel<<>>( - spanTracklets[iLayer], - trackletsLUTsHost[iLayer], - nTracklets[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); - } + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + layer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[layer], + resolutionPV, + minRs[layer + 1], + maxRs[layer + 1], + resolutions[layer], + radii[layer + 1] - radii[layer], + mulScatAng[layer]); + thrust::device_ptr tracklets_ptr(spanTracklets[layer]); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(streams[layer].get()); + thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[layer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[layer], gpu::equal_tracklets()); + nTracklets[layer] = unique_end - tracklets_ptr; + if 
(layer) { + GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[layer], 0, (nClusters[layer] + 1) * sizeof(int), streams[layer].get())); + gpu::compileTrackletsLookupTableKernel<<>>( + spanTracklets[layer], + trackletsLUTsHost[layer], + nTracklets[layer]); + thrust::exclusive_scan(nosync_policy, trackletsLUTsHost[layer], trackletsLUTsHost[layer] + nClusters[layer] + 1, trackletsLUTsHost[layer]); } } @@ -1119,7 +940,8 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get(), alloc); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(streams[layer].get()); + thrust::exclusive_scan(nosync_policy, cellsLUTsHost, cellsLUTsHost + nTracklets + 1, cellsLUTsHost); } void computeCellsHandler( @@ -1190,8 +1012,9 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get(), alloc); - gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get(), alloc); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(stream.get()); + thrust::inclusive_scan(nosync_policy, neighboursLUT, neighboursLUT + nCellsNext, neighboursLUT); + thrust::exclusive_scan(nosync_policy, neighboursIndexTable, neighboursIndexTable + nCells + 1, neighboursIndexTable); } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ -1232,44 +1055,13 @@ int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, gpu::Stream& stream, o2::its::ExternalAllocator* allocator) { -#ifndef __HIPCC__ - int* d_num_selected = nullptr; - gpu::allocateMemory(reinterpret_cast(&d_num_selected), sizeof(int), stream.get(), allocator); - size_t select_bytes = 0; - GPUChkErrS(cub::DeviceSelect::If(nullptr, select_bytes, cellNeighbourPairs, static_cast*>(nullptr), d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); - void* select_temp = nullptr; - gpu::allocateMemory(&select_temp, select_bytes, stream.get(), allocator); - gpuPair* d_temp_valid = nullptr; - gpu::allocateMemory(reinterpret_cast(&d_temp_valid), nNeigh * sizeof(gpuPair), stream.get(), allocator); - GPUChkErrS(cub::DeviceSelect::If(select_temp, select_bytes, cellNeighbourPairs, d_temp_valid, d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); - int newSize = 0; - GPUChkErrS(cudaMemcpyAsync(&newSize, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream.get())); - stream.sync(); // needed to get newSize - size_t sort_bytes = 0; - GPUChkErrS(cub::DeviceMergeSort::SortPairs(nullptr, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); - void* sort_temp = nullptr; - gpu::allocateMemory(&sort_temp, sort_bytes, stream.get(), allocator); - GPUChkErrS(cub::DeviceMergeSort::SortPairs(sort_temp, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); - size_t transform_bytes = 0; - GPUChkErrS(cub::DeviceTransform::Transform(nullptr, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); - void* transform_temp = nullptr; - gpu::allocateMemory(&transform_temp, transform_bytes, stream.get(), allocator); - GPUChkErrS(cub::DeviceTransform::Transform(transform_temp, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); - gpu::deallocateMemory(transform_temp, transform_bytes, stream.get(), 
allocator); - gpu::deallocateMemory(d_temp_valid, newSize * sizeof(gpuPair), stream.get(), allocator); - gpu::deallocateMemory(sort_temp, sort_bytes, stream.get(), allocator); - gpu::deallocateMemory(d_num_selected, sizeof(int), stream.get(), allocator); - gpu::deallocateMemory(select_temp, select_bytes, stream.get(), allocator); -#else // FIXME using thrust here since hipcub does not yet have DeviceTransform - auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get()); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(allocator)).on(stream.get()); thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); auto updatedEnd = thrust::remove_if(nosync_policy, neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); size_t newSize = updatedEnd - neighVectorPairs; thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); -#endif - return newSize; } @@ -1296,6 +1088,7 @@ void processNeighboursHandler(const int startLayer, auto allocInt = gpu::TypedAllocator(alloc); auto allocCellSeed = gpu::TypedAllocator(alloc); thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(gpu::Stream::DefaultStream); gpu::processNeighboursKernel<<>>( startLayer, @@ -1315,7 +1108,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1, gpu::Stream::DefaultStream, alloc); + thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); thrust::device_vector> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); @@ -1337,8 +1130,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); int level = startLevel; thrust::device_vector> lastCellId(allocInt); @@ -1350,7 +1142,7 @@ void processNeighboursHandler(const int startLayer, thrust::device_vector>(allocInt).swap(updatedCellId); auto lastCellSeedSize{lastCellSeed.size()}; foundSeedsTable.resize(lastCellSeedSize + 1); - thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); + thrust::fill(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0); gpu::processNeighboursKernel<<>>( iLayer, @@ -1370,13 +1162,13 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size(), gpu::Stream::DefaultStream, alloc); + thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); - thrust::fill(updatedCellId.begin(), updatedCellId.end(), 0); + thrust::fill(nosync_policy, updatedCellId.begin(), updatedCellId.end(), 0); updatedCellSeed.resize(foundSeeds); - thrust::fill(updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); + thrust::fill(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); 
gpu::processNeighboursKernel<<>>( iLayer, @@ -1396,11 +1188,10 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); } + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); thrust::device_vector> outSeeds(updatedCellSeed.size(), allocCellSeed); - auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); + auto end = thrust::copy_if(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); auto s{end - outSeeds.begin()}; seedsHost.reserve(seedsHost.size() + s); thrust::copy(outSeeds.begin(), outSeeds.begin() + s, std::back_inserter(seedsHost)); @@ -1434,14 +1225,13 @@ void trackSeedHandler(CellSeed* trackSeeds, propagator, // const o2::base::Propagator* matCorrType); // o2::base::PropagatorF::MatCorrType thrust::device_ptr tr_ptr(tracks); - thrust::sort(tr_ptr, tr_ptr + nSeeds, gpu::compare_track_chi2()); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); } template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -1473,6 +1263,7 @@ template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF,