From 2df7dca9a7c806dd552bf719c5db539d07b40ec3 Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Sat, 17 May 2025 15:47:16 +0200
Subject: [PATCH 1/3] ITS: GPU: print kernel params

prints gpu kernel params

Signed-off-by: Felix Schlepper
---
 Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
index 09d9cee06d9f9..5c3a43540f833 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
@@ -39,6 +39,9 @@ Tracker::Tracker(TrackerTraits* traits) : mTraits(traits)
 {
   /// Initialise standard configuration with 1 iteration
   mTrkParams.resize(1);
+  if (traits->isGPU()) {
+    ITSGpuTrackingParamConfig::Instance().printKeyValues(true, true);
+  }
 }

 void Tracker::clustersToTracks(const LogFunc& logger, const LogFunc& error)

From 180148c0acb320128b45cf1acf5252b41508bbac Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Mon, 30 Jun 2025 14:55:45 +0200
Subject: [PATCH 2/3] ITS: GPU use ms for tracklets

uses multiple streams for trackleting

Signed-off-by: Felix Schlepper
---
 .../GPU/ITStrackingGPU/TimeFrameGPU.h         |  11 +-
 .../GPU/ITStrackingGPU/TrackingKernels.h      |   6 +-
 .../ITS/tracking/GPU/ITStrackingGPU/Utils.h   |  90 ++++--
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     | 303 ++++++++----------
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    |   8 +-
 .../ITS/tracking/GPU/cuda/TrackingKernels.cu  |  88 ++---
 6 files changed, 269 insertions(+), 237 deletions(-)

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index d41591e6ff25c..8b3e9bddd18d6 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -23,8 +23,6 @@
 namespace o2::its::gpu
 {
-class Stream;
-
 class DefaultGPUAllocator : public ExternalAllocator
 {
   void* allocate(size_t size) override;
@@ -81,10 +79,11 @@ class TimeFrameGPU : public TimeFrame
   void downloadCellsLUTDevice();
   void unregisterRest();
   template
-  Stream& getStream(const size_t stream)
+  auto& getStream(const size_t stream)
   {
-    return *mGpuStreams[stream];
+    return mGpuStreams[stream];
   }
+  auto& getStreams() { return mGpuStreams; }
   void wipe(const int);

   /// interface
@@ -146,7 +145,7 @@
   int getNumberOfNeighbours() const final;

  private:
-  void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
+  void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations

   bool mHostRegistered = false;
   TimeFrameGPUParameters mGpuParams;
@@ -200,7 +199,7 @@
   const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;

   // State
-  std::vector mGpuStreams;
+  Streams mGpuStreams;
   size_t mAvailMemGB;
   bool mFirstInit = true;

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index b847aacd9bba5..a058f7e5fab0c 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -84,7 +84,8 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils,
                                  std::vector& radii,
                                  bounded_vector& mulScatAng,
                                  const int nBlocks,
-                                 const int nThreads);
+                                 const int nThreads,
+                                 gpu::Streams& streams);

 template
 void
computeTrackletsInROFsHandler(const IndexTableUtils* utils, @@ -117,7 +118,8 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 74c118009d67d..454e39e04a661 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -16,12 +16,14 @@ #ifndef ITSTRACKINGGPU_UTILS_H_ #define ITSTRACKINGGPU_UTILS_H_ +#include + #include "GPUCommonDef.h" +#include "GPUCommonHelpers.h" -namespace o2 -{ -namespace its +namespace o2::its { + template struct gpuPair { T1 first; @@ -31,11 +33,6 @@ struct gpuPair { namespace gpu { -template -void discardResult(const T&) -{ -} - // Poor man implementation of a span-like struct. It is very limited. template struct gpuSpan { @@ -85,19 +82,74 @@ enum class Task { Vertexer = 1 }; -template -GPUhd() T* getPtrFromRuler(int index, T* src, const int* ruler, const int stride = 1) +// Abstract stream class +class Stream { - return src + ruler[index] * stride; -} + public: +#if defined(__HIPCC__) + using Handle = hipStream_t; + static constexpr Handle Default = 0; +#elif defined(__CUDACC__) + using Handle = cudaStream_t; + static constexpr Handle Default = 0; +#else + using Handle = void*; + static constexpr Handle Default = nullptr; +#endif + + Stream(unsigned int flags = 0) + { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamCreateWithFlags(&mHandle, flags)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamCreateWithFlags(&mHandle, flags)); +#endif + } -template -GPUhd() const T* getPtrFromRuler(int index, const T* src, const int* ruler, const int stride = 1) + Stream(Handle h) : mHandle(h) {} + ~Stream() + { + if (mHandle != Default) { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamDestroy(mHandle)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamDestroy(mHandle)); +#endif + } + } + + operator bool() const { return mHandle != Default; } + const Handle& get() { return mHandle; } + void sync() const + { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamSynchronize(mHandle)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamSynchronize(mHandle)); +#endif + } + + private: + Handle mHandle{Default}; +}; +static_assert(sizeof(Stream) == sizeof(void*), "Stream type must match pointer type!"); + +// Abstract vector for streams. +// Handles specifically wrap around. 
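Side note on the wrap-around behaviour flagged in the comment above: operator[] of the container defined next maps any index i to i % size(), so per-layer code can ask for "the stream of layer i" even when the pool holds fewer streams than layers. A minimal stand-alone sketch of that indexing (illustrative only, hypothetical names, not part of the patch):

  // pool_demo.cxx -- modulo indexing into a fixed "stream" pool
  #include <cstddef>
  #include <cstdio>
  #include <vector>

  struct Pool {
    std::vector<int> handles; // stand-ins for real stream handles
    int operator[](std::size_t i) const { return handles[i % handles.size()]; }
  };

  int main()
  {
    Pool pool{{0, 1, 2}};                             // a pool of 3 "streams"
    for (std::size_t layer = 0; layer < 7; ++layer) { // 7 ITS layers
      std::printf("layer %zu -> stream %d\n", layer, pool[layer]);
    }
    return 0;
  }

The same mapping also means that several layers can share one stream and therefore serialize on it, which appears to be the intended trade-off of keeping the pool bounded.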
+class Streams { - return src + ruler[index] * stride; -} + public: + size_t size() const noexcept { return mStreams.size(); } + void resize(size_t n) { mStreams.resize(n); } + void clear() { mStreams.clear(); } + auto& operator[](size_t i) { return mStreams[i % mStreams.size()]; } + void push_back(const Stream& stream) { mStreams.push_back(stream); } + + private: + std::vector mStreams; +}; + } // namespace gpu -} // namespace its -} // namespace o2 +} // namespace o2::its -#endif \ No newline at end of file +#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index b336073604b62..13851b4cdc1aa 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -58,38 +58,6 @@ using constants::MB; namespace gpu { -class Stream final -{ - public: - Stream(); - ~Stream(); - - [[nodiscard]] const cudaStream_t& get() const; - - private: - cudaStream_t mStream; -}; - -Stream::Stream() -{ - GPUChkErrS(cudaStreamCreate(&mStream)); -} - -Stream::~Stream() -{ - GPUChkErrS(cudaStreamDestroy(mStream)); -} - -const cudaStream_t& Stream::get() const -{ - return mStream; -} - -void* DefaultGPUAllocator::allocate(size_t size) -{ - LOGP(fatal, "Called DefaultGPUAllocator::allocate with size {}", size); - return nullptr; // to be implemented -} template TimeFrameGPU::TimeFrameGPU() @@ -101,13 +69,13 @@ template TimeFrameGPU::~TimeFrameGPU() = default; template -void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream* strPtr, bool extAllocator) +void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator) { if (extAllocator) { *ptr = this->mAllocator->allocate(size); } else { LOGP(debug, "Calling default CUDA allocator"); - GPUChkErrS(cudaMallocAsync(reinterpret_cast(ptr), size, strPtr->get())); + GPUChkErrS(cudaMallocAsync(reinterpret_cast(ptr), size, stream.get())); } } @@ -120,31 +88,31 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl void TimeFrameGPU::loadIndexTableUtils(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading indextable utils"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils"); if (!iteration) { LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB); - allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator()); } LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); - GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading unsorted clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading unsorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", 
this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB); - allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -152,17 +120,17 @@ template void TimeFrameGPU::loadClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading sorted clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * 
sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -170,15 +138,15 @@ template void TimeFrameGPU::loadClustersIndexTables(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading sorted clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -186,72 +154,72 @@ template void TimeFrameGPU::createUsedClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating used clusters flags"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, this->getExtAllocator()); - 
GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } template void TimeFrameGPU::loadUsedClustersDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading used clusters flags"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / MB); - GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadROframeClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading ROframe clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } template void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), 
"loading trackingframeinfo"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading trackingframeinfo"); if (!iteration) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading multiplicity cut mask"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / MB); - allocMemAsync(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -259,67 +227,67 @@ template void TimeFrameGPU::loadVertices(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading 
seeding vertices"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / MB); - allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } template void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating tracklets LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { if (!iteration) { LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", this->mClusters[iLayer].size() + 1, iLayer, (this->mClusters[iLayer].size() + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); } - GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); } if (!iteration) { - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void 
TimeFrameGPU::createTrackletsBuffers() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells buffers"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { mNTracklets[iLayer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[0], this->getExtAllocator()); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadTrackletsDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading tracklets"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / MB); GPUChkErrS(cudaHostRegister(this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadTrackletsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading tracklets"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); @@ -327,90 +295,90 @@ void TimeFrameGPU::loadTrackletsLUTDevice() } GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); 
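// NB (reviewer note, not part of the patch): cudaHostRegister pins the host
// buffer so the surrounding cudaMemcpyAsync calls can use DMA and genuinely
// overlap with other GPU work; async copies from unpinned memory fall back
// to a staged, effectively synchronous transfer. Note also that the copy
// below passes no stream argument, so it runs on the default (legacy) stream.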
GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursIndexTablesDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours"); // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); if (iLayer < nLayers - 3) { mNNeighbours[iLayer] = 0; } } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighboursLUT"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, this->getExtAllocator()); // We need one element more to move exc -> inc - GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc + GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadCellsDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading cell seeds"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} cell seeds 
on layer {}, for {} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / MB); - allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), nullptr, this->getExtAllocator()); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), nullptr, this->getExtAllocator()); // accessory for the neigh. finding. - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0]->get())); - GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // accessory for the neigh. finding. + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createCellsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, 
mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createCellsBuffers(const int layer) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells buffers"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); mNCells[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); - allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadCellsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading cells LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / MB); GPUChkErrS(cudaHostRegister(this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template @@ -419,128 +387,128 @@ void TimeFrameGPU::loadRoadsDevice() LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / MB); allocMemAsync(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seeds) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading track seeds"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds"); LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, 
mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, std::vector>& neighbours) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); this->mCellsNeighbours[layer].clear(); this->mCellsNeighbours[layer].resize(neighbours.size()); LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDeviceArray() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createTrackITSExtDevice(bounded_vector& seeds) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving tracks"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving tracks"); 
mTrackITSExt = bounded_vector(seeds.size(), {}, this->getMemoryPool().get()); LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB); allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadCellsDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "downloading cells"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cells"); for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); this->mCells[iLayer].resize(mNCells[iLayer]); - GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadCellsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "downloading cell luts"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1)); this->mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1); - GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector>>& neighbours, const int layer) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), fmt::format("downloading neighbours from layer {}", layer)); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer)); LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair) / MB); // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? 
:) - GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } template void TimeFrameGPU::downloadNeighboursLUTDevice(bounded_vector& lut, const int layer) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), fmt::format("downloading neighbours LUT from layer {}", layer)); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer)); LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB); - GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& seeds) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "downloading tracks"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks"); LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB); - GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); GPUChkErrS(cudaHostUnregister(mTrackITSExt.data())); GPUChkErrS(cudaHostUnregister(seeds.data())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::unregisterRest() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "unregistering rest of the host memory"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory"); LOGP(debug, "unregistering rest of the host memory..."); GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template @@ -563,10 +531,7 @@ void TimeFrameGPU::initialise(const int iteration, IndexTableUtils* utils, const TimeFrameGPUParameters* gpuParam) { - mGpuStreams.resize(mGpuParams.nTimeFrameChunks); - for (auto& str : mGpuStreams) { - str = new Stream(); - } + mGpuStreams.resize(nLayers); o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index b32189f3fabe3..d804d0062764f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -49,13 +49,13 @@ template void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int iROFslice, int iVertex) { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->createTrackletsLUTDevice(iteration); const Vertex 
diamondVert({this->mTrkParams[iteration].Diamond[0], this->mTrkParams[iteration].Diamond[1], this->mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); int startROF{this->mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * this->mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{o2::gpu::CAMath::Min(this->mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * this->mTrkParams[iteration].nROFsPerIterations + this->mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())}; + mTimeFrameGPU->createTrackletsLUTDevice(iteration); countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), startROF, @@ -83,7 +83,8 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), conf.nBlocks, - conf.nThreads); + conf.nThreads, + mTimeFrameGPU->getStreams()); mTimeFrameGPU->createTrackletsBuffers(); computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), @@ -115,7 +116,8 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), conf.nBlocks, - conf.nThreads); + conf.nThreads, + mTimeFrameGPU->getStreams()); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 8c6367c221583..8245aee33718c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -890,11 +890,14 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { gpu::computeLayerTrackletsMultiROFKernel<<>>( + o2::gpu::CAMath::Min(nThreads, GPU_THREADS), + 0, + streams[iLayer].get()>>>( utils, multMask, iLayer, @@ -921,7 +924,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); } } @@ -956,45 +959,52 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - tracklets, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + 
resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); - thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); - auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); + auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get()); + thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); nTracklets[iLayer] = unique_end - tracklets_ptr; if (iLayer > 0) { - GPUChkErrS(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get())); gpu::compileTrackletsLookupTableKernel<<>>( - spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1); + o2::gpu::CAMath::Min(nThreads, GPU_THREADS), + 0, + streams[iLayer].get()>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); } } } @@ -1350,7 +1360,8 @@ template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, @@ -1382,7 +1393,8 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); template void processNeighboursHandler<7>(const int startLayer, const int startLevel, From 26a6670f0fff8c560b0f64fc8ff03d26c6475a19 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Mon, 9 Jun 2025 11:35:44 +0200 Subject: [PATCH 3/3] ITS: add GPUTimer and use streams Signed-off-by: Felix Schlepper --- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 211 ++++++++++-------- 1 file changed, 114 insertions(+), 97 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 13851b4cdc1aa..d5ea573a2f0e8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -14,6 +14,7 @@ #include #include "ITStracking/Constants.h" +#include "ITStracking/BoundedAllocator.h" #include "ITStrackingGPU/Utils.h" #include "ITStrackingGPU/TimeFrameGPU.h" @@ -21,6 +22,8 @@ #include #include +#include +#include #include #include "GPUCommonDef.h" @@ -28,27 +31,6 @@ #include "GPUCommonLogger.h" #include "GPUCommonHelpers.h" -#ifdef ITS_MEASURE_GPU_TIME -#define START_GPU_STREAM_TIMER(stream, name) \ - cudaEvent_t event_start, event_stop; \ - GPUChkErrS(cudaEventCreate(&event_start)); \ - GPUChkErrS(cudaEventCreate(&event_stop)); \ - GPUChkErrS(cudaEventRecord(event_start, stream)); \ - const std::string task_name = name; - -#define STOP_GPU_STREAM_TIMER(stream) \ - GPUChkErrS(cudaEventRecord(event_stop, stream)); \ - GPUChkErrS(cudaEventSynchronize(event_stop)); 
-  float ms;                                                        \
-  GPUChkErrS(cudaEventElapsedTime(&ms, event_start, event_stop));  \
-  std::cout << "Elapsed time for " << task_name << ": " << ms << " ms" << std::endl; \
-  GPUChkErrS(cudaEventDestroy(event_start));                       \
-  GPUChkErrS(cudaEventDestroy(event_stop));
-#else
-#define START_GPU_STREAM_TIMER(stream, name)
-#define STOP_GPU_STREAM_TIMER(stream)
-#endif
-
 namespace o2
 {
 namespace its
 {
@@ -59,6 +41,71 @@ using constants::MB;
 namespace gpu
 {
+#ifdef ITS_MEASURE_GPU_TIME
+class GPUTimer
+{
+ public:
+  GPUTimer(Streams& streams, const std::string& name)
+    : mName(name)
+  {
+    for (size_t i{0}; i < streams.size(); ++i) {
+      mStreams.push_back(streams[i].get());
+    }
+    startTimers();
+  }
+  GPUTimer(Streams& streams, const std::string& name, size_t end, size_t start = 0)
+    : mName(name)
+  {
+    for (size_t sta{start}; sta < end; ++sta) {
+      mStreams.push_back(streams[sta].get());
+    }
+    startTimers();
+  }
+  GPUTimer(Stream& stream, const std::string& name)
+    : mName(name)
+  {
+    mStreams.push_back(stream.get());
+    startTimers();
+  }
+  ~GPUTimer()
+  {
+    for (size_t i{0}; i < mStreams.size(); ++i) {
+      GPUChkErrS(cudaEventRecord(mStops[i], mStreams[i]));
+      GPUChkErrS(cudaEventSynchronize(mStops[i]));
+      float ms = 0.0f;
+      GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[i], mStops[i]));
+      LOGP(info, "Elapsed time for {}:{} {} ms", mName, i, ms);
+      GPUChkErrS(cudaEventDestroy(mStarts[i]));
+      GPUChkErrS(cudaEventDestroy(mStops[i]));
+    }
+  }
+
+  void startTimers()
+  {
+    mStarts.resize(mStreams.size());
+    mStops.resize(mStreams.size());
+    for (size_t i{0}; i < mStreams.size(); ++i) {
+      GPUChkErrS(cudaEventCreate(&mStarts[i]));
+      GPUChkErrS(cudaEventCreate(&mStops[i]));
+      GPUChkErrS(cudaEventRecord(mStarts[i], mStreams[i]));
+    }
+  }
+
+ private:
+  std::string mName;
+  std::vector<cudaEvent_t> mStarts, mStops;
+  std::vector<cudaStream_t> mStreams;
+};
+#else // ITS_MEASURE_GPU_TIME not defined
+class GPUTimer
+{
+ public:
+  GPUTimer(Stream&, const std::string&) {}
+  GPUTimer(Streams&, const std::string&) {}
+  GPUTimer(Streams&, const std::string&, int, int = 0) {}
+};
+#endif
+
 template <int nLayers>
 TimeFrameGPU<nLayers>::TimeFrameGPU()
 {
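GPUTimer replaces the START/STOP macro pair with an RAII type: events are recorded on construction, and the destructor records, synchronizes and reports at scope exit, so an early return can no longer leave an unmatched STOP. A minimal single-stream sketch of the same idiom (illustrative names, plain cudaError_t handling instead of GPUChkErrS):

// Minimal standalone sketch of the RAII event-timing pattern.
#include <cuda_runtime.h>
#include <cstdio>

class ScopedGpuTimer { // stand-in name, not the class from the patch
 public:
  ScopedGpuTimer(cudaStream_t stream, const char* name) : mStream(stream), mName(name)
  {
    cudaEventCreate(&mStart);
    cudaEventCreate(&mStop);
    cudaEventRecord(mStart, mStream); // enqueue start timestamp, host continues
  }
  ~ScopedGpuTimer()
  {
    cudaEventRecord(mStop, mStream);
    cudaEventSynchronize(mStop); // wait for all work recorded between the events
    float ms = 0.f;
    cudaEventElapsedTime(&ms, mStart, mStop);
    std::printf("Elapsed time for %s: %f ms\n", mName, ms);
    cudaEventDestroy(mStart);
    cudaEventDestroy(mStop);
  }

 private:
  cudaStream_t mStream;
  const char* mName;
  cudaEvent_t mStart, mStop;
};

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  float* dBuf;
  cudaMalloc(&dBuf, 1 << 22);
  {
    ScopedGpuTimer timer(stream, "memset"); // destructor reports at scope exit
    cudaMemsetAsync(dBuf, 0, 1 << 22, stream);
  }
  cudaFree(dBuf);
  cudaStreamDestroy(stream);
  return 0;
}

The multi-stream constructors in the patch generalize this by keeping one start/stop event pair per stream and reporting each stream's elapsed time separately.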
@@ -88,21 +135,20 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
+  GPUTimer timer(mGpuStreams[0], "loading indextable utils");
   if (!iteration) {
     LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator());
   }
   LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
   GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading unsorted clusters");
+    GPUTimer timer(mGpuStreams[0], "loading unsorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator());
@@ -112,7 +158,6 @@ void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
     allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
@@ -120,7 +165,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    GPUTimer timer(mGpuStreams[0], "loading sorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator());
@@ -130,7 +175,6 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
     allocMemAsync(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
@@ -138,7 +182,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    GPUTimer timer(mGpuStreams[0], "loading sorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
@@ -146,7 +190,6 @@ void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
     }
     allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
@@ -154,7 +197,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+    GPUTimer timer(mGpuStreams[0], "creating used clusters flags");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator());
@@ -162,26 +205,24 @@ void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
     }
     allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags");
+  GPUTimer timer(mGpuStreams[0], "loading used clusters flags");
   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / MB);
     GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
+    GPUTimer timer(mGpuStreams[0], "loading ROframe clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
@@ -189,14 +230,13 @@ void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
     }
     allocMemAsync(reinterpret_cast<void**>(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading trackingframeinfo");
+  GPUTimer timer(mGpuStreams[0], "loading trackingframeinfo");
   if (!iteration) {
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
@@ -208,18 +248,16 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
     GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
+    GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask");
     LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
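The multiplicity cut mask is allocated and copied as uint8_t even though the log line counts sizeof(bool). A plausible reason for the byte-per-flag layout, sketched below under that assumption, is that a std::vector<bool> would be bit-packed and expose no contiguous .data() buffer to hand to cudaMemcpyAsync (standalone sketch, not the O2 container):

// Why a byte-sized mask is device-friendly: std::vector<bool> is bit-packed
// and cannot feed cudaMemcpy directly, so flags travel as one uint8_t each.
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

int main()
{
  std::vector<bool> packed = {true, false, true};               // no usable .data()
  std::vector<std::uint8_t> mask(packed.begin(), packed.end()); // one byte per flag

  std::uint8_t* dMask;
  cudaMalloc(&dMask, mask.size() * sizeof(std::uint8_t));
  cudaMemcpy(dMask, mask.data(), mask.size() * sizeof(std::uint8_t), cudaMemcpyHostToDevice);
  cudaFree(dMask);
  return 0;
}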
 
@@ -227,81 +265,76 @@
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
+    GPUTimer timer(mGpuStreams[0], "loading seeding vertices");
     LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs");
+  GPUTimer timer(mGpuStreams, "creating tracklets LUTs", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     if (!iteration) {
       LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", this->mClusters[iLayer].size() + 1, iLayer, (this->mClusters[iLayer].size() + 1) * sizeof(int) / MB);
-      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
+      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
     }
-    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
   }
   if (!iteration) {
     allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
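The tracklet LUTs are sized nClusters + 1 on purpose: counting kernels write per-cluster counts, an exclusive prefix scan turns them in place into start offsets, and the extra trailing entry then holds the grand total, which createTrackletsBuffers below copies back to size the tracklet buffers. A compact sketch of the idiom (the patch goes through the CUB-based gpu::cubExclusiveScanInPlace; thrust is used here only for brevity):

// count -> exclusive-scan LUT idiom: counts[i] = tracklets starting at cluster i;
// after the in-place scan, lut[i] is the first tracklet index of cluster i and
// the trailing entry is the total.
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdio>

int main()
{
  const int counts[] = {2, 0, 3, 1, 0}; // per-cluster counts + one trailing slot
  thrust::device_vector<int> lut(counts, counts + 5);
  thrust::exclusive_scan(lut.begin(), lut.end(), lut.begin()); // in place
  // lut is now {0, 2, 2, 5, 6}: per-cluster offsets, total in the last entry
  std::printf("total tracklets: %d\n", static_cast<int>(lut.back()));
  return 0;
}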
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsBuffers()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
+  GPUTimer timer(mGpuStreams, "creating cells buffers", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     mNTracklets[iLayer] = 0;
     GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost));
     LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[0], this->getExtAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator());
   }
   allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackletsDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
+  GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / MB);
     GPUChkErrS(cudaHostRegister(this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
-    GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
+  GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB);
     GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
   GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
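Both load functions pin the existing host vectors with cudaHostRegister before the asynchronous copies: cudaMemcpyAsync only runs truly asynchronously from page-locked memory, and from pageable memory it degrades to a staged, mostly synchronous transfer. A standalone sketch of the register/copy/unregister sequence:

// Pin-then-copy idiom: register the vector's existing pages instead of
// reallocating a pinned buffer, then the async copy can overlap other work.
#include <cuda_runtime.h>
#include <vector>

int main()
{
  std::vector<float> host(1 << 20, 1.f);
  float* dBuf;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&dBuf, host.size() * sizeof(float));

  // pin the existing pages in place; no copy, no reallocation
  cudaHostRegister(host.data(), host.size() * sizeof(float), cudaHostRegisterPortable);
  cudaMemcpyAsync(dBuf, host.data(), host.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream); // the buffer must stay pinned until the copy finishes
  cudaHostUnregister(host.data());

  cudaFree(dBuf);
  cudaStreamDestroy(stream);
  return 0;
}

This is also why unregisterRest and downloadTrackITSExtDevice later undo the registrations: pinned pages are a limited resource and must be released once the transfers are done.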
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
+  GPUTimer timer(mGpuStreams[0], "creating cells neighbours");
   // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
   allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
@@ -314,76 +347,70 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
       mNNeighbours[iLayer] = 0;
     }
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
+  GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT");
   LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc
   GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadCellsDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
+  GPUTimer timer(mGpuStreams, "loading cell seeds", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
-    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // accessory for the neigh. finding.
-    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
-    GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[iLayer], this->getExtAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); // accessory for the neigh. finding.
+    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
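loadCellsDevice shows the per-layer stream layout used throughout this patch: the allocation, memset and upload of one layer are ordered on that layer's stream only, so independent layers can overlap on the device, while per-frame single buffers (like the pointer array at the end) stay on stream 0. A reduced standalone sketch with illustrative sizes:

// Per-layer streams: each layer's memset + H2D copy chain is ordered on its
// own stream, so the chains of different layers may overlap on the device.
#include <cuda_runtime.h>
#include <vector>

int main()
{
  constexpr int kNLayers = 5;    // illustrative
  constexpr size_t kN = 1 << 20; // illustrative
  std::vector<float> host(kN, 1.f);
  cudaHostRegister(host.data(), kN * sizeof(float), cudaHostRegisterPortable); // pinned source

  cudaStream_t streams[kNLayers];
  float* dBuf[kNLayers];
  for (int l = 0; l < kNLayers; ++l) {
    cudaStreamCreate(&streams[l]);
    cudaMalloc(&dBuf[l], kN * sizeof(float));
    // both operations of layer l are ordered on streams[l] only
    cudaMemsetAsync(dBuf[l], 0, kN * sizeof(float), streams[l]);
    cudaMemcpyAsync(dBuf[l], host.data(), kN * sizeof(float), cudaMemcpyHostToDevice, streams[l]);
  }
  for (int l = 0; l < kNLayers; ++l) {
    cudaStreamSynchronize(streams[l]);
    cudaFree(dBuf[l]);
    cudaStreamDestroy(streams[l]);
  }
  cudaHostUnregister(host.data());
  return 0;
}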
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createCellsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs");
+  GPUTimer timer(mGpuStreams, "creating cells LUTs", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-    GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
+    allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
+    GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
+  GPUTimer timer(mGpuStreams[0], "creating cells buffers");
   mNCells[layer] = 0;
   GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
   LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
-
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadCellsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs");
+  GPUTimer timer(mGpuStreams, "loading cells LUTs", nLayers - 3);
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / MB);
     GPUChkErrS(cudaHostRegister(this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadRoadsDevice()
 {
+  GPUTimer timer(mGpuStreams[0], "loading roads device");
   LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mRoadsDevice), this->mRoads.size() * sizeof(Road), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable));
@@ -393,30 +420,28 @@ void TimeFrameGPU<nLayers>::loadRoadsDevice()
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeed>& seeds)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds");
+  GPUTimer timer(mGpuStreams[0], "loading track seeds");
   LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   this->mCellsNeighbours[layer].clear();
   this->mCellsNeighbours[layer].resize(neighbours.size());
   LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
@@ -424,58 +449,53 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeed>& seeds)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving tracks");
+  GPUTimer timer(mGpuStreams[0], "reserving tracks");
   mTrackITSExt = bounded_vector<o2::its::TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
   LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get()));
   GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cells");
+  GPUTimer timer(mGpuStreams, "downloading cells", nLayers - 2);
   for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
     this->mCells[iLayer].resize(mNCells[iLayer]);
-    GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts");
+  GPUTimer timer(mGpuStreams, "downloading cell luts", nLayers - 3);
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
     LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
     this->mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
-    GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>& neighbours, const int layer)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
+  GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
   // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
   GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
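The TODO above flags the assumption that std::pair and gpuPair share a memory layout. One hedged way to pin that down at compile time (a sketch, not the patch's code): mirror the device pair locally and static_assert size, alignment and member offsets, so a diverging std::pair implementation breaks the build instead of corrupting the download.

// Compile-time layout checks for the pair-aliasing assumption.
#include <cstddef>
#include <type_traits>
#include <utility>

template <typename T1, typename T2>
struct gpuPairMirror { // stand-in for the gpuPair used in the patch
  T1 first;
  T2 second;
};

using HostPair = std::pair<int, int>;
using DevicePair = gpuPairMirror<int, int>;

static_assert(sizeof(HostPair) == sizeof(DevicePair), "size mismatch");
static_assert(alignof(HostPair) == alignof(DevicePair), "alignment mismatch");
static_assert(std::is_standard_layout_v<DevicePair>, "device pair must be standard layout");
// offsetof is only well-defined for standard-layout types, so check the mirror
static_assert(offsetof(DevicePair, first) == 0, "unexpected offset of first");
static_assert(offsetof(DevicePair, second) == sizeof(int), "unexpected offset of second");

int main() { return 0; }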
@@ -484,31 +504,28 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>& neighbours, const int layer)
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut, const int layer)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer));
+  GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours LUT from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB);
   GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeed>& seeds)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks");
+  GPUTimer timer(mGpuStreams[0], "downloading tracks");
   LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB);
   GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
   GPUChkErrS(cudaHostUnregister(seeds.data()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::unregisterRest()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory");
+  GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory");
   LOGP(debug, "unregistering rest of the host memory...");
   GPUChkErrS(cudaHostUnregister(mCellsDevice.data()));
   GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template