diff --git a/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt
index f9565307d35f6..291ddffbf9475 100644
--- a/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt
@@ -9,7 +9,6 @@
 # granted to it by virtue of its status as an Intergovernmental Organization
 # or submit itself to any jurisdiction.
 
-#add_compile_options(-O0 -g -fPIC -fno-omit-frame-pointer)
 o2_add_library(ITStracking
                TARGETVARNAME targetName
                SOURCES src/ClusterLines.cxx
@@ -37,6 +36,7 @@ o2_add_library(ITStracking
                PRIVATE_LINK_LIBRARIES
                        O2::Steer
                        TBB::tbb)
+# target_compile_options(${targetName} PRIVATE -O0 -g -fPIC -fno-omit-frame-pointer)
 
 o2_add_library(ITSTrackingInterface
                TARGETVARNAME targetName
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index 8b3e9bddd18d6..0ad08fd88ccf5 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -23,11 +23,6 @@
 namespace o2::its::gpu
 {
 
-class DefaultGPUAllocator : public ExternalAllocator
-{
-  void* allocate(size_t size) override;
-};
-
 template <int nLayers = 7>
 class TimeFrameGPU : public TimeFrame<nLayers>
 {
@@ -84,7 +79,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
     return mGpuStreams[stream];
   }
   auto& getStreams() { return mGpuStreams; }
-  void wipe(const int);
+  virtual void wipe() final;
 
   /// interface
   int getNClustersInRofSpan(const int, const int, const int) const;
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index 454e39e04a661..802973d5f4000 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -21,6 +21,12 @@
 #include "GPUCommonDef.h"
 #include "GPUCommonHelpers.h"
 
+#ifndef __HIPCC__
+#define THRUST_NAMESPACE thrust::cuda
+#else
+#define THRUST_NAMESPACE thrust::hip
+#endif
+
 namespace o2::its
 {
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index d5ea573a2f0e8..f6d9157b0da68 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -21,8 +21,6 @@
 #include "ITStrackingGPU/TracerGPU.h"
 
 #include <unistd.h>
-#include <thread>
-#include <tuple>
 #include <vector>
 #include <fmt/format.h>
 
@@ -31,14 +29,7 @@
 #include "GPUCommonLogger.h"
 #include "GPUCommonHelpers.h"
 
-namespace o2
-{
-namespace its
-{
-using constants::GB;
-using constants::MB;
-
-namespace gpu
+namespace o2::its::gpu
 {
 
 #ifdef ITS_MEASURE_GPU_TIME
@@ -96,14 +87,19 @@ class GPUTimer
   std::vector<cudaEvent_t> mStarts, mStops;
   std::vector<cudaStream_t> mStreams;
 };
+
+#define GPULog(...) LOGP(info, __VA_ARGS__)
 #else // ITS_MEASURE_GPU_TIME not defined
 class GPUTimer
 {
  public:
-  GPUTimer(Stream&, const std::string&) {}
-  GPUTimer(Streams&, const std::string&) {}
-  GPUTimer(Streams&, const std::string&, int, int = 0) {}
+  template <typename... Args>
+  GPUTimer(Args&&...)
+  {
+  }
 };
+
+#define GPULog(...)
 #endif
 
 template <int nLayers>
@@ -121,7 +117,7 @@ void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream& strea
   if (extAllocator) {
     *ptr = this->mAllocator->allocate(size);
   } else {
-    LOGP(debug, "Calling default CUDA allocator");
+    GPULog("Calling default CUDA allocator");
     GPUChkErrS(cudaMallocAsync(reinterpret_cast<void**>(ptr), size, stream.get()));
   }
 }
@@ -137,10 +133,10 @@ void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
 {
   GPUTimer timer(mGpuStreams[0], "loading indextable utils");
   if (!iteration) {
-    LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
+    GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator());
   }
-  LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
+  GPULog("gpu-transfer: loading IndexTableUtils object, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB);
   GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
 }
 
@@ -149,11 +145,11 @@ void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer(mGpuStreams[0], "loading unsorted clusters");
-    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB);
-      allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator());
+    for (int iLayer{0}; iLayer < nLayers; ++iLayer) {
+      GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / constants::MB);
+      allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator());
       GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
-      GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+      GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
@@ -167,7 +163,7 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
   if (!iteration) {
     GPUTimer timer(mGpuStreams[0], "loading sorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / MB);
+      GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / constants::MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator());
       GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
       GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
@@ -184,7 +180,7 @@ void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
   if (!iteration) {
     GPUTimer timer(mGpuStreams[0], "loading sorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / MB);
+      GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / constants::MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
       GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
@@ -199,7 +195,7 @@ void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
   if (!iteration) {
     GPUTimer timer(mGpuStreams[0], "creating used clusters flags");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
+      GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB);
       allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator());
       GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
     }
@@ -213,7 +209,7 @@ void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
 {
   GPUTimer timer(mGpuStreams[0], "loading used clusters flags");
   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / MB);
+    GPULog("gpu-transfer: loading {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / constants::MB);
     GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
 }
@@ -224,7 +220,7 @@ void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
   if (!iteration) {
     GPUTimer timer(mGpuStreams[0], "loading ROframe clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / MB);
+      GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / constants::MB);
       allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
       GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
@@ -239,7 +235,7 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
   GPUTimer timer(mGpuStreams[0], "loading trackingframeinfo");
   if (!iteration) {
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
+      GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / constants::MB);
       allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[0], this->getExtAllocator());
       GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable));
       GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
@@ -253,10 +249,12 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
 {
-  if (!iteration) {
+  if (!iteration || iteration == 3) { // we need to re-load the swapped mult-mask in upc iteration
     GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask");
-    LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator());
+    GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / constants::MB);
+    if (!iteration) { // only allocate on first call
+      allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator());
+    }
     GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
 }
@@ -266,10 +264,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer(mGpuStreams[0], "loading seeding vertices");
-    LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / MB);
+    GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / MB);
+    GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
@@ -278,13 +276,14 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration)
 {
-  GPUTimer timer(mGpuStreams, "creating tracklets LUTs", nLayers - 1);
+  GPUTimer timer(mGpuStreams[0], "creating tracklets LUTs");
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
+    const int ncls = this->mClusters[iLayer].size() + 1;
     if (!iteration) {
-      LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", this->mClusters[iLayer].size() + 1, iLayer, (this->mClusters[iLayer].size() + 1) * sizeof(int) / MB);
-      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
+      GPULog("gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, iLayer, ncls * sizeof(int) / constants::MB);
+      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), ncls * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
     }
-    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
+    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, ncls * sizeof(int), mGpuStreams[iLayer].get()));
   }
   if (!iteration) {
     allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
@@ -295,11 +294,11 @@ void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration)
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsBuffers()
 {
-  GPUTimer timer(mGpuStreams, "creating cells buffers", nLayers - 1);
-  for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
+  GPUTimer timer(mGpuStreams[0], "creating tracklet buffers");
+  for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     mNTracklets[iLayer] = 0;
-    GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost));
-    LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB);
+    GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
+    GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator());
   }
   allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator());
@@ -312,7 +311,7 @@ void TimeFrameGPU<nLayers>::loadTrackletsDevice()
 {
   GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / MB);
+    GPULog("gpu-transfer: loading {} tracklets on layer {}, for {:.2f} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / constants::MB);
     GPUChkErrS(cudaHostRegister(this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
@@ -323,12 +322,12 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
 {
   GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB);
+    GPULog("gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {:.2f} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / constants::MB);
     GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
   GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable));
-  GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
+  GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
@@ -340,7 +339,7 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
   GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
+    GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
     if (iLayer < nLayers - 3) {
@@ -353,7 +352,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
 {
   GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT");
-  LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
+  GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc
   GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
 }
@@ -363,7 +362,7 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
 {
   GPUTimer timer(mGpuStreams, "loading cell seeds", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / MB);
+    GPULog("gpu-transfer: loading {} cell seeds on layer {}, for {:.2f} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[iLayer], this->getExtAllocator());
     allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); // accessory for the neigh. finding.
     GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
@@ -378,7 +377,7 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice()
 {
   GPUTimer timer(mGpuStreams, "creating cells LUTs", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB);
+    GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / constants::MB);
     allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
     GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get()));
   }
@@ -391,9 +390,9 @@ void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
 {
   GPUTimer timer(mGpuStreams[0], "creating cells buffers");
   mNCells[layer] = 0;
-  GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
-  LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
+  GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
+  GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / constants::MB);
+  allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[layer], this->getExtAllocator());
 }
 
 template <int nLayers>
@@ -401,7 +400,7 @@ void TimeFrameGPU<nLayers>::loadCellsLUTDevice()
 {
   GPUTimer timer(mGpuStreams, "loading cells LUTs", nLayers - 3);
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / MB);
+    GPULog("gpu-transfer: loading cell LUT for {} elements on layer {}, for {:.2f} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / constants::MB);
     GPUChkErrS(cudaHostRegister(this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
@@ -411,7 +410,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::loadRoadsDevice()
 {
   GPUTimer timer(mGpuStreams[0], "loading roads device");
-  LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road<nLayers - 2>) / MB);
+  GPULog("gpu-transfer: loading {} roads, for {:.2f} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road<nLayers - 2>) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mRoadsDevice), this->mRoads.size() * sizeof(Road<nLayers - 2>), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road<nLayers - 2>), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road<nLayers - 2>), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
@@ -421,7 +420,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeed>& seeds)
 {
   GPUTimer timer(mGpuStreams[0], "loading track seeds");
-  LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB);
+  GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
@@ -431,10 +430,10 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
 {
   GPUTimer timer(mGpuStreams[0], "reserving neighbours");
-  LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
+  GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
-  LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
+  GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator());
 }
 
@@ -444,10 +443,10 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std
   GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   this->mCellsNeighbours[layer].clear();
   this->mCellsNeighbours[layer].resize(neighbours.size());
-  LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
+  GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
-  LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
+  GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
 }
 
@@ -464,7 +463,7 @@ void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeed>& se
 {
   GPUTimer timer(mGpuStreams[0], "reserving tracks");
   mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
-  LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB);
+  GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get()));
   GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
@@ -475,7 +474,7 @@ void TimeFrameGPU<nLayers>::downloadCellsDevice()
 {
   GPUTimer timer(mGpuStreams, "downloading cells", nLayers - 2);
   for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
+    GPULog("gpu-transfer: downloading {} cells on layer: {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB);
     this->mCells[iLayer].resize(mNCells[iLayer]);
     GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
   }
@@ -486,7 +485,7 @@ void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
 {
   GPUTimer timer(mGpuStreams, "downloading cell luts", nLayers - 3);
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
-    LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
+    GPULog("gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
     this->mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
     GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
   }
@@ -496,7 +495,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>& neighbours, const int layer)
 {
   GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours from layer {}", layer));
-  LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
+  GPULog("gpu-transfer: downloading {} neighbours, for {:.2f} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / constants::MB);
   // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
   GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
 }
@@ -505,7 +504,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut, const int layer)
 {
   GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours LUT from layer {}", layer));
-  LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB);
+  GPULog("gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", lut.size(), layer, lut.size() * sizeof(int) / constants::MB);
   GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
 }
 
@@ -513,7 +512,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeed>& seeds)
 {
   GPUTimer timer(mGpuStreams[0], "downloading tracks");
-  LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB);
+  GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
   GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
   GPUChkErrS(cudaHostUnregister(seeds.data()));
@@ -523,7 +522,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::unregisterRest()
 {
   GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory");
-  LOGP(debug, "unregistering rest of the host memory...");
+  GPULog("unregistering rest of the host memory...");
   GPUChkErrS(cudaHostUnregister(mCellsDevice.data()));
   GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
 }
@@ -531,6 +530,8 @@ void TimeFrameGPU<nLayers>::unregisterRest()
 template <int nLayers>
 void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
 {
+  GPUTimer timer(mGpuStreams[0], "unregistering host memory");
+  GPULog("unregistering host memory");
   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
     GPUChkErrS(cudaHostUnregister(this->mUnsortedClusters[iLayer].data()));
     GPUChkErrS(cudaHostUnregister(this->mClusters[iLayer].data()));
@@ -552,7 +553,12 @@ void TimeFrameGPU<nLayers>::initialise(const int iteration,
   o2::its::TimeFrame<nLayers>::initialise(iteration, trkParam, maxLayers);
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::wipe()
+{
+  unregisterHostMemory(0);
+  o2::its::TimeFrame<nLayers>::wipe();
+}
+
 template class TimeFrameGPU<7>;
-} // namespace gpu
-} // namespace its
-} // namespace o2
+} // namespace o2::its::gpu
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index 1b1c4af682dc5..a8061e872c029 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -18,10 +18,10 @@
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
 #include "ITStrackingGPU/TrackingKernels.h"
 #include "ITStracking/TrackingConfigParam.h"
+#include "ITStracking/Constants.h"
 
 namespace o2::its
 {
-constexpr int UnusedIndex{-1};
 
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
@@ -48,10 +48,8 @@ void TrackerTraitsGPU<nLayers>::adoptTimeFrame(TimeFrame<nLayers>* tf)
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int iROFslice, int iVertex)
 {
-  auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
+  const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
 
-  const Vertex diamondVert({this->mTrkParams[iteration].Diamond[0], this->mTrkParams[iteration].Diamond[1], this->mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
-  gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
   int startROF{this->mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * this->mTrkParams[iteration].nROFsPerIterations : 0};
   int endROF{o2::gpu::CAMath::Min(this->mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * this->mTrkParams[iteration].nROFsPerIterations + this->mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())};
 
@@ -128,6 +126,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
 
   for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
     if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) {
+      mTimeFrameGPU->getNCells()[iLayer] = 0;
       continue;
     }
     const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
@@ -173,9 +172,10 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
   mTimeFrameGPU->createNeighboursIndexTablesDevice();
   const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
   for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
+    const int currentLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer])};
     const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
-
-    if (!nextLayerCellsNum) {
+    if (!nextLayerCellsNum || !currentLayerCellsNum) {
+      mTimeFrameGPU->getNNeighbours()[iLayer] = 0;
       continue;
     }
 
@@ -188,7 +188,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
                                                      this->mTrkParams[0].MaxChi2ClusterAttachment,
                                                      this->mBz,
                                                      iLayer,
-                                                     mTimeFrameGPU->getNCells()[iLayer],
+                                                     currentLayerCellsNum,
                                                      nextLayerCellsNum,
                                                      1e2,
                                                      conf.nBlocks,
@@ -204,7 +204,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
                                  this->mTrkParams[0].MaxChi2ClusterAttachment,
                                  this->mBz,
                                  iLayer,
-                                 mTimeFrameGPU->getNCells()[iLayer],
+                                 currentLayerCellsNum,
                                  nextLayerCellsNum,
                                  1e2,
                                  conf.nBlocks,
@@ -251,8 +251,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
                                         conf.nThreads);
     }
     // fixme: I don't want to move tracks back and forth, but I need a way to use a thrust::allocator that is aware of our managed memory.
-    if (!trackSeeds.size()) {
-      LOGP(info, "No track seeds found, skipping track finding");
+    if (trackSeeds.empty()) {
+      LOGP(debug, "No track seeds found, skipping track finding");
       continue;
     }
     mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
@@ -283,7 +283,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       int nShared = 0;
       bool isFirstShared{false};
       for (int iLayer{0}; iLayer < this->mTrkParams[0].NLayers; ++iLayer) {
-        if (track.getClusterIndex(iLayer) == UnusedIndex) {
+        if (track.getClusterIndex(iLayer) == constants::UnusedIndex) {
           continue;
         }
         nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
@@ -296,7 +296,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
 
       std::array<int, 3> rofs{INT_MAX, INT_MAX, INT_MAX};
       for (int iLayer{0}; iLayer < this->mTrkParams[0].NLayers; ++iLayer) {
-        if (track.getClusterIndex(iLayer) == UnusedIndex) {
+        if (track.getClusterIndex(iLayer) == constants::UnusedIndex) {
           continue;
         }
         mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
@@ -320,9 +320,6 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
     }
     mTimeFrameGPU->loadUsedClustersDevice();
   }
-  if (iteration == this->mTrkParams.size() - 1) {
-    mTimeFrameGPU->unregisterHostMemory(0);
-  }
 };
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 38c59d520aa76..fb75764da2e36 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -36,15 +36,6 @@
 #include "ITStrackingGPU/TrackingKernels.h"
 #include "ITStrackingGPU/Utils.h"
 
-#ifndef __HIPCC__
-#define THRUST_NAMESPACE thrust::cuda
-#else
-#define THRUST_NAMESPACE thrust::hip
-#endif
-
-#define GPU_BLOCKS GPUCA_DETERMINISTIC_CODE(1, 99999)
-#define GPU_THREADS GPUCA_DETERMINISTIC_CODE(1, 99999)
-
 // O2 track model
 #include "ReconstructionDataFormats/Track.h"
 #include "DetectorsBase/Propagator.h"
@@ -112,7 +103,6 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde
 
   if (zRangeMax < -utils.getLayerZ(layerIndex) ||
       zRangeMin > utils.getLayerZ(layerIndex) || zRangeMin > zRangeMax) {
-
     return getEmptyBinsRect();
   }
 
@@ -398,22 +388,20 @@ GPUg() void computeLayerCellNeighboursKernel(
       }
       if constexpr (initRun) {
         atomicAdd(neighboursLUT + iNextCell, 1);
-        foundNeighbours++;
         neighboursIndexTable[iCurrentCellIndex]++;
       } else {
         cellNeighbours[neighboursIndexTable[iCurrentCellIndex] + foundNeighbours] = {iCurrentCellIndex, iNextCell};
         foundNeighbours++;
-        // FIXME: this is prone to race conditions: check on level is not atomic
         const int currentCellLevel{currentCellSeed.getLevel()};
         if (currentCellLevel >= nextCellSeed.getLevel()) {
-          cellSeedArray[layerIndex + 1][iNextCell].setLevel(currentCellLevel + 1);
+          atomicMax(cellSeedArray[layerIndex + 1][iNextCell].getLevelPtr(), currentCellLevel + 1);
         }
       }
     }
   }
 }
 
-template <bool initRun, int nLayers = 7>
+template <bool initRun>
 GPUg() void computeLayerCellsKernel(
   const Cluster** sortedClusters,
   const Cluster** unsortedClusters,
@@ -530,8 +518,14 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
   for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) {
     const short rof0 = iROF + startROF;
     auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices);
+    if (primaryVertices.empty()) {
+      continue;
+    }
     const auto startVtx{vertexId >= 0 ? vertexId : 0};
     const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast<int>(primaryVertices.size())) : static_cast<int>(primaryVertices.size())};
+    if ((endVtx - startVtx) <= 0) {
+      continue;
+    }
     const short minROF = o2::gpu::CAMath::Max(startROF, static_cast<int>(rof0 - deltaROF));
     const short maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast<int>(rof0 + deltaROF));
     auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters);
@@ -541,7 +535,7 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
 
     for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) {
       unsigned int storedTracklets{0};
-      auto currentCluster{clustersCurrentLayer[currentClusterIndex]};
+      const auto& currentCluster{clustersCurrentLayer[currentClusterIndex]};
       const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex};
       if (usedClusters[layerIndex][currentCluster.clusterId]) {
         continue;
@@ -550,14 +544,15 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
       const float inverseR0{1.f / currentCluster.radius};
       for (int iV{startVtx}; iV < endVtx; ++iV) {
         auto& primaryVertex{primaryVertices[iV]};
-        if (primaryVertex.isFlagSet(2) && iteration != 3) {
+        if ((primaryVertex.isFlagSet(Vertex::Flags::UPCMode) && iteration != 3) || (iteration == 3 && !primaryVertex.isFlagSet(Vertex::Flags::UPCMode))) {
           continue;
         }
+
         const float resolution = o2::gpu::CAMath::Sqrt(math_utils::Sq(resolutionPV) / primaryVertex.getNContributors() + math_utils::Sq(positionResolution));
         const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0};
         const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate};
         const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
-        const float sqInverseDeltaZ0{1.f / (math_utils::Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution
+        const float sqInverseDeltaZ0{1.f / (math_utils::Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + constants::Tolerance)}; /// protecting from overflows adding the detector resolution
         const float sigmaZ{o2::gpu::CAMath::Sqrt(math_utils::Sq(resolution) * math_utils::Sq(tanLambda) * ((math_utils::Sq(inverseR0) + sqInverseDeltaZ0) * math_utils::Sq(meanDeltaR) + 1.f) + math_utils::Sq(meanDeltaR * MSAngle))};
         const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex + 1, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)};
         if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
@@ -579,8 +574,8 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
             int iPhiBin = (selectedBinsRect.y + iPhiCount) % phiBins;
             const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
             const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
-            const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex];
-            const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex];
+            const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1)*tableSize + firstBinIndex];
+            const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1)*tableSize + maxBinIndex];
             for (int nextClusterIndex{firstRowClusterIndex}; nextClusterIndex < maxRowClusterIndex; ++nextClusterIndex) {
               if (nextClusterIndex >= clustersNextLayer.size()) {
                 break;
@@ -591,13 +586,13 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
               }
               const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)};
               const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)};
-              const int nextSortedIndex{ROFClusters[layerIndex + 1][rof1] + nextClusterIndex};
               if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - o2::constants::math::TwoPI) < phiCut)) {
                 if constexpr (initRun) {
                   trackletsLUT[layerIndex][currentSortedIndex]++; // we need l0 as well for usual exclusive sums.
                 } else {
                   const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)};
                   const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
+                  const int nextSortedIndex{ROFClusters[layerIndex + 1][rof1] + nextClusterIndex};
                   new (tracklets[layerIndex] + trackletsLUT[layerIndex][currentSortedIndex] + storedTracklets) Tracklet{currentSortedIndex, nextSortedIndex, tanL, phi, rof0, rof1};
                 }
                 ++storedTracklets;
@@ -841,11 +836,9 @@ GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre
 {
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out,
-                                           in_out, num_items, stream));
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
   GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out,
-                                           in_out, num_items, stream));
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
   GPUChkErrS(cudaFree(d_temp_storage));
 }
 
@@ -860,11 +853,9 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre
 {
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out,
-                                           in_out, num_items, stream));
+  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
   GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out,
-                                           in_out, num_items, stream));
+  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
   GPUChkErrS(cudaFree(d_temp_storage));
 }
 
@@ -907,10 +898,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils,
                                  gpu::Streams& streams)
 {
   for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) {
-    gpu::computeLayerTrackletsMultiROFKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                                     o2::gpu::CAMath::Min(nThreads, GPU_THREADS),
-                                                     0,
-                                                     streams[iLayer].get()>>>(
+    gpu::computeLayerTrackletsMultiROFKernel<true><<<nBlocks, nThreads, 0, streams[iLayer].get()>>>(
       utils,
       multMask,
       iLayer,
@@ -976,10 +964,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
                                    gpu::Streams& streams)
 {
   for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) {
-    gpu::computeLayerTrackletsMultiROFKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                                      o2::gpu::CAMath::Min(nThreads, GPU_THREADS),
-                                                      0,
-                                                      streams[iLayer].get()>>>(
+    gpu::computeLayerTrackletsMultiROFKernel<false><<<nBlocks, nThreads, 0, streams[iLayer].get()>>>(
       utils,
       multMask,
       iLayer,
@@ -1013,10 +998,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
     nTracklets[iLayer] = unique_end - tracklets_ptr;
     if (iLayer > 0) {
       GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get()));
-      gpu::compileTrackletsLookupTableKernel<<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                               o2::gpu::CAMath::Min(nThreads, GPU_THREADS),
-                                               0,
-                                               streams[iLayer].get()>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]);
+      gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads, 0, streams[iLayer].get()>>>(
+        spanTracklets[iLayer],
+        trackletsLUTsHost[iLayer],
+        nTracklets[iLayer]);
       gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get());
     }
   }
@@ -1040,8 +1025,7 @@ void countCellsHandler(
   const int nBlocks,
   const int nThreads)
 {
-  gpu::computeLayerCellsKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                       o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::computeLayerCellsKernel<true><<<nBlocks, nThreads>>>(
     sortedClusters,           // const Cluster**
     unsortedClusters,         // const Cluster**
     tfInfo,                   // const TrackingFrameInfo**
@@ -1076,8 +1060,7 @@ void computeCellsHandler(
   const int nBlocks,
   const int nThreads)
 {
-  gpu::computeLayerCellsKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                        o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::computeLayerCellsKernel<false><<<nBlocks, nThreads>>>(
     sortedClusters,           // const Cluster**
     unsortedClusters,         // const Cluster**
     tfInfo,                   // const TrackingFrameInfo**
@@ -1107,8 +1090,7 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                         const int nBlocks,
                                         const int nThreads)
 {
-  gpu::computeLayerCellNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                                o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::computeLayerCellNeighboursKernel<true><<<nBlocks, nThreads>>>(
     cellsLayersDevice,
     neighboursLUT,
     neighboursIndexTable,
@@ -1142,8 +1124,7 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                   const int nThreads)
 {
 
-  gpu::computeLayerCellNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                                 o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::computeLayerCellNeighboursKernel<false><<<nBlocks, nThreads>>>(
     cellsLayersDevice,
     neighboursLUT,
     neighboursIndexTable,
@@ -1198,8 +1179,7 @@ void processNeighboursHandler(const int startLayer,
   thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
                                                                                                              // TODO: fix this.
 
-  gpu::processNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                       o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(
     startLayer,
     startLevel,
     allCellSeeds,
@@ -1221,8 +1201,7 @@ void processNeighboursHandler(const int startLayer,
 
   thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt);
   thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed);
-  gpu::processNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                        o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(
     startLayer,
     startLevel,
     allCellSeeds,
@@ -1255,8 +1234,7 @@ void processNeighboursHandler(const int startLayer,
     foundSeedsTable.resize(lastCellSeedSize + 1);
     thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
 
-    gpu::processNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                         o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+    gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(
       iLayer,
       --level,
       allCellSeeds,
@@ -1282,8 +1260,7 @@ void processNeighboursHandler(const int startLayer,
     updatedCellSeed.resize(foundSeeds);
     thrust::fill(updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed());
 
-    gpu::processNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                                          o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+    gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(
       iLayer,
       level,
       allCellSeeds,
@@ -1326,8 +1303,7 @@ void trackSeedHandler(CellSeed* trackSeeds,
                       const int nThreads)
 {
   thrust::device_vector<float> minPts(minPtsHost);
-  gpu::fitTrackSeedsKernel<<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
-                             o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::fitTrackSeedsKernel<<<nBlocks, nThreads>>>(
     trackSeeds,                           // CellSeed*
     foundTrackingFrameInfo,               // TrackingFrameInfo**
     tracks,                               // TrackITSExt*
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
index cc43b6845a714..e8e475f2232c8 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
@@ -12,6 +12,7 @@
 if(HIP_ENABLED)
   message(STATUS "Building ITS HIP tracker")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fgpu-rdc")
+  # set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O0 -g -ggdb -fno-inline -fno-omit-frame-pointer -D__HIP_ENABLE_DEVICE_ASSERT__")
   o2_add_hipified_library(ITStrackingHIP
                  SOURCES ../cuda/ClusterLinesGPU.cu
                          ../cuda/TimeFrameGPU.cu
@@ -28,4 +29,4 @@ if(HIP_ENABLED)
                                        hip::host
                  PRIVATE_LINK_LIBRARIES O2::GPUTrackingHIPExternalProvider
                  TARGETVARNAME targetName)
-endif()
\ No newline at end of file
+endif()
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index 3f0d291d5e51d..5e08e6c48e03a 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -308,7 +308,7 @@ struct TimeFrame {
 
   const o2::base::PropagatorImpl<float>* mPropagatorDevice = nullptr; // Needed only for GPU
 
-  void wipe();
+  virtual void wipe();
 
  private:
   void prepareClusters(const TrackingParameters& trkParam, const int maxLayers = nLayers);
@@ -531,8 +531,8 @@ inline gsl::span<int> TimeFrame<nLayers>::getIndexTable(int rofId, int layer)
   if (rofId < 0 || rofId >= mNrof) {
     return {};
   }
-  return {&mIndexTables[layer][rofId * (mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1)],
-          static_cast<gsl::span<int>::size_type>(mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1)};
+  const int tableSize = mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1;
+  return {&mIndexTables[layer][rofId * tableSize], static_cast<gsl::span<int>::size_type>(tableSize)};
 }
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h
index bf9cb79169566..f123a2a9a1d80 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h
@@ -61,7 +61,6 @@ class ITSTrackingInterface
   void initialise();
   void run(framework::ProcessingContext& pc);
   void printSummary() const;
-  void end();
 
   virtual void updateTimeDependentParams(framework::ProcessingContext& pc);
   virtual void finaliseCCDB(framework::ConcreteDataMatcher& matcher, void* obj);
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
index 2e9ce23719f90..a59c51949b9f9 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
@@ -583,7 +583,6 @@ void TimeFrame<nLayers>::printSliceInfo(const int startROF, const int sliceSize)
 template <int nLayers>
 void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool)
 {
-  wipe();
   mMemoryPool = pool;
 
   auto initVector = [&]<typename T>(bounded_vector<T>& vec) {
diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
index ba722c410f95c..938356050262d 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
@@ -65,7 +65,6 @@ void Tracker::clustersToTracks(const LogFunc& logger, const LogFunc& error)
          (double)mTimeFrame->getArtefactsMemory() / GB, (double)mTrkParams[iteration].MaxMemory / GB);
     LOGP(error, "Exception: {}", err.what());
     if (mTrkParams[iteration].DropTFUponFailure) {
-      mTimeFrame->wipe();
       mMemoryPool->print();
       ++mNumberOfDroppedTFs;
       error("...Dropping Timeframe...");
@@ -144,17 +143,17 @@ void Tracker::clustersToTracks(const LogFunc& logger, const LogFunc& error)
     error("Uncaught exception, all bets are off...");
   }
 
-  if (mTrkParams[0].PrintMemory) {
-    mTimeFrame->printArtefactsMemory();
-    mMemoryPool->print();
-  }
-
   if (mTimeFrame->hasMCinformation()) {
     computeTracksMClabels();
   }
   rectifyClusterIndices();
   ++mTimeFrameCounter;
   mTotalTime += total;
+
+  if (mTrkParams[0].PrintMemory) {
+    mTimeFrame->printArtefactsMemory();
+    mMemoryPool->print();
+  }
 }
 
 void Tracker::computeRoadsMClabels()
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
index 136ebc647cc38..b46e7a68875e6 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
@@ -107,9 +107,10 @@ void TrackerTraits<nLayers>::computeLayerTracklets(const int iteration, int iROF
 
           for (int iV = startVtx; iV < endVtx; ++iV) {
             const auto& pv = primaryVertices[iV];
-            if (pv.isFlagSet(Vertex::Flags::UPCMode) && iteration != 3) {
+            if ((pv.isFlagSet(Vertex::Flags::UPCMode) && iteration != 3) || (iteration == 3 && !pv.isFlagSet(Vertex::Flags::UPCMode))) {
               continue;
             }
+
             const float resolution = o2::gpu::CAMath::Sqrt(math_utils::Sq(mTimeFrame->getPositionResolution(iLayer)) + math_utils::Sq(mTrkParams[iteration].PVres) / float(pv.getNContributors()));
             const float tanLambda = (currentCluster.zCoordinate - pv.getZ()) * inverseR0;
             const float zAtRmin = tanLambda * (mTimeFrame->getMinR(iLayer + 1) - currentCluster.radius) + currentCluster.zCoordinate;
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx
index 00a69a37cb51a..3b05a7655d68c 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx
@@ -310,6 +310,7 @@ void ITSTrackingInterface::run(framework::ProcessingContext& pc)
       LOGP(info, "ITSTracker pushed {} vertex purities", allVerticesPurities.size());
     }
   }
+  mTimeFrame->wipe();
 }
 
 void ITSTrackingInterface::updateTimeDependentParams(framework::ProcessingContext& pc)
@@ -381,11 +382,6 @@ void ITSTrackingInterface::printSummary() const
   mTracker->printSummary();
 }
 
-void ITSTrackingInterface::end()
-{
-  mTimeFrame->wipe();
-}
-
 void ITSTrackingInterface::setTraitsFromProvider(VertexerTraits* vertexerTraits,
                                                  TrackerTraits7* trackerTraits,
                                                  TimeFrame7* frame)
diff --git a/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx b/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx
index c825ec39d3499..be7750964b3e7 100644
--- a/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx
+++ b/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx
@@ -75,7 +75,6 @@ void TrackerDPL::endOfStream(EndOfStreamContext& ec)
 
 void TrackerDPL::end()
 {
-  mITSTrackingInterface.end();
   mITSTrackingInterface.printSummary();
   LOGF(info, "ITS CA-Tracker total timing: Cpu: %.3e Real: %.3e s in %d slots", mTimer.CpuTime(), mTimer.RealTime(), mTimer.Counter() - 1);
 }