diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index a1d52bff11f9d..2bd1550e7c72b 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -25,13 +25,13 @@ namespace o2::its::gpu
 {
 
 template <int nLayers = 7>
-class TimeFrameGPU : public TimeFrame<nLayers>
+class TimeFrameGPU final : public TimeFrame<nLayers>
 {
   using typename TimeFrame<nLayers>::CellSeedN;
   using typename TimeFrame<nLayers>::IndexTableUtilsN;
 
  public:
-  TimeFrameGPU();
+  TimeFrameGPU() = default;
   ~TimeFrameGPU() = default;
 
   /// Most relevant operations
@@ -44,13 +44,13 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   void loadTrackingFrameInfoDevice(const int, const int);
   void createTrackingFrameInfoDeviceArray(const int);
   void loadUnsortedClustersDevice(const int, const int);
-  void createUnsortedClustersDeviceArray(const int);
+  void createUnsortedClustersDeviceArray(const int, const int = nLayers);
   void loadClustersDevice(const int, const int);
-  void createClustersDeviceArray(const int);
+  void createClustersDeviceArray(const int, const int = nLayers);
   void loadClustersIndexTables(const int, const int);
-  void createClustersIndexTablesArray(const int iteration);
+  void createClustersIndexTablesArray(const int);
   void createUsedClustersDevice(const int, const int);
-  void createUsedClustersDeviceArray(const int);
+  void createUsedClustersDeviceArray(const int, const int = nLayers);
   void loadUsedClustersDevice();
   void loadROFrameClustersDevice(const int, const int);
   void createROFrameClustersDeviceArray(const int);
@@ -85,6 +85,12 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   void downloadCellsDevice();
   void downloadCellsLUTDevice();
 
+  /// Vertexer
+  void createVtxTrackletsLUTDevice(const int32_t);
+  void createVtxTrackletsBuffers(const int32_t);
+  void createVtxLinesLUTDevice(const int32_t);
+  void createVtxLinesBuffer(const int32_t);
+
   /// synchronization
   auto& getStream(const size_t stream) { return mGpuStreams[stream]; }
   auto& getStreams() { return mGpuStreams; }
@@ -98,6 +104,8 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   virtual void wipe() final;
 
   /// interface
+  virtual bool isGPU() const noexcept final { return true; }
+  virtual const char* getName() const noexcept { return "GPU"; }
   int getNClustersInRofSpan(const int, const int, const int) const;
   IndexTableUtilsN* getDeviceIndexTableUtils() { return mIndexTableUtilsDevice; }
   int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; }
@@ -122,7 +130,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
   const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; }
   std::vector<unsigned int> getClusterSizes();
-  const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
+  uint8_t** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
   const int** getDeviceROFrameClusters() const { return mROFramesClustersDeviceArray; }
   Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; }
   int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
@@ -135,6 +143,19 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
   uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; }
 
+  // Vertexer
+  auto& getDeviceNTrackletsPerROF() const noexcept { return mNTrackletsPerROFDevice; }
+  auto& getDeviceNTrackletsPerCluster() const noexcept { return mNTrackletsPerClusterDevice; }
+  auto& getDeviceNTrackletsPerClusterSum() const noexcept { return mNTrackletsPerClusterSumDevice; }
+  int32_t** getDeviceArrayNTrackletsPerROF() const noexcept { return mNTrackletsPerROFDeviceArray; }
+  int32_t** getDeviceArrayNTrackletsPerCluster() const noexcept { return mNTrackletsPerClusterDeviceArray; }
+  int32_t** getDeviceArrayNTrackletsPerClusterSum() const noexcept { return mNTrackletsPerClusterSumDeviceArray; }
+  uint8_t* getDeviceUsedTracklets() const noexcept { return mUsedTrackletsDevice; }
+  int32_t* getDeviceNLinesPerCluster() const noexcept { return mNLinesPerClusterDevice; }
+  int32_t* getDeviceNLinesPerClusterSum() const noexcept { return mNLinesPerClusterSumDevice; }
+  Line* getDeviceLines() const noexcept { return mLinesDevice; }
+  gsl::span<int*> getDeviceTrackletsPerROFs() { return mNTrackletsPerROFDevice; }
+
   void setDevicePropagator(const o2::base::PropagatorImpl<float>* p) final { this->mPropagatorDevice = p; }
 
   // Host-specific getters
@@ -180,7 +201,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   const Cluster** mClustersDeviceArray;
   const Cluster** mUnsortedClustersDeviceArray;
   const int** mClustersIndexTablesDeviceArray;
-  const unsigned char** mUsedClustersDeviceArray;
+  uint8_t** mUsedClustersDeviceArray;
   const int** mROFramesClustersDeviceArray;
   std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
   std::array<int*, nLayers - 1> mTrackletsLUTDevice;
@@ -208,6 +229,18 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
   const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
 
+  /// Vertexer
+  std::array<int32_t*, 2> mNTrackletsPerROFDevice;
+  std::array<int32_t*, 2> mNTrackletsPerClusterDevice;
+  std::array<int32_t*, 2> mNTrackletsPerClusterSumDevice;
+  uint8_t* mUsedTrackletsDevice;
+  int32_t* mNLinesPerClusterDevice;
+  int32_t* mNLinesPerClusterSumDevice;
+  int32_t** mNTrackletsPerROFDeviceArray;
+  int32_t** mNTrackletsPerClusterDeviceArray;
+  int32_t** mNTrackletsPerClusterSumDeviceArray;
+  Line* mLinesDevice;
+
   // State
   Streams mGpuStreams;
   std::bitset<nLayers + 1> mPinnedUnsortedClusters{0};
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index 15fe6f05f7850..e5b9253ca4090 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -20,15 +20,22 @@
 #include <string>
 #include <tuple>
 
+#include "ITStracking/MathUtils.h"
+#include "ITStracking/ExternalAllocator.h"
+
 #include "GPUCommonDef.h"
 #include "GPUCommonHelpers.h"
 #include "GPUCommonLogger.h"
+#include "GPUCommonDefAPI.h"
 
+#ifdef GPUCA_GPUCODE
+#include <thrust/device_ptr.h>
 #ifndef __HIPCC__
 #define THRUST_NAMESPACE thrust::cuda
 #else
 #define THRUST_NAMESPACE thrust::hip
 #endif
+#endif
 
 #ifdef ITS_GPU_LOG
 #define GPULog(...) LOGP(info, __VA_ARGS__)
@@ -38,6 +45,10 @@
 
 namespace o2::its
 {
+// FWD declarations
+template <int>
+class IndexTableUtils;
+class Tracklet;
 
 template <typename T1, typename T2>
 using gpuPair = std::pair<T1, T2>;
@@ -282,6 +293,184 @@ class GPUTimer
   }
 };
 #endif
+
+#ifdef GPUCA_GPUCODE
+template <typename T>
+struct TypedAllocator {
+  using value_type = T;
+  using pointer = thrust::device_ptr<T>;
+  using const_pointer = thrust::device_ptr<const T>;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+
+  TypedAllocator() noexcept : mInternalAllocator(nullptr) {}
+  explicit TypedAllocator(ExternalAllocator* a) noexcept : mInternalAllocator(a) {}
+
+  template <typename U>
+  TypedAllocator(const TypedAllocator<U>& o) noexcept : mInternalAllocator(o.mInternalAllocator)
+  {
+  }
+
+  pointer allocate(size_type n)
+  {
+    void* raw = mInternalAllocator->allocate(n * sizeof(T));
+    return thrust::device_pointer_cast(static_cast<T*>(raw));
+  }
+
+  void deallocate(pointer p, size_type n) noexcept
+  {
+    if (!p) {
+      return;
+    }
+    void* raw = thrust::raw_pointer_cast(p);
+    mInternalAllocator->deallocate(static_cast<char*>(raw), n * sizeof(T));
+  }
+
+  bool operator==(TypedAllocator const& o) const noexcept
+  {
+    return mInternalAllocator == o.mInternalAllocator;
+  }
+  bool operator!=(TypedAllocator const& o) const noexcept
+  {
+    return !(*this == o);
+  }
+
+ private:
+  ExternalAllocator* mInternalAllocator;
+};
+
+template <int nLayers>
+GPUdii() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
+                                const o2::its::IndexTableUtils<nLayers>* utils,
+                                const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
+{
+  const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz;
+  const float phiRangeMin = (maxdeltaphi > o2::constants::math::PI) ? 0.f : currentCluster.phi - maxdeltaphi;
+  const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz;
+  const float phiRangeMax = (maxdeltaphi > o2::constants::math::PI) ? o2::constants::math::TwoPI : currentCluster.phi + maxdeltaphi;
+
+  if (zRangeMax < -utils->getLayerZ(layerIndex) ||
+      zRangeMin > utils->getLayerZ(layerIndex) || zRangeMin > zRangeMax) {
+    return {};
+  }
+
+  return int4{o2::gpu::CAMath::Max(0, utils->getZBinIndex(layerIndex, zRangeMin)),
+              utils->getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)),
+              o2::gpu::CAMath::Min(utils->getNzBins() - 1, utils->getZBinIndex(layerIndex, zRangeMax)),
+              utils->getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))};
+}
+
+GPUdii() gpuSpan<const Vertex> getPrimaryVertices(const int rof,
+                                                  const int* roframesPV,
+                                                  const int nROF,
+                                                  const uint8_t* mask,
+                                                  const Vertex* vertices)
+{
+  const int start_pv_id = roframesPV[rof];
+  const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1;
+  size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded
+  return gpuSpan<const Vertex>(&vertices[start_pv_id], delta);
+};
+
+GPUdii() gpuSpan<const Vertex> getPrimaryVertices(const int romin,
+                                                  const int romax,
+                                                  const int* roframesPV,
+                                                  const int nROF,
+                                                  const Vertex* vertices)
+{
+  const int start_pv_id = roframesPV[romin];
+  const int stop_rof = romax >= nROF - 1 ? nROF : romax + 1;
+  return gpuSpan<const Vertex>(&vertices[start_pv_id], roframesPV[stop_rof] - roframesPV[romin]);
+};
+
+GPUdii() gpuSpan<const Cluster> getClustersOnLayer(const int rof,
+                                                   const int totROFs,
+                                                   const int layer,
+                                                   const int** roframesClus,
+                                                   const Cluster** clusters)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<const Cluster>();
+  }
+  const int start_clus_id{roframesClus[layer][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id;
+  return gpuSpan<const Cluster>(&(clusters[layer][start_clus_id]), delta);
+}
+
+GPUdii() gpuSpan<const Tracklet> getTrackletsPerCluster(const int rof,
+                                                        const int totROFs,
+                                                        const int mode,
+                                                        const int** roframesClus,
+                                                        const Tracklet** tracklets)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<const Tracklet>();
+  }
+  const int start_clus_id{roframesClus[1][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
+  return gpuSpan<const Tracklet>(&(tracklets[mode][start_clus_id]), delta);
+}
+
+GPUdii() gpuSpan<int> getNTrackletsPerCluster(const int rof,
+                                              const int totROFs,
+                                              const int mode,
+                                              const int** roframesClus,
+                                              int** ntracklets)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<int>();
+  }
+  const int start_clus_id{roframesClus[1][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
+  return gpuSpan<int>(&(ntracklets[mode][start_clus_id]), delta);
+}
+
+GPUdii() gpuSpan<const int> getNTrackletsPerCluster(const int rof,
+                                                    const int totROFs,
+                                                    const int mode,
+                                                    const int** roframesClus,
+                                                    const int** ntracklets)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<const int>();
+  }
+  const int start_clus_id{roframesClus[1][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
+  return gpuSpan<const int>(&(ntracklets[mode][start_clus_id]), delta);
+}
+
+GPUdii() gpuSpan<int> getNLinesPerCluster(const int rof,
+                                          const int totROFs,
+                                          const int** roframesClus,
+                                          int* nlines)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<int>();
+  }
+  const int start_clus_id{roframesClus[1][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
+  return gpuSpan<int>(&(nlines[start_clus_id]), delta);
+}
+
+GPUdii() gpuSpan<const int> getNLinesPerCluster(const int rof,
+                                                const int totROFs,
+                                                const int** roframesClus,
+                                                const int* nlines)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<const int>();
+  }
+  const int start_clus_id{roframesClus[1][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
+  return gpuSpan<const int>(&(nlines[start_clus_id]), delta);
+}
+#endif
 } // namespace gpu
 } // namespace o2::its
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h
index 5b1d9194e1174..dddc247466c65 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h
@@ -31,32 +31,25 @@
 namespace o2::its
 {
 
-class VertexerTraitsGPU final : public VertexerTraits
+template <int nLayers>
+class VertexerTraitsGPU final : public VertexerTraits<nLayers>
 {
  public:
   void initialise(const TrackingParameters&, const int iteration = 0) final;
-  void adoptTimeFrame(TimeFrame<7>*) noexcept final;
+  void adoptTimeFrame(TimeFrame<nLayers>* tf) noexcept final;
   void computeTracklets(const int iteration = 0) final;
   void computeTrackletMatching(const int iteration = 0) final;
   void computeVertices(const int iteration = 0) final;
   void updateVertexingParameters(const std::vector<VertexingParameters>&, const TimeFrameGPUParameters&) final;
-  void computeVerticesHist();
 
   bool isGPU() const noexcept final { return true; }
   const char* getName() const noexcept final { return "GPU"; }
 
  protected:
-  IndexTableUtils* mDeviceIndexTableUtils;
-  gpu::TimeFrameGPU<7>* mTimeFrameGPU;
+  gpu::TimeFrameGPU<nLayers>* mTimeFrameGPU;
   TimeFrameGPUParameters mTfGPUParams;
 };
 
-inline void VertexerTraitsGPU::adoptTimeFrame(TimeFrame<7>* tf) noexcept
-{
-  mTimeFrameGPU = static_cast<gpu::TimeFrameGPU<7>*>(tf);
-  mTimeFrame = static_cast<TimeFrame<7>*>(tf);
-}
-
 } // namespace o2::its
 
 #endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h
index 059b1cdc29082..67f12bad8486c 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h
@@ -12,46 +12,104 @@
 
 #ifndef ITSTRACKINGGPU_VERTEXINGKERNELS_H_
 #define ITSTRACKINGGPU_VERTEXINGKERNELS_H_
-#include "ITStracking/MathUtils.h"
-#include "ITStracking/Configuration.h"
-#include "ITStracking/ClusterLines.h"
-#include "ITStracking/Tracklet.h"
 
+#include <cstdint>
+#include <gsl/span>
+#include <array>
+#include "ITStracking/Tracklet.h"
+#include "ITStracking/Cluster.h"
+#include "ITStracking/ClusterLines.h"
 #include "ITStrackingGPU/Utils.h"
-#include "ITStrackingGPU/ClusterLinesGPU.h"
-#include "ITStrackingGPU/VertexerTraitsGPU.h"
-#include "ITStrackingGPU/TracerGPU.h"
 
-namespace o2::its::gpu
+namespace o2::its
 {
-#ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler
-template <TrackletMode Mode>
-GPUg() void trackleterKernelMultipleRof(
-  const Cluster* clustersNextLayer,    // 0 2
-  const Cluster* clustersCurrentLayer, // 1 1
-  const int* sizeNextLClusters,
-  const int* sizeCurrentLClusters,
-  const int* nextIndexTables,
-  Tracklet* Tracklets,
-  int* foundTracklets,
-  const IndexTableUtils* utils,
-  const unsigned int startRofId,
-  const unsigned int rofSize,
-  const float phiCut,
-  const size_t maxTrackletsPerCluster);
-#endif
-template <TrackletMode Mode>
-void trackletFinderHandler(const Cluster* clustersNextLayer,    // 0 2
-                           const Cluster* clustersCurrentLayer, // 1 1
-                           const int* sizeNextLClusters,
-                           const int* sizeCurrentLClusters,
-                           const int* nextIndexTables,
-                           Tracklet* Tracklets,
-                           int* foundTracklets,
-                           const IndexTableUtils* utils,
-                           const unsigned int startRofId,
-                           const unsigned int rofSize,
-                           const float phiCut,
-                           const size_t maxTrackletsPerCluster = 1e2);
-} // namespace o2::its::gpu
+
+/// Trackleting
+template <int32_t nLayers>
+void countTrackletsInROFsHandler(const IndexTableUtils<nLayers>* GPUrestrict() utils,
+                                 const uint8_t* GPUrestrict() multMask,
+                                 const int32_t nRofs,
+                                 const int32_t deltaROF,
+                                 const int32_t* GPUrestrict() rofPV,
+                                 const int32_t vertPerRofThreshold,
+                                 const Cluster** GPUrestrict() clusters,
+                                 const uint32_t nClusters,
+                                 const int32_t** GPUrestrict() ROFClusters,
+                                 const uint8_t** GPUrestrict() usedClusters,
+                                 const int32_t** GPUrestrict() clustersIndexTables,
+                                 int32_t** trackletsPerClusterLUTs,
+                                 int32_t** trackletsPerClusterSumLUTs,
+                                 int32_t** trackletsPerROF,
+                                 const std::array<int32_t*, 2>& trackletsPerClusterLUTsHost,
+                                 const std::array<int32_t*, 2>& trackletsPerClusterSumLUTsHost,
+                                 const int32_t iteration,
+                                 const float phiCut,
+                                 const int32_t maxTrackletsPerCluster,
+                                 const int32_t nBlocks,
+                                 const int32_t nThreads,
+                                 gpu::Streams& streams);
+
+template <int32_t nLayers>
+void computeTrackletsInROFsHandler(const IndexTableUtils<nLayers>* GPUrestrict() utils,
+                                   const uint8_t* GPUrestrict() multMask,
+                                   const int32_t nRofs,
+                                   const int32_t deltaROF,
+                                   const int32_t* GPUrestrict() rofPV,
+                                   const int vertPerRofThreshold,
+                                   const Cluster** GPUrestrict() clusters,
+                                   const uint32_t nClusters,
+                                   const int32_t** GPUrestrict() ROFClusters,
+                                   const uint8_t** GPUrestrict() usedClusters,
+                                   const int32_t** GPUrestrict() clustersIndexTables,
+                                   Tracklet** GPUrestrict() foundTracklets,
+                                   const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                   const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                   const int32_t** GPUrestrict() trackletsPerROF,
+                                   const int32_t iteration,
+                                   const float phiCut,
+                                   const int32_t maxTrackletsPerCluster,
+                                   const int32_t nBlocks,
+                                   const int32_t nThreads,
+                                   gpu::Streams& streams);
+
+/// Selection
+void countTrackletsMatchingInROFsHandler(const int32_t nRofs,
+                                         const int32_t deltaROF,
+                                         const uint32_t nClusters,
+                                         const int32_t** GPUrestrict() ROFClusters,
+                                         const Cluster** GPUrestrict() clusters,
+                                         uint8_t** GPUrestrict() usedClusters,
+                                         const Tracklet** GPUrestrict() foundTracklets,
+                                         uint8_t* GPUrestrict() usedTracklets,
+                                         const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                         const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                         int32_t* GPUrestrict() linesPerClusterLUT,
+                                         int32_t* GPUrestrict() linesPerClusterSumLUT,
+                                         const int32_t iteration,
+                                         const float phiCut,
+                                         const float tanLambdaCut,
+                                         const int32_t nBlocks,
+                                         const int32_t nThreads,
+                                         gpu::Streams& streams);
+
+void computeTrackletsMatchingInROFsHandler(const int32_t nRofs,
+                                           const int32_t deltaROF,
+                                           const uint32_t nClusters,
+                                           const int32_t** GPUrestrict() ROFClusters,
+                                           const Cluster** GPUrestrict() clusters,
+                                           const uint8_t** GPUrestrict() usedClusters,
+                                           const Tracklet** GPUrestrict() foundTracklets,
+                                           uint8_t* GPUrestrict() usedTracklets,
+                                           const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                           const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                           const int32_t* GPUrestrict() linesPerClusterSumLUT,
+                                           Line* GPUrestrict() lines,
+                                           const int32_t iteration,
+                                           const float phiCut,
+                                           const float tanLambdaCut,
+                                           const int32_t nBlocks,
+                                           const int32_t nThreads,
+                                           gpu::Streams& streams);
+
+} // namespace o2::its
 #endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
index 3dff67dbccd80..1f6a046a81350 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
@@ -22,8 +22,8 @@ if(CUDA_ENABLED)
                            TimeFrameGPU.cu
                            TracerGPU.cu
                            TrackingKernels.cu
-                           # VertexingKernels.cu
-                           # VertexerTraitsGPU.cxx
+                           VertexingKernels.cu
+                           VertexerTraitsGPU.cxx
                    PUBLIC_INCLUDE_DIRECTORIES ../
                    PUBLIC_LINK_LIBRARIES O2::ITStracking
                                          O2::SimConfig
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 965bf27fdd12b..4da91522371f8 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -28,12 +28,6 @@
 namespace o2::its::gpu
 {
 
-template <int nLayers>
-TimeFrameGPU<nLayers>::TimeFrameGPU()
-{
-  this->mIsGPU = true;
-}
-
 template <int nLayers>
 void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator)
 {
@@ -69,14 +63,14 @@ void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createUnsortedClustersDeviceArray(const int iteration)
+void TimeFrameGPU<nLayers>::createUnsortedClustersDeviceArray(const int iteration, const int maxLayers)
 {
   if (!iteration) {
     GPUTimer timer("creating unsorted clusters array");
     allocMem(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     mPinnedUnsortedClusters.set(nLayers);
-    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+    for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) {
       GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
       mPinnedUnsortedClusters.set(iLayer);
     }
@@ -96,14 +90,14 @@ void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration, cons
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createClustersDeviceArray(const int iteration)
+void TimeFrameGPU<nLayers>::createClustersDeviceArray(const int iteration, const int maxLayers)
 {
   if (!iteration) {
     GPUTimer timer("creating sorted clusters array");
     allocMem(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     mPinnedClusters.set(nLayers);
-    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+    for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) {
       GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
       mPinnedClusters.set(iLayer);
     }
@@ -150,15 +144,15 @@ void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration, const i
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createUsedClustersDeviceArray(const int iteration)
+void TimeFrameGPU<nLayers>::createUsedClustersDeviceArray(const int iteration, const int maxLayers)
 {
   if (!iteration) {
     GPUTimer timer("creating used clusters flags");
-    allocMem(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator());
-    GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaHostRegisterPortable));
+    allocMem(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(uint8_t*), this->getExtAllocator());
+    GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(uint8_t*), cudaHostRegisterPortable));
     mPinnedUsedClusters.set(nLayers);
-    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaHostRegisterPortable));
+    for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) {
+      GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(uint8_t), cudaHostRegisterPortable));
       mPinnedUsedClusters.set(iLayer);
     }
   }
@@ -460,6 +454,87 @@ void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& s
   GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createVtxTrackletsLUTDevice(const int32_t iteration)
+{
+  GPUTimer timer("creating vertexer tracklet LUTs");
+  const int32_t ncls = this->mClusters[1].size();
+  for (int32_t iMode{0}; iMode < 2; ++iMode) {
+    if (!iteration) {
+      GPULog("gpu-transfer: creating vertexer tracklets per cluster for {} elements for mode {}, for {:.2f} MB.", ncls, iMode, ncls * sizeof(int32_t) / constants::MB);
+      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerClusterDevice[iMode]), ncls * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator());
+
+      GPULog("gpu-transfer: creating vertexer tracklets per cluster sum for {} elements for mode {}, for {:.2f} MB.", ncls + 1, iMode, (ncls + 1) * sizeof(int32_t) / constants::MB);
+      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerClusterSumDevice[iMode]), (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator());
+
+      GPULog("gpu-transfer: creating vertexer tracklets per ROF for {} elements for mode {}, for {:.2f} MB.", this->mNrof + 1, iMode, (this->mNrof + 1) * sizeof(int32_t) / constants::MB);
+      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerROFDevice[iMode]), (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator());
+    }
+    GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterDevice[iMode], 0, ncls * sizeof(int32_t), mGpuStreams[iMode].get()));
+    GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterSumDevice[iMode], 0, (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode].get()));
+    GPUChkErrS(cudaMemsetAsync(mNTrackletsPerROFDevice[iMode], 0, (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode].get()));
+  }
+  mGpuStreams[0].sync();
+  mGpuStreams[1].sync();
+  if (!iteration) {
+    allocMem(reinterpret_cast<void**>(&mNTrackletsPerClusterDeviceArray), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), this->getExtAllocator());
+    GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterDeviceArray, mNTrackletsPerClusterDevice.data(), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice));
+
+    allocMem(reinterpret_cast<void**>(&mNTrackletsPerClusterSumDeviceArray), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), this->getExtAllocator());
+    GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterSumDeviceArray, mNTrackletsPerClusterSumDevice.data(), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice));
+
+    allocMem(reinterpret_cast<void**>(&mNTrackletsPerROFDeviceArray), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), this->getExtAllocator());
+    GPUChkErrS(cudaMemcpy(mNTrackletsPerROFDeviceArray, mNTrackletsPerROFDevice.data(), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice));
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createVtxTrackletsBuffers(const int32_t iteration)
+{
+  GPUTimer timer("creating vertexer tracklet buffers");
+  for (int32_t iMode{0}; iMode < 2; ++iMode) {
+    this->mTotalTracklets[iMode] = 0;
+    GPUChkErrS(cudaMemcpyAsync(&(this->mTotalTracklets[iMode]), mNTrackletsPerClusterSumDevice[iMode] + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost, mGpuStreams[iMode].get()));
+    GPULog("gpu-transfer: creating vertexer tracklets buffer for {} elements on layer {}, for {:.2f} MB.", this->mTotalTracklets[iMode], iMode, this->mTotalTracklets[iMode] * sizeof(Tracklet) / constants::MB);
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iMode]), this->mTotalTracklets[iMode] * sizeof(Tracklet), mGpuStreams[iMode], this->getExtAllocator());
+  }
+  mGpuStreams[0].sync();
+  mGpuStreams[1].sync();
+  allocMem(reinterpret_cast<void**>(&mTrackletsDeviceArray), 2 * sizeof(Tracklet*), this->getExtAllocator());
+  GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemcpy(mTrackletsDeviceArray, mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaMemcpyHostToDevice));
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createVtxLinesLUTDevice(const int32_t iteration)
+{
+  GPUTimer timer("creating vertexer lines LUT and used tracklets buffer");
+  const int32_t ncls = this->mClusters[1].size();
+
+  GPULog("gpu-transfer: creating vertexer lines per cluster for {} elements , for {:.2f} MB.", ncls, ncls * sizeof(int32_t) / constants::MB);
+  allocMem(reinterpret_cast<void**>(&mNLinesPerClusterDevice), ncls * sizeof(int32_t), this->getExtAllocator());
+
+  GPULog("gpu-transfer: creating vertexer lines per cluster sum for {} elements , for {:.2f} MB.", ncls + 1, (ncls + 1) * sizeof(int32_t) / constants::MB);
+  allocMem(reinterpret_cast<void**>(&mNLinesPerClusterSumDevice), (ncls + 1) * sizeof(int32_t), this->getExtAllocator());
+
+  const int32_t ntrkls = this->mTotalTracklets[0];
+  GPULog("gpu-transfer: creating vertexer used tracklets for {} elements , for {:.2f} MB.", ntrkls, ntrkls * sizeof(uint8_t) / constants::MB);
+  allocMem(reinterpret_cast<void**>(&mUsedTrackletsDevice), ntrkls * sizeof(uint8_t), this->getExtAllocator());
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createVtxLinesBuffer(const int32_t iteration)
+{
+  GPUTimer timer("creating vertexer lines buffer and resetting used tracklets");
+  int32_t nlines = 0;
+  GPUChkErrS(cudaMemcpy(&nlines, mNLinesPerClusterDevice + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost));
+  this->mTotalLines = nlines;
+  GPULog("gpu-transfer: creating vertexer lines for {} elements , for {:.2f} MB.", nlines, nlines * sizeof(Line) / constants::MB);
+  allocMem(reinterpret_cast<void**>(&mLinesDevice), nlines * sizeof(Line), this->getExtAllocator());
+  // reset used tracklets
+  GPUChkErrS(cudaMemset(mUsedTrackletsDevice, 0, this->mTotalTracklets[0] * sizeof(uint8_t)));
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsDevice()
 {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index 6a824de851fed..047d42d815e99 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -93,7 +93,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
                                          mTimeFrameGPU->getDeviceArrayClusters(),
                                          mTimeFrameGPU->getClusterSizes(),
                                          mTimeFrameGPU->getDeviceROFrameClusters(),
-                                         mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                         (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(),
                                          mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
                                          mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
                                          mTimeFrameGPU->getDeviceTrackletsLUTs(),
@@ -128,7 +128,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
                                            mTimeFrameGPU->getDeviceArrayClusters(),
                                            mTimeFrameGPU->getClusterSizes(),
                                            mTimeFrameGPU->getDeviceROFrameClusters(),
-                                           mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                           (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(),
                                            mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
                                            mTimeFrameGPU->getDeviceArrayTracklets(),
                                            mTimeFrameGPU->getDeviceTracklets(),
@@ -300,7 +300,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
                                         mTimeFrameGPU->getDeviceArrayCells(),
                                         mTimeFrameGPU->getDeviceCells()[startLayer],
                                         mTimeFrameGPU->getArrayNCells(),
-                                        mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                        (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(),
                                         mTimeFrameGPU->getDeviceNeighboursAll(),
                                         mTimeFrameGPU->getDeviceNeighboursLUTs(),
                                         mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 0e5fd21e5e90e..2c43ccd3bb81c 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -46,76 +46,6 @@ namespace o2::its
 namespace gpu
 {
 
-template <typename T>
-struct TypedAllocator {
-  using value_type = T;
-  using pointer = thrust::device_ptr<T>;
-  using const_pointer = thrust::device_ptr<const T>;
-  using size_type = std::size_t;
-  using difference_type = std::ptrdiff_t;
-
-  TypedAllocator() noexcept : mInternalAllocator(nullptr) {}
-  explicit TypedAllocator(ExternalAllocator* a) noexcept : mInternalAllocator(a) {}
-
-  template <typename U>
-  TypedAllocator(const TypedAllocator<U>& o) noexcept : mInternalAllocator(o.mInternalAllocator)
-  {
-  }
-
-  pointer allocate(size_type n)
-  {
-    void* raw = mInternalAllocator->allocate(n * sizeof(T));
-    return thrust::device_pointer_cast(static_cast<T*>(raw));
-  }
-
-  void deallocate(pointer p, size_type n) noexcept
-  {
-    if (!p) {
-      return;
-    }
-    void* raw = thrust::raw_pointer_cast(p);
-    mInternalAllocator->deallocate(static_cast<char*>(raw), n * sizeof(T));
-  }
-
-  bool operator==(TypedAllocator const& o) const noexcept
-  {
-    return mInternalAllocator == o.mInternalAllocator;
-  }
-  bool operator!=(TypedAllocator const& o) const noexcept
-  {
-    return !(*this == o);
-  }
-
- private:
-  ExternalAllocator* mInternalAllocator;
-};
-
-GPUdii() int4 getEmptyBinsRect()
-{
-  return int4{0, 0, 0, 0};
-}
-
-template <int nLayers>
-GPUdii() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
-                                const IndexTableUtils<nLayers>& utils,
-                                const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
-{
-  const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz;
-  const float phiRangeMin = (maxdeltaphi > o2::constants::math::PI) ? 0.f : currentCluster.phi - maxdeltaphi;
-  const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz;
-  const float phiRangeMax = (maxdeltaphi > o2::constants::math::PI) ? o2::constants::math::TwoPI : currentCluster.phi + maxdeltaphi;
-
-  if (zRangeMax < -utils.getLayerZ(layerIndex) ||
-      zRangeMin > utils.getLayerZ(layerIndex) || zRangeMin > zRangeMax) {
-    return getEmptyBinsRect();
-  }
-
-  return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex, zRangeMin)),
-              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)),
-              o2::gpu::CAMath::Min(utils.getNzBins() - 1, utils.getZBinIndex(layerIndex, zRangeMax)),
-              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))};
-}
-
 GPUdii() bool fitTrack(TrackITSExt& track,
                        int start,
                        int end,
@@ -272,44 +202,6 @@ struct compare_track_chi2 {
   }
 };
 
-GPUdii() gpuSpan<const Vertex> getPrimaryVertices(const int rof,
-                                                  const int* roframesPV,
-                                                  const int nROF,
-                                                  const uint8_t* mask,
-                                                  const Vertex* vertices)
-{
-  const int start_pv_id = roframesPV[rof];
-  const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1;
-  const size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded
-  return gpuSpan<const Vertex>(&vertices[start_pv_id], delta);
-};
-
-GPUdii() gpuSpan<const Vertex> getPrimaryVertices(const int romin,
-                                                  const int romax,
-                                                  const int* roframesPV,
-                                                  const int nROF,
-                                                  const Vertex* vertices)
-{
-  const int start_pv_id = roframesPV[romin];
-  const int stop_rof = romax >= nROF - 1 ? nROF : romax + 1;
-  return gpuSpan<const Vertex>(&vertices[start_pv_id], roframesPV[stop_rof] - roframesPV[romin]);
-};
-
-GPUdii() gpuSpan<const Cluster> getClustersOnLayer(const int rof,
-                                                   const int totROFs,
-                                                   const int layer,
-                                                   const int** roframesClus,
-                                                   const Cluster** clusters)
-{
-  if (rof < 0 || rof >= totROFs) {
-    return gpuSpan<const Cluster>();
-  }
-  const int start_clus_id{roframesClus[layer][rof]};
-  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
-  const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id;
-  return gpuSpan<const Cluster>(&(clusters[layer][start_clus_id]), delta);
-}
-
 template <int nLayers>
 GPUg() void __launch_bounds__(256, 1) fitTrackSeedsKernel(
   CellSeed<nLayers>* trackSeeds,
@@ -602,7 +494,7 @@ GPUg() void __launch_bounds__(256, 1) computeLayerTrackletsMultiROFKernel(
         const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
         const float sqInverseDeltaZ0{1.f / (math_utils::Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + constants::Tolerance)}; /// protecting from overflows adding the detector resolution
         const float sigmaZ{o2::gpu::CAMath::Sqrt(math_utils::Sq(resolution) * math_utils::Sq(tanLambda) * ((math_utils::Sq(inverseR0) + sqInverseDeltaZ0) * math_utils::Sq(meanDeltaR) + 1.f) + math_utils::Sq(meanDeltaR * MSAngle))};
-        const int4 selectedBinsRect{getBinsRect<nLayers>(currentCluster, layerIndex + 1, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)};
+        const int4 selectedBinsRect{getBinsRect<nLayers>(currentCluster, layerIndex + 1, utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)};
         if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
           continue;
         }
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx
index 90d654a26a43d..658d3cf0dfb91 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx
@@ -11,217 +11,169 @@
 //
 /// \author matteo.concas@cern.ch
 
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <array>
-#include <cassert>
-#include <thread>
-
-#ifdef VTX_DEBUG
-#include "TTree.h"
-#include "TFile.h"
-#endif
+#include <gsl/span>
 
+#include "ITStracking/TrackingConfigParam.h"
 #include "ITStrackingGPU/VertexingKernels.h"
 #include "ITStrackingGPU/VertexerTraitsGPU.h"
 
 namespace o2::its
 {
 
-void VertexerTraitsGPU::initialise(const TrackingParameters& trackingParams, const int iteration)
+template <int nLayers>
+void VertexerTraitsGPU<nLayers>::initialise(const TrackingParameters& trackingParams, const int iteration)
+{
+  // FIXME
+  // Two things to fix here:
+  // This loads all necessary data for this step at once, can be overlayed with computation
+  // Also if running with the tracker some data is loaded twice!
+  mTimeFrameGPU->initialise(0, trackingParams, 3, &this->mIndexTableUtils, &mTfGPUParams);
+
+  // FIXME some of these only need to be created once!
+  mTimeFrameGPU->loadIndexTableUtils(iteration);
+  mTimeFrameGPU->createUsedClustersDeviceArray(iteration, 3);
+  mTimeFrameGPU->createClustersDeviceArray(iteration, 3);
+  mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration, 3);
+  mTimeFrameGPU->createClustersIndexTablesArray(iteration);
+  mTimeFrameGPU->createROFrameClustersDeviceArray(iteration);
+  for (int iLayer{0}; iLayer < 3; ++iLayer) {
+    mTimeFrameGPU->loadClustersDevice(iteration, iLayer);
+    mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer);
+    mTimeFrameGPU->loadClustersIndexTables(iteration, iLayer);
+    mTimeFrameGPU->createUsedClustersDevice(iteration, iLayer);
+    mTimeFrameGPU->loadROFrameClustersDevice(iteration, iLayer);
+  }
+}
+
+template <int nLayers>
+void VertexerTraitsGPU<nLayers>::adoptTimeFrame(TimeFrame<nLayers>* tf) noexcept
 {
-  mTimeFrameGPU->initialise(0, trackingParams, 3, &mIndexTableUtils, &mTfGPUParams);
+  mTimeFrameGPU = static_cast<gpu::TimeFrameGPU<nLayers>*>(tf);
+  this->mTimeFrame = static_cast<TimeFrame<nLayers>*>(tf);
 }
 
-void VertexerTraitsGPU::updateVertexingParameters(const std::vector<VertexingParameters>& vrtPar, const TimeFrameGPUParameters& tfPar)
+template <int nLayers>
+void VertexerTraitsGPU<nLayers>::updateVertexingParameters(const std::vector<VertexingParameters>& vrtPar, const TimeFrameGPUParameters& tfPar)
 {
-  mVrtParams = vrtPar;
+  this->mVrtParams = vrtPar;
   mTfGPUParams = tfPar;
-  mIndexTableUtils.setTrackingParameters(vrtPar[0]);
-  for (auto& par : mVrtParams) {
-    par.phiSpan = static_cast<int>(std::ceil(mIndexTableUtils.getNphiBins() * par.phiCut / o2::constants::math::TwoPI));
-    par.zSpan = static_cast<int>(std::ceil(par.zCut * mIndexTableUtils.getInverseZCoordinate(0)));
+  this->mIndexTableUtils.setTrackingParameters(vrtPar[0]);
+  for (auto& par : this->mVrtParams) {
+    par.phiSpan = static_cast<int>(std::ceil(this->mIndexTableUtils.getNphiBins() * par.phiCut / o2::constants::math::TwoPI));
+    par.zSpan = static_cast<int>(std::ceil(par.zCut * this->mIndexTableUtils.getInverseZCoordinate(0)));
   }
 }
 
-void VertexerTraitsGPU::computeTracklets(const int iteration)
+template <int nLayers>
+void VertexerTraitsGPU<nLayers>::computeTracklets(const int iteration)
 {
-  if (!mTimeFrameGPU->getClusters().size()) {
+  if (mTimeFrameGPU->getClusters().empty()) {
     return;
   }
-  // std::vector<std::thread> threads(mTimeFrameGPU->getNChunks());
-  // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
-  //   int rofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()};
-  //   mTimeFrameGPU->getVerticesInChunks()[chunkId].clear();
-  //   mTimeFrameGPU->getNVerticesInChunks()[chunkId].clear();
-  //   mTimeFrameGPU->getLabelsInChunks()[chunkId].clear();
-  //   auto doVertexReconstruction = [&, chunkId, rofPerChunk]() -> void {
-  //     auto offset = chunkId * rofPerChunk;
-  //     auto maxROF = offset + rofPerChunk;
-  //     while (offset < maxROF) {
-  //       auto rofs = mTimeFrameGPU->loadChunkData<gpu::Task::Vertexer>(chunkId, offset, maxROF);
-  //       RANGE("chunk_gpu_vertexing", 1);
-  //       // gpu::GpuTimer timer{offset, mTimeFrameGPU->getStream(chunkId).get()};
-  //       // timer.Start("vtTrackletFinder");
-  //       gpu::trackleterKernelMultipleRof<TrackletMode::Layer0Layer1><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),         // const Cluster* clustersNextLayer,    // 0 2
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
-  //         mTimeFrameGPU->getDeviceROframesClusters(0),                   // const int* sizeNextLClusters,
-  //         mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0),      // const int* nextIndexTables,
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),        // Tracklet* Tracklets,
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets,
-  //         mTimeFrameGPU->getDeviceIndexTableUtils(),                     // const IndexTableUtils* utils,
-  //         offset,                                                        // const unsigned int startRofId,
-  //         rofs,                                                          // const unsigned int rofSize,
-  //         mVrtParams.phiCut,                                             // const float phiCut,
-  //         mVrtParams.maxTrackletsPerCluster);                            // const size_t maxTrackletsPerCluster = 1e2
-
-  //       gpu::trackleterKernelMultipleRof<TrackletMode::Layer1Layer2><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2),         // const Cluster* clustersNextLayer,    // 0 2
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
-  //         mTimeFrameGPU->getDeviceROframesClusters(2),                   // const int* sizeNextLClusters,
-  //         mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2),      // const int* nextIndexTables,
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),        // Tracklet* Tracklets,
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets,
-  //         mTimeFrameGPU->getDeviceIndexTableUtils(),                     // const IndexTableUtils* utils,
-  //         offset,                                                        // const unsigned int startRofId,
-  //         rofs,                                                          // const unsigned int rofSize,
-  //         mVrtParams.phiCut,                                             // const float phiCut,
-  //         mVrtParams.maxTrackletsPerCluster);                            // const size_t maxTrackletsPerCluster = 1e2
-
-  //       gpu::trackletSelectionKernelMultipleRof<true><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
-  //         mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-  //         mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1),    // const int* nFoundTracklet12,            // Number of tracklets found on layers 1-2
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),        // unsigned char* usedTracklets,           // Used tracklets
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceLines(),                // Line* lines,                            // Lines
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),          // int* nFoundLines,                       // Number of found lines
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines,              // Number of found lines exclusive scan
-  //         offset,                                                           // const unsigned int startRofId,          // Starting ROF ID
-  //         rofs,                                                             // const unsigned int rofSize,             // Number of ROFs to consider
-  //         mVrtParams.maxTrackletsPerCluster,                                // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
-  //         mVrtParams.tanLambdaCut,                                          // const float tanLambdaCut = 0.025f,      // Cut on tan lambda
-  //         mVrtParams.phiCut);                                               // const float phiCut = 0.002f)            // Cut on phi
-
-  //       discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),
-  //                                                   mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize,
-  //                                                   mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),
-  //                                                   mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(),
-  //                                                   mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1),
-  //                                                   mTimeFrameGPU->getStream(chunkId).get()));
-
-  //       // Reset used tracklets
-  //       checkGPUError(cudaMemsetAsync(mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),
-  //                                     false,
-  //                                     sizeof(unsigned char) * mVrtParams.maxTrackletsPerCluster * mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1),
-  //                                     mTimeFrameGPU->getStream(chunkId).get()),
-  //                     __FILE__, __LINE__);
-
-  //       gpu::trackletSelectionKernelMultipleRof<false><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
-  //         mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-  //         mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1),    // const int* nFoundTracklet12,            // Number of tracklets found on layers 1-2
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),        // unsigned char* usedTracklets,           // Used tracklets
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceLines(),                // Line* lines,                            // Lines
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),          // int* nFoundLines,                       // Number of found lines
-  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines,              // Number of found lines exclusive scan
-  //         offset,                                                           // const unsigned int startRofId,          // Starting ROF ID
-  //         rofs,                                                             // const unsigned int rofSize,             // Number of ROFs to consider
-  //         mVrtParams.maxTrackletsPerCluster,                                // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
-  //         mVrtParams.tanLambdaCut,                                          // const float tanLambdaCut = 0.025f,      // Cut on tan lambda
-  //         mVrtParams.phiCut);                                               // const float phiCut = 0.002f)            // Cut on phi
-
-  //       int nClusters = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1);
-  //       int lastFoundLines;
-  //       std::vector<int> exclusiveFoundLinesHost(nClusters + 1);
-
-  //       // Obtain whole exclusive sum including nCluster+1 element  (nCluster+1)th element is the total number of found lines.
-  //       checkGPUError(cudaMemcpyAsync(exclusiveFoundLinesHost.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), (nClusters) * sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-  //       checkGPUError(cudaMemcpyAsync(&lastFoundLines, mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines() + nClusters - 1, sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-  //       exclusiveFoundLinesHost[nClusters] = exclusiveFoundLinesHost[nClusters - 1] + lastFoundLines;
-
-  //       std::vector<Line> lines(exclusiveFoundLinesHost[nClusters]);
-
-  //       checkGPUError(cudaMemcpyAsync(lines.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), sizeof(Line) * lines.size(), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-  //       checkGPUError(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get()));
-
-  //       // Compute vertices
-  //       std::vector<ClusterLines> clusterLines;
-  //       std::vector<bool> usedLines;
-  //       for (int rofId{0}; rofId < rofs; ++rofId) {
-  //         auto rof = offset + rofId;
-  //         auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF
-  //         auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof];     // number of clusters for this ROF
-  //         auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof];                                                  // starting line offset for this ROF
-  //         auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof;
-  //         gsl::span<const o2::its::Line> linesInRof(lines.data() + linesOffsetRof, static_cast<gsl::span<o2::its::Line>::size_type>(nLinesRof));
-
-  //         usedLines.resize(linesInRof.size(), false);
-  //         usedLines.assign(linesInRof.size(), false);
-  //         clusterLines.clear();
-  //         clusterLines.reserve(nClustersL1Rof);
-  //         computeVerticesInRof(rof,
-  //                              linesInRof,
-  //                              usedLines,
-  //                              clusterLines,
-  //                              mTimeFrameGPU->getBeamXY(),
-  //                              mTimeFrameGPU->getVerticesInChunks()[chunkId],
-  //                              mTimeFrameGPU->getNVerticesInChunks()[chunkId],
-  //                              mTimeFrameGPU,
-  //                              mTimeFrameGPU->hasMCinformation() ? &mTimeFrameGPU->getLabelsInChunks()[chunkId] : nullptr);
-  //       }
-  //       offset += rofs;
-  //     }
-  //   };
-  //   // Do work
-  //   threads[chunkId] = std::thread(doVertexReconstruction);
-  // }
-  // for (auto& thread : threads) {
-  //   thread.join();
-  // }
-  // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
-  //   int start{0};
-  //   for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) {
-  //     gsl::span<const Vertex> rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast<gsl::span<Vertex>::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])};
-  //     mTimeFrameGPU->addPrimaryVertices(rofVerts);
-  //     if (mTimeFrameGPU->hasMCinformation()) {
-  //       mTimeFrameGPU->getVerticesLabels().emplace_back();
-  //       // TODO: add MC labels
-  //     }
-  //     start += mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId];
-  //   }
-  // }
-  // mTimeFrameGPU->wipe(3);
-  // }
+  const auto& conf = ITSGpuTrackingParamConfig::Instance();
+
+  mTimeFrameGPU->createVtxTrackletsLUTDevice(iteration);
+  countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
+                                       mTimeFrameGPU->getDeviceMultCutMask(),
+                                       mTimeFrameGPU->getNrof(),
+                                       this->mVrtParams[iteration].deltaRof,
+                                       mTimeFrameGPU->getDeviceROFramesPV(),
+                                       this->mVrtParams[iteration].vertPerRofThreshold,
+                                       mTimeFrameGPU->getDeviceArrayClusters(),
+                                       mTimeFrameGPU->getClusterSizes()[1],
+                                       mTimeFrameGPU->getDeviceROFrameClusters(),
+                                       (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                       mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
+                                       mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(),
+                                       mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(),
+                                       mTimeFrameGPU->getDeviceArrayNTrackletsPerROF(),
+                                       mTimeFrameGPU->getDeviceNTrackletsPerCluster(),
+                                       mTimeFrameGPU->getDeviceNTrackletsPerClusterSum(),
+                                       iteration,
+                                       this->mVrtParams[iteration].phiCut,
+                                       this->mVrtParams[iteration].maxTrackletsPerCluster,
+                                       conf.nBlocksVtxComputeTracklets[iteration],
+                                       conf.nThreadsVtxComputeTracklets[iteration],
+                                       mTimeFrameGPU->getStreams());
+  mTimeFrameGPU->createVtxTrackletsBuffers(iteration);
+  computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
+                                         mTimeFrameGPU->getDeviceMultCutMask(),
+                                         mTimeFrameGPU->getNrof(),
+                                         this->mVrtParams[iteration].deltaRof,
+                                         mTimeFrameGPU->getDeviceROFramesPV(),
+                                         this->mVrtParams[iteration].vertPerRofThreshold,
+                                         mTimeFrameGPU->getDeviceArrayClusters(),
+                                         mTimeFrameGPU->getClusterSizes()[1],
+                                         mTimeFrameGPU->getDeviceROFrameClusters(),
+                                         (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                         mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
+                                         mTimeFrameGPU->getDeviceArrayTracklets(),
+                                         (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(),
+                                         (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(),
+                                         (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerROF(),
+                                         iteration,
+                                         this->mVrtParams[iteration].phiCut,
+                                         this->mVrtParams[iteration].maxTrackletsPerCluster,
+                                         conf.nBlocksVtxComputeTracklets[iteration],
+                                         conf.nThreadsVtxComputeTracklets[iteration],
+                                         mTimeFrameGPU->getStreams());
 }
 
-void VertexerTraitsGPU::computeTrackletMatching(const int iteration)
+template <int nLayers>
+void VertexerTraitsGPU<nLayers>::computeTrackletMatching(const int iteration)
 {
-}
+  if (!mTimeFrameGPU->getTotalTrackletsTF(0) || !mTimeFrameGPU->getTotalTrackletsTF(1)) {
+    return;
+  }
 
-void VertexerTraitsGPU::computeVertices(const int iteration)
-{
+  const auto& conf = ITSGpuTrackingParamConfig::Instance();
+  mTimeFrameGPU->createVtxLinesLUTDevice(iteration);
+  countTrackletsMatchingInROFsHandler(mTimeFrameGPU->getNrof(),
+                                      this->mVrtParams[iteration].deltaRof,
+                                      mTimeFrameGPU->getClusterSizes()[1],
+                                      mTimeFrameGPU->getDeviceROFrameClusters(),
+                                      mTimeFrameGPU->getDeviceArrayClusters(),
+                                      mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                      (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(),
+                                      mTimeFrameGPU->getDeviceUsedTracklets(),
+                                      (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(),
+                                      (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(),
+                                      mTimeFrameGPU->getDeviceNLinesPerCluster(),
+                                      mTimeFrameGPU->getDeviceNLinesPerClusterSum(),
+                                      iteration,
+                                      this->mVrtParams[iteration].phiCut,
+                                      this->mVrtParams[iteration].tanLambdaCut,
+                                      conf.nBlocksVtxComputeMatching[iteration],
+                                      conf.nThreadsVtxComputeMatching[iteration],
+                                      mTimeFrameGPU->getStreams());
+  mTimeFrameGPU->createVtxLinesBuffer(iteration);
+  computeTrackletsMatchingInROFsHandler(mTimeFrameGPU->getNrof(),
+                                        this->mVrtParams[iteration].deltaRof,
+                                        mTimeFrameGPU->getClusterSizes()[1],
+                                        mTimeFrameGPU->getDeviceROFrameClusters(),
+                                        mTimeFrameGPU->getDeviceArrayClusters(),
+                                        nullptr,
+                                        (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(),
+                                        mTimeFrameGPU->getDeviceUsedTracklets(),
+                                        (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(),
+                                        (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(),
+                                        (const int32_t*)mTimeFrameGPU->getDeviceNLinesPerClusterSum(),
+                                        mTimeFrameGPU->getDeviceLines(),
+                                        iteration,
+                                        this->mVrtParams[iteration].phiCut,
+                                        this->mVrtParams[iteration].tanLambdaCut,
+                                        conf.nBlocksVtxComputeMatching[iteration],
+                                        conf.nThreadsVtxComputeMatching[iteration],
+                                        mTimeFrameGPU->getStreams());
 }
 
-void VertexerTraitsGPU::computeVerticesHist()
+template <int nLayers>
+void VertexerTraitsGPU<nLayers>::computeVertices(const int iteration)
 {
+  LOGP(fatal, "This step is not implemented yet!");
+  mTimeFrameGPU->loadUsedClustersDevice();
 }
 
-VertexerTraits* createVertexerTraitsGPU()
-{
-  return new VertexerTraitsGPU;
-}
+template class VertexerTraitsGPU<7>;
+
 } // namespace o2::its
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
index 126e799efce5d..a2787bb13598d 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
@@ -14,364 +14,454 @@
 #include <cub/cub.cuh>
 
 #include "ITStrackingGPU/VertexingKernels.h"
+#include "ITStracking/Tracklet.h"
+#include "ITStracking/IndexTableUtils.h"
+#include "ITStracking/ClusterLines.h"
+
+#include "GPUCommonMath.h"
 #include "GPUCommonHelpers.h"
+#include "GPUCommonDef.h"
 
-namespace o2
-{
-namespace its
+namespace o2::its
 {
-using math_utils::getNormalizedPhi;
 
 namespace gpu
 {
-template <TrackletMode Mode>
-void trackletFinderHandler(const Cluster* clustersNextLayer,    // 0 2
-                           const Cluster* clustersCurrentLayer, // 1 1
-                           const int* sizeNextLClusters,
-                           const int* sizeCurrentLClusters,
-                           const int* nextIndexTables,
-                           Tracklet* Tracklets,
-                           int* foundTracklets,
-                           const IndexTableUtils* utils,
-                           const unsigned int startRofId,
-                           const unsigned int rofSize,
-                           const float phiCut,
-                           const unsigned int maxTrackletsPerCluster,
-                           const int nBlocks,
-                           const int nThreads)
-{
-  gpu::trackleterKernelMultipleRof<Mode><<<nBlocks, nThreads>>>(
-    clustersNextLayer,       // const Cluster* clustersNextLayer,    // 0 2
-    clustersCurrentLayer,    // const Cluster* clustersCurrentLayer, // 1 1
-    sizeNextLClusters,       // const int* sizeNextLClusters,
-    sizeCurrentLClusters,    // const int* sizeCurrentLClusters,
-    nextIndexTables,         // const int* nextIndexTables,
-    Tracklets,               // Tracklet* Tracklets,
-    foundTracklets,          // int* foundTracklets,
-    utils,                   // const IndexTableUtils* utils,
-    startRofId,              // const unsigned int startRofId,
-    rofSize,                 // const unsigned int rofSize,
-    phiCut,                  // const float phiCut,
-    maxTrackletsPerCluster); // const unsigned int maxTrackletsPerCluster = 1e2
-}
-/*
-
-GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
-                              const float z1, float maxdeltaz, float maxdeltaphi)
-{
-  const float zRangeMin = z1 - maxdeltaz;
-  const float phiRangeMin = currentCluster.phi - maxdeltaphi;
-  const float zRangeMax = z1 + maxdeltaz;
-  const float phiRangeMax = currentCluster.phi + maxdeltaphi;
-
-  if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] ||
-      zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) {
-
-    return getEmptyBinsRect();
-  }
 
-  return int4{o2::gpu::GPUCommonMath::Max(0, getZBinIndex(layerIndex + 1, zRangeMin)),
-              getPhiBinIndex(phiRangeMin),
-              o2::gpu::GPUCommonMath::Min(ZBins - 1, getZBinIndex(layerIndex + 1, zRangeMax)),
-              getPhiBinIndex(phiRangeMax)};
-}
-
-template <typename... Args>
-GPUd() void printOnThread(const unsigned int tId, const char* str, Args... args)
+template <int nLayers, TrackletMode Mode, bool dryRun>
+GPUg() void computeLayerTrackletMutliROFKernel(const Cluster** GPUrestrict() clusters,
+                                               const int32_t** GPUrestrict() rofClusters,
+                                               const uint8_t** GPUrestrict() usedClusters,
+                                               const int32_t** GPUrestrict() clusterIndexTables,
+                                               const float phiCut,
+                                               maybe_const<dryRun, Tracklet>** GPUrestrict() tracklets,
+                                               maybe_const<!dryRun, int32_t>** GPUrestrict() trackletOffsets,
+                                               const IndexTableUtils<nLayers>* GPUrestrict() utils,
+                                               const int32_t nRofs,
+                                               const int32_t deltaRof,
+                                               const int32_t* GPUrestrict() rofPV,
+                                               const int32_t iteration,
+                                               const int32_t verPerRofThreshold,
+                                               const int32_t maxTrackletsPerCluster)
 {
-  if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
-    printf(str, args...);
-  }
-}
-
-template <typename... Args>
-GPUd() void printOnBlock(const unsigned int bId, const char* str, Args... args)
-{
-  if (blockIdx.x == bId && threadIdx.x == 0) {
-    printf(str, args...);
-  }
-}
-
-GPUg() void printBufferOnThread(const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0)
-{
-  if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
-    for (int i{0}; i < size; ++i) {
-      if (!(i % len)) {
-        printf("\n start: ===>%d/%d\t", i, (int)size);
-      }
-      printf("%d\t", v[i]);
+  constexpr int32_t iMode = (Mode == TrackletMode::Layer0Layer1) ? 0 : 1;
+  const int32_t phiBins(utils->getNphiBins());
+  const int32_t zBins(utils->getNzBins());
+  const int32_t tableSize{phiBins * zBins + 1};
+  extern __shared__ uint16_t storedTrackletsShared[]; // each deltaROF needs its own counters
+  uint16_t* storedTrackletsLocal = storedTrackletsShared + threadIdx.x * (2 * deltaRof + 1);
+  for (uint32_t pivotRofId{blockIdx.x}; pivotRofId < (uint32_t)nRofs; pivotRofId += gridDim.x) {
+    if (iteration && rofPV[pivotRofId] > verPerRofThreshold) {
+      continue;
     }
-    printf("\n");
-  }
-}
-
-GPUg() void printBufferOnThreadF(const float* v, unsigned int size, const unsigned int tId = 0)
-{
-  if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
-    printf("vector :");
-    for (int i{0}; i < size; ++i) {
-      printf("%.9f\t", v[i]);
+    const uint16_t startROF = o2::gpu::CAMath::Max(0, (int)pivotRofId - deltaRof);
+    const uint16_t endROF = o2::gpu::CAMath::Min(nRofs, (int)pivotRofId + deltaRof + 1);
+    const auto clustersCurrentLayer = getClustersOnLayer((int32_t)pivotRofId, nRofs, 1, rofClusters, clusters);
+    if (clustersCurrentLayer.empty()) {
+      continue;
     }
-    printf("\n");
-  }
-}
-
-GPUg() void resetTrackletsKernel(Tracklet* tracklets, const int nTracklets)
-{
-  for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nTracklets; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) {
-    new (tracklets + iCurrentLayerClusterIndex) Tracklet{};
-  }
-}
-
-GPUg() void dumpFoundTrackletsKernel(const Tracklet* tracklets, const int* nTracklet, const unsigned int nClustersMiddleLayer, const int maxTrackletsPerCluster)
-{
-  for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nClustersMiddleLayer; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) {
-    const int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
-    for (int iTracklet{0}; iTracklet < nTracklet[iCurrentLayerClusterIndex]; ++iTracklet) {
-      auto& t = tracklets[stride + iTracklet];
-      t.dump();
-    }
-  }
-}
-
-GPUg() void dumpMaximaKernel(const cub::KeyValuePair<int, int>* tmpVertexBins, const int threadId)
-{
-  if (blockIdx.x * blockDim.x + threadIdx.x == threadId) {
-    printf("XmaxBin: %d at index: %d | YmaxBin: %d at index: %d | ZmaxBin: %d at index: %d\n",
-           tmpVertexBins[0].value, tmpVertexBins[0].key,
-           tmpVertexBins[1].value, tmpVertexBins[1].key,
-           tmpVertexBins[2].value, tmpVertexBins[2].key);
-  }
-}
-
-template <TrackletMode Mode>
-GPUg() void trackleterKernelSingleRof(
-  const Cluster* clustersNextLayer,    // 0 2
-  const Cluster* clustersCurrentLayer, // 1 1
-  const int sizeNextLClusters,
-  const int sizeCurrentLClusters,
-  const int* indexTableNext,
-  const float phiCut,
-  Tracklet* Tracklets,
-  int* foundTracklets,
-  const IndexTableUtils* utils,
-  const short rofId,
-  const unsigned int maxTrackletsPerCluster = 1e2)
-{
-  const int phiBins{utils->getNphiBins()};
-  const int zBins{utils->getNzBins()};
-  // loop on layer1 clusters
-  for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < sizeCurrentLClusters; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) {
-    if (iCurrentLayerClusterIndex < sizeCurrentLClusters) {
-      unsigned int storedTracklets{0};
-      const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
-      const Cluster& currentCluster = clustersCurrentLayer[iCurrentLayerClusterIndex];
-      const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)};
+    auto trackletsPerCluster = getNTrackletsPerCluster(pivotRofId, nRofs, iMode, rofClusters, trackletOffsets);
+    for (uint32_t iCurrentLayerClusterIndex{threadIdx.x}; iCurrentLayerClusterIndex < (uint32_t)clustersCurrentLayer.size(); iCurrentLayerClusterIndex += blockDim.x) {
+      for (int16_t i{0}; i < (int16_t)((2 * deltaRof) + 1); ++i) {
+        storedTrackletsLocal[i] = 0;
+      }
+      const Cluster& GPUrestrict() currentCluster { clustersCurrentLayer[iCurrentLayerClusterIndex] };
+      const int4 selectedBinsRect{getBinsRect(currentCluster, (int)Mode, utils, 0.f, 0.f, 50.f, phiCut / 2)};
       if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) {
         int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
         if (phiBinsNum < 0) {
           phiBinsNum += phiBins;
         }
-        // loop on phi bins next layer
-        for (unsigned int iPhiBin{(unsigned int)selectedBinsRect.y}, iPhiCount{0}; iPhiCount < (unsigned int)phiBinsNum; iPhiBin = ++iPhiBin == phiBins ? 0 : iPhiBin, iPhiCount++) {
-          const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
-          const int firstRowClusterIndex{indexTableNext[firstBinIndex]};
-          const int maxRowClusterIndex{indexTableNext[firstBinIndex + zBins]};
-          // loop on clusters next layer
-          for (int iNextLayerClusterIndex{firstRowClusterIndex}; iNextLayerClusterIndex < maxRowClusterIndex && iNextLayerClusterIndex < sizeNextLClusters; ++iNextLayerClusterIndex) {
-            const Cluster& nextCluster = clustersNextLayer[iNextLayerClusterIndex];
-            if (o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi) < phiCut) {
-              if (storedTracklets < maxTrackletsPerCluster) {
-                if constexpr (Mode == TrackletMode::Layer0Layer1) {
-                  new (Tracklets + stride + storedTracklets) Tracklet{iNextLayerClusterIndex, iCurrentLayerClusterIndex, nextCluster, currentCluster, rofId, rofId};
-                } else {
-                  new (Tracklets + stride + storedTracklets) Tracklet{iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, rofId, rofId};
+        for (int32_t iPhiBin{selectedBinsRect.y}, iPhiCount{0}; iPhiCount < phiBinsNum; iPhiBin = ++iPhiBin == phiBins ? 0 : iPhiBin, iPhiCount++) {
+          for (uint16_t targetRofId{startROF}; targetRofId < endROF; ++targetRofId) {
+            uint16_t& storedTracklets = storedTrackletsLocal[pivotRofId - targetRofId + deltaRof];
+            const int32_t firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
+            const int32_t maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
+            const int32_t firstRowClusterIndex{clusterIndexTables[(int)Mode][(targetRofId)*tableSize + firstBinIndex]};
+            const int32_t maxRowClusterIndex{clusterIndexTables[(int)Mode][(targetRofId)*tableSize + maxBinIndex]};
+            auto clustersNextLayer = getClustersOnLayer((int32_t)targetRofId, nRofs, (int32_t)Mode, rofClusters, clusters);
+            if (clustersNextLayer.empty()) {
+              continue;
+            }
+            for (int32_t iNextLayerClusterIndex{firstRowClusterIndex}; iNextLayerClusterIndex < maxRowClusterIndex && iNextLayerClusterIndex < (int32_t)clustersNextLayer.size(); ++iNextLayerClusterIndex) {
+              if (iteration && usedClusters[(int32_t)Mode][iNextLayerClusterIndex]) {
+                continue;
+              }
+              const Cluster& GPUrestrict() nextCluster { clustersNextLayer[iNextLayerClusterIndex] };
+              if (o2::gpu::GPUCommonMath::Abs(math_utils::smallestAngleDifference(currentCluster.phi, nextCluster.phi)) < phiCut) {
+                if (storedTracklets < maxTrackletsPerCluster) {
+                  if constexpr (!dryRun) {
+                    if constexpr (Mode == TrackletMode::Layer0Layer1) {
+                      tracklets[0][trackletsPerCluster[iCurrentLayerClusterIndex] + storedTracklets] = Tracklet{iNextLayerClusterIndex, (int)iCurrentLayerClusterIndex, nextCluster, currentCluster, (short)targetRofId, (short)pivotRofId};
+                    } else {
+                      tracklets[1][trackletsPerCluster[iCurrentLayerClusterIndex] + storedTracklets] = Tracklet{(int)iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, (short)pivotRofId, (short)targetRofId};
+                    }
+                  }
+                  ++storedTracklets;
                 }
-                ++storedTracklets;
               }
             }
           }
         }
       }
-      foundTracklets[iCurrentLayerClusterIndex] = storedTracklets;
-      if (storedTracklets >= maxTrackletsPerCluster) {
-        printf("gpu tracklet finder: some lines will be left behind for cluster %d. valid: %u max: %zu\n", iCurrentLayerClusterIndex, storedTracklets, maxTrackletsPerCluster);
+      if constexpr (dryRun) {
+        for (int32_t i{0}; i < (int32_t)((2 * deltaRof) + 1); ++i) {
+          trackletsPerCluster[iCurrentLayerClusterIndex] += storedTrackletsLocal[i];
+        }
       }
     }
   }
 }
 
-template <TrackletMode Mode>
-GPUg() void trackleterKernelMultipleRof(
-  const Cluster* clustersNextLayer,    // 0 2
-  const Cluster* clustersCurrentLayer, // 1 1
-  const int* sizeNextLClusters,
-  const int* sizeCurrentLClusters,
-  const int* nextIndexTables,
-  Tracklet* Tracklets,
-  int* foundTracklets,
-  const IndexTableUtils* utils,
-  const short startRofId,
-  const short rofSize,
-  const float phiCut,
-  const unsigned int maxTrackletsPerCluster = 1e2)
+template <bool dryRun>
+GPUg() void computeTrackletSelectionMutliROFKernel(const Cluster** GPUrestrict() clusters,
+                                                   maybe_const<!dryRun, uint8_t>** GPUrestrict() usedClusters,
+                                                   const int32_t** GPUrestrict() rofClusters,
+                                                   const float phiCut,
+                                                   const float tanLambdaCut,
+                                                   const Tracklet** GPUrestrict() tracklets,
+                                                   uint8_t* GPUrestrict() usedTracklets,
+                                                   const int32_t** GPUrestrict() trackletOffsets,
+                                                   const int32_t** GPUrestrict() trackletLUTs,
+                                                   maybe_const<!dryRun, int32_t>* lineOffsets,
+                                                   maybe_const<dryRun, Line>* GPUrestrict() lines,
+                                                   const int32_t nRofs,
+                                                   const int32_t deltaRof,
+                                                   const int32_t maxTracklets)
 {
-  const int phiBins{utils->getNphiBins()};
-  const int zBins{utils->getNzBins()};
-  for (auto iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) {
-    short rof = static_cast<short>(iRof) + startRofId;
-    auto* clustersNextLayerRof = clustersNextLayer + (sizeNextLClusters[rof] - sizeNextLClusters[startRofId]);
-    auto* clustersCurrentLayerRof = clustersCurrentLayer + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]);
-    auto nClustersNextLayerRof = sizeNextLClusters[rof + 1] - sizeNextLClusters[rof];
-    auto nClustersCurrentLayerRof = sizeCurrentLClusters[rof + 1] - sizeCurrentLClusters[rof];
-    auto* indexTableNextRof = nextIndexTables + iRof * (phiBins * zBins + 1);
-    auto* TrackletsRof = Tracklets + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]) * maxTrackletsPerCluster;
-    auto* foundTrackletsRof = foundTracklets + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]);
+  for (uint32_t pivotRofId{blockIdx.x}; pivotRofId < nRofs; pivotRofId += gridDim.x) {
+    const int16_t startROF = o2::gpu::CAMath::Max(0, (int32_t)pivotRofId - deltaRof);
+    const int16_t endROF = o2::gpu::CAMath::Min(nRofs, (int32_t)pivotRofId + deltaRof + 1);
 
-    // single rof loop on layer1 clusters
-    for (int iCurrentLayerClusterIndex = threadIdx.x; iCurrentLayerClusterIndex < nClustersCurrentLayerRof; iCurrentLayerClusterIndex += blockDim.x) {
-      unsigned int storedTracklets{0};
-      const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
-      const Cluster& currentCluster = clustersCurrentLayerRof[iCurrentLayerClusterIndex];
-      const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)};
-      if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) {
-        int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
-        if (phiBinsNum < 0) {
-          phiBinsNum += phiBins;
-        }
-        // loop on phi bins next layer
-        for (unsigned int iPhiBin{(unsigned int)selectedBinsRect.y}, iPhiCount{0}; iPhiCount < (unsigned int)phiBinsNum; iPhiBin = ++iPhiBin == phiBins ? 0 : iPhiBin, iPhiCount++) {
-          const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
-          const int firstRowClusterIndex{indexTableNextRof[firstBinIndex]};
-          const int maxRowClusterIndex{indexTableNextRof[firstBinIndex + zBins]};
-          // loop on clusters next layer
-          for (int iNextLayerClusterIndex{firstRowClusterIndex}; iNextLayerClusterIndex < maxRowClusterIndex && iNextLayerClusterIndex < nClustersNextLayerRof; ++iNextLayerClusterIndex) {
-            const Cluster& nextCluster = clustersNextLayerRof[iNextLayerClusterIndex];
-            if (o2::gpu::GPUCommonMath::Abs(smallestAngleDifference(currentCluster.phi, nextCluster.phi)) < phiCut) {
-              if (storedTracklets < maxTrackletsPerCluster) {
-                if constexpr (Mode == TrackletMode::Layer0Layer1) {
-                  new (TrackletsRof + stride + storedTracklets) Tracklet{iNextLayerClusterIndex, iCurrentLayerClusterIndex, nextCluster, currentCluster, rof, rof};
-                } else {
-                  new (TrackletsRof + stride + storedTracklets) Tracklet{iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, rof, rof};
-                }
-                ++storedTracklets;
-              }
+    const uint32_t clusterOffset = rofClusters[1][pivotRofId];
+    const uint32_t nClustersCurrentLayer = rofClusters[1][pivotRofId + 1] - clusterOffset;
+    if (nClustersCurrentLayer <= 0) {
+      continue;
+    }
+
+    auto linesPerCluster = getNLinesPerCluster(pivotRofId, nRofs, rofClusters, lineOffsets);
+    auto nTrackletsPerCluster01 = getNTrackletsPerCluster(pivotRofId, nRofs, 0, rofClusters, trackletOffsets);
+    auto nTrackletsPerCluster12 = getNTrackletsPerCluster(pivotRofId, nRofs, 1, rofClusters, trackletOffsets);
+
+    for (uint32_t iCurrentLayerClusterIndex{threadIdx.x}; iCurrentLayerClusterIndex < nClustersCurrentLayer; iCurrentLayerClusterIndex += blockDim.x) {
+      int32_t validTracklets{0};
+      const int32_t nTracklets01 = nTrackletsPerCluster01[iCurrentLayerClusterIndex];
+      const int32_t nTracklets12 = nTrackletsPerCluster12[iCurrentLayerClusterIndex];
+      for (int32_t iTracklet12{0}; iTracklet12 < nTracklets12; ++iTracklet12) {
+        for (int32_t iTracklet01{0}; iTracklet01 < nTracklets01; ++iTracklet01) {
+
+          if (usedTracklets[trackletLUTs[0][clusterOffset + iCurrentLayerClusterIndex] + iTracklet01]) {
+            continue;
+          }
+
+          const auto& GPUrestrict() tracklet01 { tracklets[0][trackletLUTs[0][clusterOffset + iCurrentLayerClusterIndex] + iTracklet01] };
+          const auto& GPUrestrict() tracklet12 { tracklets[1][trackletLUTs[1][clusterOffset + iCurrentLayerClusterIndex] + iTracklet12] };
+          const int16_t rof0 = tracklet01.rof[0];
+          const int16_t rof2 = tracklet12.rof[1];
+          if (deltaRof > 0 && ((rof0 < startROF) || (rof0 >= endROF) || (rof2 < startROF) || (rof2 >= endROF) || (o2::gpu::CAMath::Abs(rof0 - rof2) > deltaRof))) {
+            continue;
+          }
+
+          const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(tracklet01.tanLambda - tracklet12.tanLambda)};
+          const float deltaPhi{o2::gpu::GPUCommonMath::Abs(math_utils::smallestAngleDifference(tracklet01.phi, tracklet12.phi))};
+          //
+          if (deltaTanLambda < tanLambdaCut && deltaPhi < phiCut && validTracklets < maxTracklets) {
+            // TODO use atomics to avoid race conditions for torn writes but is it needed here?
+            usedTracklets[trackletLUTs[0][clusterOffset + iCurrentLayerClusterIndex] + iTracklet01] = 1;
+            if constexpr (dryRun) {
+              usedClusters[0][rofClusters[0][rof0] + tracklet01.firstClusterIndex] = 1;
+              usedClusters[2][rofClusters[2][rof2] + tracklet12.secondClusterIndex] = 1;
+            } else {
+              const Cluster* clusters0 = clusters[0] + rofClusters[0][tracklet01.rof[0]];
+              const Cluster* clusters1 = clusters[1] + rofClusters[1][tracklet01.rof[1]];
+              lines[lineOffsets[iCurrentLayerClusterIndex] + validTracklets] = Line(tracklet01, clusters0, clusters1);
             }
+            ++validTracklets;
           }
         }
       }
-      foundTrackletsRof[iCurrentLayerClusterIndex] = storedTracklets;
-      // if (storedTracklets >= maxTrackletsPerCluster && storedTracklets - maxTrackletsPerCluster < 5) {
-      //   printf("gpu tracklet finder: some lines will be left behind for cluster %d in rof: %d. valid: %u max: %lu (suppressing after 5 msgs)\n", iCurrentLayerClusterIndex, rof, storedTracklets, maxTrackletsPerCluster);
-      // }
+
+      if constexpr (dryRun) {
+        linesPerCluster[iCurrentLayerClusterIndex] = validTracklets;
+      }
     }
   }
 }
 
-template <bool initRun>
-GPUg() void trackletSelectionKernelSingleRof(
-  const Cluster* clusters0,
-  const Cluster* clusters1,
-  const unsigned int nClustersMiddleLayer,
-  Tracklet* tracklets01,
-  Tracklet* tracklets12,
-  const int* nFoundTracklet01,
-  const int* nFoundTracklet12,
-  unsigned char* usedTracklets,
-  Line* lines,
-  int* nFoundLines,
-  int* nExclusiveFoundLines,
-  const int maxTrackletsPerCluster = 1e2,
-  const float tanLambdaCut = 0.025f,
-  const float phiCut = 0.002f)
+template <TrackletMode Mode>
+GPUg() void compileTrackletsPerROFKernel(const int32_t nRofs,
+                                         int** GPUrestrict() nTrackletsPerROF,
+                                         const int32_t** GPUrestrict() rofClusters,
+                                         const int32_t** GPUrestrict() nTrackletsPerCluster)
 {
-  for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nClustersMiddleLayer; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) {
-    const int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
-    int validTracklets{0};
-    for (int iTracklet12{0}; iTracklet12 < nFoundTracklet12[iCurrentLayerClusterIndex]; ++iTracklet12) {
-      for (int iTracklet01{0}; iTracklet01 < nFoundTracklet01[iCurrentLayerClusterIndex] && validTracklets < maxTrackletsPerCluster; ++iTracklet01) {
-        const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(tracklets01[stride + iTracklet01].tanLambda - tracklets12[stride + iTracklet12].tanLambda)};
-        const float deltaPhi{o2::gpu::GPUCommonMath::Abs(smallestAngleDifference(tracklets01[stride + iTracklet01].phi, tracklets12[stride + iTracklet12].phi))};
-        if (!usedTracklets[stride + iTracklet01] && deltaTanLambda < tanLambdaCut && deltaPhi < phiCut && validTracklets != maxTrackletsPerCluster) {
-          usedTracklets[stride + iTracklet01] = true;
-          if constexpr (!initRun) {
-            new (lines + nExclusiveFoundLines[iCurrentLayerClusterIndex] + validTracklets) Line{tracklets01[stride + iTracklet01], clusters0, clusters1};
-          }
-          ++validTracklets;
-        }
-      }
+  // TODO is this the best reduction kernel?
+  constexpr int32_t iMode = (Mode == TrackletMode::Layer0Layer1) ? 0 : 1;
+  extern __shared__ int32_t ssum[];
+  for (uint32_t rof = blockIdx.x; rof < (uint32_t)nRofs; rof += gridDim.x) {
+    const auto& GPUrestrict() currentNTracklets = getNTrackletsPerCluster(rof, nRofs, iMode, rofClusters, nTrackletsPerCluster);
+    int32_t localSum = 0;
+    for (uint32_t ci = threadIdx.x; ci < (uint32_t)currentNTracklets.size(); ci += blockDim.x) {
+      localSum += currentNTracklets[ci];
     }
-    if constexpr (initRun) {
-      nFoundLines[iCurrentLayerClusterIndex] = validTracklets;
-      if (validTracklets >= maxTrackletsPerCluster) {
-        printf("gpu tracklet selection: some lines will be left behind for cluster %d. valid: %d max: %d\n", iCurrentLayerClusterIndex, validTracklets, maxTrackletsPerCluster);
+    ssum[threadIdx.x] = localSum;
+    __syncthreads();
+    for (uint32_t stride = blockDim.x / 2; stride > 0; stride >>= 1) {
+      if (threadIdx.x < stride) {
+        ssum[threadIdx.x] += ssum[threadIdx.x + stride];
       }
+      __syncthreads();
+    }
+    if (threadIdx.x == 0) {
+      nTrackletsPerROF[iMode][rof] = ssum[0];
     }
   }
 }
 
-template <bool initRun>
-GPUg() void trackletSelectionKernelMultipleRof(
-  const Cluster* clusters0,               // Clusters on layer 0
-  const Cluster* clusters1,               // Clusters on layer 1
-  const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-  const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
-  Tracklet* tracklets01,                  // Tracklets on layer 0-1
-  Tracklet* tracklets12,                  // Tracklets on layer 1-2
-  const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
-  const int* nFoundTracklets12,           // Number of tracklets found on layers 1-2
-  unsigned char* usedTracklets,           // Used tracklets
-  Line* lines,                            // Lines
-  int* nFoundLines,                       // Number of found lines
-  int* nExclusiveFoundLines,              // Number of found lines exclusive scan
-  const unsigned int startRofId,          // Starting ROF ID
-  const unsigned int rofSize,             // Number of ROFs to consider
-  const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
-  const float tanLambdaCut = 0.025f,      // Cut on tan lambda
-  const float phiCut = 0.002f)            // Cut on phi
+template <typename T>
+GPUhi() void cubExclusiveScan(const T* GPUrestrict() in, T* GPUrestrict() out, int32_t num_items, cudaStream_t stream)
 {
-  for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) {
-    auto rof = iRof + startRofId;
-    auto* clustersL0Rof = clusters0 + (sizeClustersL0[rof] - sizeClustersL0[startRofId]);
-    auto clustersL1offsetRof = sizeClustersL1[rof] - sizeClustersL1[startRofId];
-    auto* clustersL1Rof = clusters1 + clustersL1offsetRof;
-    auto nClustersL1Rof = sizeClustersL1[rof + 1] - sizeClustersL1[rof];
-    auto* tracklets01Rof = tracklets01 + clustersL1offsetRof * maxTrackletsPerCluster;
-    auto* tracklets12Rof = tracklets12 + clustersL1offsetRof * maxTrackletsPerCluster;
-    auto* foundTracklets01Rof = nFoundTracklets01 + clustersL1offsetRof;
-    auto* foundTracklets12Rof = nFoundTracklets12 + clustersL1offsetRof;
-    auto* usedTrackletsRof = usedTracklets + clustersL1offsetRof * maxTrackletsPerCluster;
-    auto* foundLinesRof = nFoundLines + clustersL1offsetRof;
-    int* nExclusiveFoundLinesRof = nullptr;
-    if constexpr (!initRun) {
-      nExclusiveFoundLinesRof = nExclusiveFoundLines + clustersL1offsetRof;
-    }
-    for (int iClusterIndexLayer1 = threadIdx.x; iClusterIndexLayer1 < nClustersL1Rof; iClusterIndexLayer1 += blockDim.x) {
-      const int stride{iClusterIndexLayer1 * maxTrackletsPerCluster};
-      int validTracklets{0};
-      for (int iTracklet12{0}; iTracklet12 < foundTracklets12Rof[iClusterIndexLayer1]; ++iTracklet12) {
-        for (int iTracklet01{0}; iTracklet01 < foundTracklets01Rof[iClusterIndexLayer1] && validTracklets < maxTrackletsPerCluster; ++iTracklet01) {
-          const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(tracklets01Rof[stride + iTracklet01].tanLambda - tracklets12Rof[stride + iTracklet12].tanLambda)};
-          const float deltaPhi{o2::gpu::GPUCommonMath::Abs(tracklets01Rof[stride + iTracklet01].phi - tracklets12Rof[stride + iTracklet12].phi)};
-          if (!usedTrackletsRof[stride + iTracklet01] && deltaTanLambda < tanLambdaCut && deltaPhi < phiCut && validTracklets != maxTrackletsPerCluster) {
-            usedTrackletsRof[stride + iTracklet01] = true;
-            if constexpr (!initRun) {
-              new (lines + nExclusiveFoundLinesRof[iClusterIndexLayer1] + validTracklets) Line{tracklets01Rof[stride + iTracklet01], clustersL0Rof, clustersL1Rof};
-            }
-            ++validTracklets;
-          }
-        }
-      }
-      if constexpr (initRun) {
-        foundLinesRof[iClusterIndexLayer1] = validTracklets;
-        // if (validTracklets >= maxTrackletsPerCluster) {
-        // printf("gpu tracklet selection: some lines will be left behind for cluster %d. valid: %d max: %d\n", iClusterIndexLayer1, validTracklets, maxTrackletsPerCluster);
-        // }
-      }
-    }
-  } // rof loop
+  void* d_temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in, out + 1, num_items, stream));
+  GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream));
+  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in, out + 1, num_items, stream));
+  GPUChkErrS(cudaFreeAsync(d_temp_storage, stream));
+}
+
+} // namespace gpu
+
+template <int nLayers>
+void countTrackletsInROFsHandler(const IndexTableUtils<nLayers>* GPUrestrict() utils,
+                                 const uint8_t* GPUrestrict() multMask,
+                                 const int32_t nRofs,
+                                 const int32_t deltaROF,
+                                 const int32_t* GPUrestrict() rofPV,
+                                 const int32_t vertPerRofThreshold,
+                                 const Cluster** GPUrestrict() clusters,
+                                 const uint32_t nClusters,
+                                 const int32_t** GPUrestrict() ROFClusters,
+                                 const uint8_t** GPUrestrict() usedClusters,
+                                 const int32_t** GPUrestrict() clustersIndexTables,
+                                 int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                 int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                 int32_t** GPUrestrict() trackletsPerROF,
+                                 const std::array<int32_t*, 2>& trackletsPerClusterLUTsHost,
+                                 const std::array<int32_t*, 2>& trackletsPerClusterSumLUTsHost,
+                                 const int32_t iteration,
+                                 const float phiCut,
+                                 const int32_t maxTrackletsPerCluster,
+                                 const int32_t nBlocks,
+                                 const int32_t nThreads,
+                                 gpu::Streams& streams)
+{
+  const uint32_t sharedBytes = nThreads * (2 * deltaROF + 1) * sizeof(uint16_t);
+  gpu::computeLayerTrackletMutliROFKernel<nLayers, TrackletMode::Layer0Layer1, true><<<nBlocks, nThreads, sharedBytes, streams[0].get()>>>(clusters,
+                                                                                                                                           ROFClusters,
+                                                                                                                                           usedClusters,
+                                                                                                                                           clustersIndexTables,
+                                                                                                                                           phiCut,
+                                                                                                                                           nullptr,
+                                                                                                                                           trackletsPerClusterLUTs,
+                                                                                                                                           utils,
+                                                                                                                                           nRofs,
+                                                                                                                                           deltaROF,
+                                                                                                                                           rofPV,
+                                                                                                                                           iteration,
+                                                                                                                                           vertPerRofThreshold,
+                                                                                                                                           maxTrackletsPerCluster);
+  gpu::compileTrackletsPerROFKernel<TrackletMode::Layer0Layer1><<<nBlocks, nThreads, nThreads * sizeof(int32_t), streams[0].get()>>>(nRofs, trackletsPerROF, ROFClusters, (const int32_t**)trackletsPerClusterLUTs);
+  gpu::cubExclusiveScan(trackletsPerClusterLUTsHost[0], trackletsPerClusterSumLUTsHost[0], nClusters, streams[0].get());
+
+  gpu::computeLayerTrackletMutliROFKernel<nLayers, TrackletMode::Layer1Layer2, true><<<nBlocks, nThreads, sharedBytes, streams[1].get()>>>(clusters,
+                                                                                                                                           ROFClusters,
+                                                                                                                                           usedClusters,
+                                                                                                                                           clustersIndexTables,
+                                                                                                                                           phiCut,
+                                                                                                                                           nullptr,
+                                                                                                                                           trackletsPerClusterLUTs,
+                                                                                                                                           utils,
+                                                                                                                                           nRofs,
+                                                                                                                                           deltaROF,
+                                                                                                                                           rofPV,
+                                                                                                                                           iteration,
+                                                                                                                                           vertPerRofThreshold,
+                                                                                                                                           maxTrackletsPerCluster);
+  gpu::compileTrackletsPerROFKernel<TrackletMode::Layer1Layer2><<<nBlocks, nThreads, nThreads * sizeof(int), streams[1].get()>>>(nRofs, trackletsPerROF, ROFClusters, (const int**)trackletsPerClusterLUTs);
+  gpu::cubExclusiveScan(trackletsPerClusterLUTsHost[1], trackletsPerClusterSumLUTsHost[1], nClusters, streams[1].get());
+}
+
+template <int32_t nLayers>
+void computeTrackletsInROFsHandler(const IndexTableUtils<nLayers>* GPUrestrict() utils,
+                                   const uint8_t* GPUrestrict() multMask,
+                                   const int32_t nRofs,
+                                   const int32_t deltaROF,
+                                   const int32_t* GPUrestrict() rofPV,
+                                   const int vertPerRofThreshold,
+                                   const Cluster** GPUrestrict() clusters,
+                                   const uint32_t nClusters,
+                                   const int32_t** GPUrestrict() ROFClusters,
+                                   const uint8_t** GPUrestrict() usedClusters,
+                                   const int32_t** GPUrestrict() clustersIndexTables,
+                                   Tracklet** GPUrestrict() foundTracklets,
+                                   const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                   const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                   const int32_t** GPUrestrict() trackletsPerROF,
+                                   const int32_t iteration,
+                                   const float phiCut,
+                                   const int32_t maxTrackletsPerCluster,
+                                   const int32_t nBlocks,
+                                   const int32_t nThreads,
+                                   gpu::Streams& streams)
+{
+  const uint32_t sharedBytes = nThreads * (2 * deltaROF + 1) * sizeof(uint16_t);
+  gpu::computeLayerTrackletMutliROFKernel<nLayers, TrackletMode::Layer0Layer1, false><<<nBlocks, nThreads, sharedBytes, streams[0].get()>>>(clusters,
+                                                                                                                                            ROFClusters,
+                                                                                                                                            usedClusters,
+                                                                                                                                            clustersIndexTables,
+                                                                                                                                            phiCut,
+                                                                                                                                            foundTracklets,
+                                                                                                                                            trackletsPerClusterSumLUTs,
+                                                                                                                                            utils,
+                                                                                                                                            nRofs,
+                                                                                                                                            deltaROF,
+                                                                                                                                            rofPV,
+                                                                                                                                            iteration,
+                                                                                                                                            vertPerRofThreshold,
+                                                                                                                                            maxTrackletsPerCluster);
+  gpu::computeLayerTrackletMutliROFKernel<nLayers, TrackletMode::Layer1Layer2, false><<<nBlocks, nThreads, sharedBytes, streams[1].get()>>>(clusters,
+                                                                                                                                            ROFClusters,
+                                                                                                                                            usedClusters,
+                                                                                                                                            clustersIndexTables,
+                                                                                                                                            phiCut,
+                                                                                                                                            foundTracklets,
+                                                                                                                                            trackletsPerClusterSumLUTs,
+                                                                                                                                            utils,
+                                                                                                                                            nRofs,
+                                                                                                                                            deltaROF,
+                                                                                                                                            rofPV,
+                                                                                                                                            iteration,
+                                                                                                                                            vertPerRofThreshold,
+                                                                                                                                            maxTrackletsPerCluster);
+}
+
+void countTrackletsMatchingInROFsHandler(const int32_t nRofs,
+                                         const int32_t deltaROF,
+                                         const uint32_t nClusters,
+                                         const int32_t** GPUrestrict() ROFClusters,
+                                         const Cluster** GPUrestrict() clusters,
+                                         uint8_t** GPUrestrict() usedClusters,
+                                         const Tracklet** GPUrestrict() foundTracklets,
+                                         uint8_t* GPUrestrict() usedTracklets,
+                                         const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                         const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                         int32_t* GPUrestrict() linesPerClusterLUT,
+                                         int32_t* GPUrestrict() linesPerClusterSumLUT,
+                                         const int32_t iteration,
+                                         const float phiCut,
+                                         const float tanLambdaCut,
+                                         const int32_t nBlocks,
+                                         const int32_t nThreads,
+                                         gpu::Streams& streams)
+{
+  streams[1].sync(); // need to make sure that all tracklets are done, since this placed in 0 tracklet01 will be done but tracklet12 needs to be guaranteed
+  gpu::computeTrackletSelectionMutliROFKernel<true><<<nBlocks, nThreads, 0, streams[0].get()>>>(nullptr,
+                                                                                                usedClusters,
+                                                                                                ROFClusters,
+                                                                                                phiCut,
+                                                                                                tanLambdaCut,
+                                                                                                foundTracklets,
+                                                                                                usedTracklets,
+                                                                                                trackletsPerClusterLUTs,
+                                                                                                trackletsPerClusterSumLUTs,
+                                                                                                linesPerClusterLUT,
+                                                                                                nullptr,
+                                                                                                nRofs,
+                                                                                                deltaROF,
+                                                                                                100);
+  gpu::cubExclusiveScan(linesPerClusterLUT, linesPerClusterSumLUT, nClusters, streams[0].get());
 }
 
+void computeTrackletsMatchingInROFsHandler(const int32_t nRofs,
+                                           const int32_t deltaROF,
+                                           const uint32_t nClusters,
+                                           const int32_t** GPUrestrict() ROFClusters,
+                                           const Cluster** GPUrestrict() clusters,
+                                           const uint8_t** GPUrestrict() usedClusters,
+                                           const Tracklet** GPUrestrict() foundTracklets,
+                                           uint8_t* GPUrestrict() usedTracklets,
+                                           const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                           const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                           const int32_t* GPUrestrict() linesPerClusterSumLUT,
+                                           Line* GPUrestrict() lines,
+                                           const int32_t iteration,
+                                           const float phiCut,
+                                           const float tanLambdaCut,
+                                           const int32_t nBlocks,
+                                           const int32_t nThreads,
+                                           gpu::Streams& streams)
+{
+  gpu::computeTrackletSelectionMutliROFKernel<false><<<nBlocks, nThreads, 0, streams[0].get()>>>(clusters,
+                                                                                                 nullptr,
+                                                                                                 ROFClusters,
+                                                                                                 phiCut,
+                                                                                                 tanLambdaCut,
+                                                                                                 foundTracklets,
+                                                                                                 usedTracklets,
+                                                                                                 trackletsPerClusterLUTs,
+                                                                                                 trackletsPerClusterSumLUTs,
+                                                                                                 linesPerClusterSumLUT,
+                                                                                                 lines,
+                                                                                                 nRofs,
+                                                                                                 deltaROF,
+                                                                                                 100);
+}
+
+/// Explicit instantiation of ITS2 handlers
+template void countTrackletsInROFsHandler<7>(const IndexTableUtils<7>* GPUrestrict() utils,
+                                             const uint8_t* GPUrestrict() multMask,
+                                             const int32_t nRofs,
+                                             const int32_t deltaROF,
+                                             const int32_t* GPUrestrict() rofPV,
+                                             const int32_t vertPerRofThreshold,
+                                             const Cluster** GPUrestrict() clusters,
+                                             const uint32_t nClusters,
+                                             const int32_t** GPUrestrict() ROFClusters,
+                                             const uint8_t** GPUrestrict() usedClusters,
+                                             const int32_t** GPUrestrict() clustersIndexTables,
+                                             int32_t** trackletsPerClusterLUTs,
+                                             int32_t** trackletsPerClusterSumLUTs,
+                                             int32_t** trackletsPerROF,
+                                             const std::array<int32_t*, 2>& trackletsPerClusterLUTsHost,
+                                             const std::array<int32_t*, 2>& trackletsPerClusterSumLUTsHost,
+                                             const int32_t iteration,
+                                             const float phiCut,
+                                             const int32_t maxTrackletsPerCluster,
+                                             const int32_t nBlocks,
+                                             const int32_t nThreads,
+                                             gpu::Streams& streams);
+
+template void computeTrackletsInROFsHandler<7>(const IndexTableUtils<7>* GPUrestrict() utils,
+                                               const uint8_t* GPUrestrict() multMask,
+                                               const int32_t nRofs,
+                                               const int32_t deltaROF,
+                                               const int32_t* GPUrestrict() rofPV,
+                                               const int vertPerRofThreshold,
+                                               const Cluster** GPUrestrict() clusters,
+                                               const uint32_t nClusters,
+                                               const int32_t** GPUrestrict() ROFClusters,
+                                               const uint8_t** GPUrestrict() usedClusters,
+                                               const int32_t** GPUrestrict() clustersIndexTables,
+                                               Tracklet** GPUrestrict() foundTracklets,
+                                               const int32_t** GPUrestrict() trackletsPerClusterLUTs,
+                                               const int32_t** GPUrestrict() trackletsPerClusterSumLUTs,
+                                               const int32_t** GPUrestrict() trackletsPerROF,
+                                               const int32_t iteration,
+                                               const float phiCut,
+                                               const int32_t maxTrackletsPerCluster,
+                                               const int32_t nBlocks,
+                                               const int32_t nThreads,
+                                               gpu::Streams& streams);
+/*
 GPUg() void lineClustererMultipleRof(
   const int* sizeClustersL1,     // Number of clusters on layer 1 per ROF
   Line* lines,                   // Lines
@@ -567,6 +657,4 @@ GPUg() void computeVertexKernel(
   }
 }
 */
-} // namespace gpu
-} // namespace its
-} // namespace o2
+} // namespace o2::its
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
index dd83669311a54..e8e475f2232c8 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
@@ -19,8 +19,8 @@ if(HIP_ENABLED)
                          ../cuda/TrackerTraitsGPU.cxx
                          ../cuda/TracerGPU.cu
                          ../cuda/TrackingKernels.cu
-                         # ../cuda/VertexingKernels.cu
-                         # ../cuda/VertexerTraitsGPU.cxx
+                         ../cuda/VertexingKernels.cu
+                         ../cuda/VertexerTraitsGPU.cxx
                  PUBLIC_INCLUDE_DIRECTORIES ../
                  PUBLIC_LINK_LIBRARIES O2::ITStracking
                                        O2::GPUTracking
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h
index 72857794c711e..c3be0de2dade7 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h
@@ -15,6 +15,8 @@
 #ifndef TRACKINGITS_DEFINITIONS_H_
 #define TRACKINGITS_DEFINITIONS_H_
 
+#include <type_traits>
+
 #include "ReconstructionDataFormats/Vertex.h"
 
 #ifdef CA_DEBUG
@@ -28,8 +30,16 @@
 namespace o2::its
 {
 
+enum class TrackletMode {
+  Layer0Layer1 = 0,
+  Layer1Layer2 = 2
+};
+
 using Vertex = o2::dataformats::Vertex<o2::dataformats::TimeStamp<int>>;
 
-}
+template <bool IsConst, typename T>
+using maybe_const = typename std::conditional<IsConst, const T, T>::type;
+
+} // namespace o2::its
 
 #endif
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index a148049e50129..c34701ce222e2 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -204,10 +204,8 @@ struct TimeFrame {
   void computeTracletsPerClusterScans();
   int& getNTrackletsROF(int rofId, int combId) { return mNTrackletsPerROF[combId][rofId]; }
   auto& getLines(int rofId) { return mLines[rofId]; }
-  int getNLinesTotal() const
-  {
-    return std::accumulate(mLines.begin(), mLines.end(), 0, [](int sum, const auto& l) { return sum + l.size(); });
-  }
+  int getNLinesTotal() const noexcept { return mTotalLines; }
+  void setNLinesTotal(uint32_t a) noexcept { mTotalLines = a; }
   auto& getTrackletClusters(int rofId) { return mTrackletClusters[rofId]; }
   gsl::span<const Tracklet> getFoundTracklets(int rofId, int combId) const;
   gsl::span<Tracklet> getFoundTracklets(int rofId, int combId);
@@ -237,10 +235,9 @@ struct TimeFrame {
 
   void setExternalAllocator(ExternalAllocator* allocator)
   {
-    if (mIsGPU) {
+    if (isGPU()) {
       LOGP(debug, "Setting timeFrame allocator to external");
       mAllocator = allocator;
-      mExtAllocator = true; // to be removed
     } else {
       LOGP(fatal, "External allocator is currently only supported for GPU");
     }
@@ -276,8 +273,6 @@ struct TimeFrame {
 
   IndexTableUtilsN mIndexTableUtils;
 
-  bool mIsGPU = false;
-
   std::array<bounded_vector<Cluster>, nLayers> mClusters;
   std::array<bounded_vector<TrackingFrameInfo>, nLayers> mTrackingFrameInfo;
   std::array<bounded_vector<int>, nLayers> mClusterExternalIndices;
@@ -296,9 +291,8 @@ struct TimeFrame {
   bounded_vector<Vertex> mPrimaryVertices;
 
   // State if memory will be externally managed.
-  bool mExtAllocator = false;
   ExternalAllocator* mAllocator = nullptr;
-  bool getExtAllocator() const { return mExtAllocator; }
+  bool getExtAllocator() const noexcept { return mAllocator != nullptr; }
 
   std::array<bounded_vector<Cluster>, nLayers> mUnsortedClusters;
   std::vector<bounded_vector<Tracklet>> mTracklets;
@@ -313,6 +307,10 @@ struct TimeFrame {
 
   virtual void wipe();
 
+  // interface
+  virtual bool isGPU() const noexcept { return false; }
+  virtual const char* getName() const noexcept { return "CPU"; }
+
  private:
   void prepareClusters(const TrackingParameters& trkParam, const int maxLayers = nLayers);
   float mBz = 5.;
@@ -348,6 +346,7 @@ struct TimeFrame {
   std::vector<std::pair<MCCompLabel, float>> mVerticesMCRecInfo;
   bounded_vector<MCCompLabel> mVerticesContributorLabels;
   std::array<uint32_t, 2> mTotalTracklets = {0, 0};
+  uint32_t mTotalLines = 0;
   unsigned int mNoVertexROF = 0;
   bounded_vector<int> mTotVertPerIteration;
   // \Vertexer
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h
index 4c445bdbbfa16..d368eb1d1f56a 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h
@@ -136,6 +136,12 @@ struct ITSGpuTrackingParamConfig : public o2::conf::ConfigurableParamHelper<ITSG
   int nBlocksTracksSeeds[MaxIter] = {60, 60, 60, 60};
   int nThreadsTracksSeeds[MaxIter] = {256, 256, 256, 256};
 
+  int nBlocksVtxComputeTracklets[2] = {60, 60};
+  int nThreadsVtxComputeTracklets[2] = {256, 256};
+
+  int nBlocksVtxComputeMatching[2] = {60, 60};
+  int nThreadsVtxComputeMatching[2] = {256, 256};
+
   O2ParamDef(ITSGpuTrackingParamConfig, "ITSGpuTrackingParam");
 };
 
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h
index 7da7617957179..dda32ddfd5aec 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h
@@ -43,11 +43,6 @@ class MCCompLabel;
 namespace its
 {
 
-enum class TrackletMode {
-  Layer0Layer1 = 0,
-  Layer1Layer2 = 2
-};
-
 template <int nLayers>
 class VertexerTraits
 {
diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
index e8212f4ad53a1..eb2e4ae690e5b 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
@@ -85,7 +85,7 @@ void Tracker<nLayers>::clustersToTracks(const LogFunc& logger, const LogFunc& er
       }
       double timeTracklets{0.}, timeCells{0.}, timeNeighbours{0.}, timeRoads{0.};
       int nTracklets{0}, nCells{0}, nNeighbours{0}, nTracks{-static_cast<int>(mTimeFrame->getNumberOfTracks())};
-      int nROFsIterations = (mTrkParams[iteration].nROFsPerIterations > 0 && !mTimeFrame->mIsGPU) ? mTimeFrame->getNrof() / mTrkParams[iteration].nROFsPerIterations + bool(mTimeFrame->getNrof() % mTrkParams[iteration].nROFsPerIterations) : 1;
+      int nROFsIterations = (mTrkParams[iteration].nROFsPerIterations > 0 && !mTimeFrame->isGPU()) ? mTimeFrame->getNrof() / mTrkParams[iteration].nROFsPerIterations + bool(mTimeFrame->getNrof() % mTrkParams[iteration].nROFsPerIterations) : 1;
       iVertex = std::min(maxNvertices, 0);
       logger(std::format("==== ITS {} Tracking iteration {} summary ====", mTraits->getName(), iteration));
 
diff --git a/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx
index 153d7b6faa358..53be03af7eb14 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx
@@ -17,6 +17,7 @@
 
 #include <oneapi/tbb/blocked_range.h>
 #include <oneapi/tbb/parallel_for.h>
+#include <oneapi/tbb/combinable.h>
 
 #include "ITStracking/VertexerTraits.h"
 #include "ITStracking/BoundedAllocator.h"
@@ -290,6 +291,7 @@ template <int nLayers>
 void VertexerTraits<nLayers>::computeTrackletMatching(const int iteration)
 {
   mTaskArena->execute([&] {
+    tbb::combinable<int> totalLines{0};
     tbb::parallel_for(
       tbb::blocked_range<short>(0, (short)mTimeFrame->getNrof()),
       [&](const tbb::blocked_range<short>& Rofs) {
@@ -333,8 +335,10 @@ void VertexerTraits<nLayers>::computeTrackletMatching(const int iteration)
                 mVrtParams[iteration].phiCut);
             }
           }
+          totalLines.local() += mTimeFrame->getLines(pivotRofId).size();
         }
       });
+    mTimeFrame->setNLinesTotal(totalLines.combine(std::plus<int>()));
   });
 
 #ifdef VTX_DEBUG
diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h
index c4202e9980d24..813e0aef2d1aa 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h
@@ -21,7 +21,7 @@
 #include "ITStracking/TimeFrame.h"
 #if defined(__CUDACC__) || defined(__HIPCC__)
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
-// #include "ITStrackingGPU/VertexerTraitsGPU.h"
+#include "ITStrackingGPU/VertexerTraitsGPU.h"
 #include "ITStrackingGPU/TimeFrameGPU.h"
 #endif
 #else
@@ -39,10 +39,10 @@ template <int32_t>
 class TimeFrame
 {
 };
-// template <int32_t NLayers>
-// class VertexerTraitsGPU : public VertexerTraits<NLayers>
-// {
-// };
+template <int32_t NLayers>
+class VertexerTraitsGPU : public VertexerTraits<NLayers>
+{
+};
 template <int32_t NLayers = 7>
 class TrackerTraitsGPU : public TrackerTraits<NLayers>
 {
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 71582b4fed55e..9e7cfa5495040 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -97,7 +97,9 @@ void GPUReconstructionCUDA::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits<
     trackerTraits->reset(new o2::its::TrackerTraitsGPU);
   }
   if (vertexerTraits) {
-    vertexerTraits->reset(new o2::its::VertexerTraits<7>); // TODO gpu-code to be implemented
+    vertexerTraits->reset(new o2::its::VertexerTraits<7>);
+    // TODO gpu-code to be implemented then remove line above and uncomment line below
+    // vertexerTraits->reset(new o2::its::VertexerTraitsGPU<7>);
   }
   if (timeFrame) {
     timeFrame->reset(new o2::its::gpu::TimeFrameGPU);
diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx
index e53f5db3a2549..26dff3710cd4a 100644
--- a/GPU/GPUTracking/Global/GPUChainITS.cxx
+++ b/GPU/GPUTracking/Global/GPUChainITS.cxx
@@ -70,7 +70,7 @@ o2::its::TimeFrame<7>* GPUChainITS::GetITSTimeframe()
     mRec->GetITSTraits(nullptr, nullptr, &mITSTimeFrame);
   }
 #if !defined(GPUCA_STANDALONE)
-  if (mITSTimeFrame->mIsGPU) {
+  if (mITSTimeFrame->isGPU()) {
     auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); };
 
     mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator);