diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Array.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Array.h
deleted file mode 100644
index f4f73e715c305..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Array.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-///
-/// \file Array.h
-/// \brief
-///
-
-#ifndef ITSTRACKINGGPU_ARRAY_H_
-#define ITSTRACKINGGPU_ARRAY_H_
-
-#include "GPUCommonDef.h"
-
-namespace o2
-{
-namespace its
-{
-namespace gpu
-{
-template <typename T, size_t Size>
-struct ArrayTraits final {
-  typedef T InternalArray[Size];
-
-  GPUhd() static constexpr T& getReference(const InternalArray& internalArray, size_t index) noexcept
-  {
-    return const_cast<T&>(internalArray[index]);
-  }
-
-  GPUhd() static constexpr T* getPointer(const InternalArray& internalArray) noexcept
-  {
-    return const_cast<T*>(internalArray);
-  }
-};
-
-template <typename T, size_t Size>
-struct Array final {
-
-  void copy(const Array<T, Size>& t)
-  {
-    memcpy(InternalArray, t.data(), Size * sizeof(T));
-  }
-
-  GPUhd() T* data() noexcept { return const_cast<T*>(InternalArray); }
-  GPUhd() const T* data() const noexcept { return const_cast<T*>(InternalArray); }
-  GPUhd() T& operator[](const int index) noexcept { return const_cast<T&>(InternalArray[index]); }
-  GPUhd() constexpr T& operator[](const int index) const noexcept { return const_cast<T&>(InternalArray[index]); }
-  GPUhd() size_t size() const noexcept { return Size; }
-
-  T InternalArray[Size];
-};
-} // namespace gpu
-} // namespace its
-} // namespace o2
-
-#endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Context.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Context.h
deleted file mode 100644
index bfc4c63756e0b..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Context.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-///
-/// \file Context.h
-/// \brief
-///
-
-#ifndef ITSTRACKINGGPU_CONTEXT_H_
-#define ITSTRACKINGGPU_CONTEXT_H_
-
-#include <string>
-#include <vector>
-#include "ITStracking/Definitions.h"
-
-namespace o2
-{
-namespace its
-{
-namespace gpu
-{
-
-struct DeviceProperties final {
-  std::string name;
-  int gpuProcessors;
-  int gpuCores;
-  long globalMemorySize;
-  long constantMemorySize;
-  long sharedMemorySize;
-  long maxClockRate;
-  int busWidth;
-  long l2CacheSize;
-  long registersPerBlock;
-  int warpSize;
-  int maxThreadsPerBlock;
-  int maxBlocksPerSM;
-  dim3 maxThreadsDim;
-  dim3 maxGridDim;
-};
-
-class Context final
-{
- public:
-  static Context& getInstance();
-
-  Context(const Context&);
-  Context& operator=(const Context&);
-
-  const DeviceProperties& getDeviceProperties();
-  const DeviceProperties& getDeviceProperties(const int);
-
- private:
-  Context(bool dumpDevices = false);
-  ~Context() = default;
-
-  int mDevicesNum;
-  std::vector<DeviceProperties> mDeviceProperties;
-};
-} // namespace gpu
-} // namespace its
-} // namespace o2
-
-#endif /* TRAKINGITSU_INCLUDE_GPU_CONTEXT_H_ */
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/PrimaryVertexContextGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/PrimaryVertexContextGPU.h
deleted file mode 100644
index a5e859475521c..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/PrimaryVertexContextGPU.h
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-///
-/// \file PrimaryVertexContextNVNV.h
-/// \brief
-///
-
-#ifndef ITSTRACKINGGPU_PRIMARYVERTEXCONTEXTGPU_H_
-#define ITSTRACKINGGPU_PRIMARYVERTEXCONTEXTGPU_H_
-
-#include <array>
-
-#include <cub/cub.cuh>
-#include <cstdint>
-
-#include "ITStracking/Configuration.h"
-#include "ITStracking/Constants.h"
-#include "ITStracking/Definitions.h"
-#include "ITStracking/PrimaryVertexContext.h"
-#include "ITStracking/Road.h"
-#include "ITStracking/Tracklet.h"
-
-#include "DeviceStoreGPU.h"
-#include "UniquePointer.h"
-
-namespace o2
-{
-namespace its
-{
-
-class PrimaryVertexContextNV final : public PrimaryVertexContext
-{
- public:
-  PrimaryVertexContextNV() = default;
-  ~PrimaryVertexContextNV() override;
-
-  void initialise(const MemoryParameters& memParam, const TrackingParameters& trkParam,
-                  const std::vector<std::vector<Cluster>>& cl, const std::array<float, 3>& pv, const int iteration) override;
-
-  gpu::DeviceStoreNV& getDeviceContext();
-  gpu::Array<gpu::Vector<Cluster>, constants::its2::LayersNumber>& getDeviceClusters();
-  gpu::Array<gpu::Vector<Tracklet>, constants::its2::TrackletsPerRoad>& getDeviceTracklets();
-  gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad>& getDeviceTrackletsLookupTable();
-  gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad>& getDeviceTrackletsPerClustersTable();
-  gpu::Array<gpu::Vector<Cell>, constants::its2::CellsPerRoad>& getDeviceCells();
-  gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad - 1>& getDeviceCellsLookupTable();
-  gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad - 1>& getDeviceCellsPerTrackletTable();
-  std::array<gpu::Vector<int>, constants::its2::CellsPerRoad>& getTempTableArray();
-  std::array<gpu::Vector<Tracklet>, constants::its2::CellsPerRoad>& getTempTrackletArray();
-  std::array<gpu::Vector<Cell>, constants::its2::CellsPerRoad - 1>& getTempCellArray();
-  void updateDeviceContext();
-
- private:
-  gpu::DeviceStoreNV mGPUContext;
-  gpu::UniquePointer<gpu::DeviceStoreNV> mGPUContextDevicePointer;
-  std::array<gpu::Vector<int>, constants::its2::CellsPerRoad> mTempTableArray;
-  std::array<gpu::Vector<Tracklet>, constants::its2::CellsPerRoad> mTempTrackletArray;
-  std::array<gpu::Vector<Cell>, constants::its2::CellsPerRoad - 1> mTempCellArray;
-};
-
-inline PrimaryVertexContextNV::~PrimaryVertexContextNV() = default;
-
-inline gpu::DeviceStoreNV& PrimaryVertexContextNV::getDeviceContext()
-{
-  return *mGPUContextDevicePointer;
-}
-
-inline gpu::Array<gpu::Vector<Cluster>, constants::its2::LayersNumber>& PrimaryVertexContextNV::getDeviceClusters()
-{
-  return mGPUContext.getClusters();
-}
-
-inline gpu::Array<gpu::Vector<Tracklet>, constants::its2::TrackletsPerRoad>& PrimaryVertexContextNV::getDeviceTracklets()
-{
-  return mGPUContext.getTracklets();
-}
-
-inline gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getDeviceTrackletsLookupTable()
-{
-  return mGPUContext.getTrackletsLookupTable();
-}
-
-inline gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad>&
-  PrimaryVertexContextNV::getDeviceTrackletsPerClustersTable()
-{
-  return mGPUContext.getTrackletsPerClusterTable();
-}
-
-inline gpu::Array<gpu::Vector<Cell>, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getDeviceCells()
-{
-  return mGPUContext.getCells();
-}
-
-inline gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad - 1>& PrimaryVertexContextNV::getDeviceCellsLookupTable()
-{
-  return mGPUContext.getCellsLookupTable();
-}
-
-inline gpu::Array<gpu::Vector<int>, constants::its2::CellsPerRoad - 1>&
-  PrimaryVertexContextNV::getDeviceCellsPerTrackletTable()
-{
-  return mGPUContext.getCellsPerTrackletTable();
-}
-
-inline std::array<gpu::Vector<int>, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getTempTableArray()
-{
-  return mTempTableArray;
-}
-
-inline std::array<gpu::Vector<Tracklet>, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getTempTrackletArray()
-{
-  return mTempTrackletArray;
-}
-
-inline std::array<gpu::Vector<Cell>, constants::its2::CellsPerRoad - 1>& PrimaryVertexContextNV::getTempCellArray()
-{
-  return mTempCellArray;
-}
-
-inline void PrimaryVertexContextNV::updateDeviceContext()
-{
-  mGPUContextDevicePointer = gpu::UniquePointer<gpu::DeviceStoreNV>{mGPUContext};
-}
-
-inline void PrimaryVertexContextNV::initialise(const MemoryParameters& memParam, const TrackingParameters& trkParam,
-                                               const std::vector<std::vector<Cluster>>& cl, const std::array<float, 3>& pv, const int iteration)
-{
-  ///TODO: to be re-enabled in the future
-  // this->PrimaryVertexContext::initialise(memParam, cl, pv, iteration);
-  // mGPUContextDevicePointer = mGPUContext.initialise(mPrimaryVertex, mClusters, mTracklets, mCells, mCellsLookupTable, mMinR, mMaxR);
-}
-
-} // namespace its
-} // namespace o2
-
-#endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h
index 20744b47cd9b5..ba36c29addc73 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h
@@ -18,16 +18,11 @@
 
 #include "ITStracking/Definitions.h"
 
-namespace o2
-{
-namespace its
-{
-namespace gpu
+namespace o2::its::gpu
 {
 
 class Stream final
 {
-
  public:
   Stream();
   ~Stream();
@@ -37,8 +32,5 @@ class Stream final
  private:
   GPUStream mStream;
 };
-} // namespace gpu
-} // namespace its
-} // namespace o2
-
-#endif /* TRAKINGITSU_INCLUDE_GPU_STREAM_H_ */
+} // namespace o2::its::gpu
+#endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h
index c477922e59533..ac489fc8bfee5 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h
@@ -17,8 +17,8 @@
 #include "ITStracking/TimeFrame.h"
 
 #include "ITStrackingGPU/ClusterLinesGPU.h"
-#include "ITStrackingGPU/Array.h"
-#include "ITStrackingGPU/Vector.h"
+// #include "ITStrackingGPU/Array.h"
+// #include "ITStrackingGPU/Vector.h"
 #include "ITStrackingGPU/Stream.h"
 
 #include <gsl/gsl>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index 100e49def0d50..1581eea6ce81a 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -17,10 +17,8 @@
 #include "ITStracking/Configuration.h"
 
 #include "ITStrackingGPU/ClusterLinesGPU.h"
-#include "ITStrackingGPU/Array.h"
-#include "ITStrackingGPU/Vector.h"
 #include "ITStrackingGPU/Stream.h"
-#include "ITStrackingGPU/TimeFrameChunk.h"
+#include "ITStrackingGPU/Utils.h"
 
 #include <gsl/gsl>
 
@@ -39,8 +37,6 @@ class DefaultGPUAllocator : public ExternalAllocator
 template <int nLayers = 7>
 class TimeFrameGPU : public TimeFrame
 {
-  friend class GpuTimeFrameChunk<nLayers>;
-
  public:
   TimeFrameGPU();
   ~TimeFrameGPU();
@@ -88,24 +84,18 @@ class TimeFrameGPU : public TimeFrame
   void downloadCellsDevice();
   void downloadCellsLUTDevice();
   void unregisterRest();
-  void initDeviceChunks(const int, const int);
   template <Task task>
-  size_t loadChunkData(const size_t, const size_t, const size_t);
-  size_t getNChunks() const { return mMemChunks.size(); }
-  GpuTimeFrameChunk<nLayers>& getChunk(const int chunk) { return mMemChunks[chunk]; }
-  Stream& getStream(const size_t stream) { return mGpuStreams[stream]; }
+  Stream& getStream(const size_t stream)
+  {
+    return mGpuStreams[stream];
+  }
   void wipe(const int);
 
   /// interface
   int getNClustersInRofSpan(const int, const int, const int) const;
   IndexTableUtils* getDeviceIndexTableUtils() { return mIndexTableUtilsDevice; }
   int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; }
-  std::vector<std::vector<Vertex>>& getVerticesInChunks() { return mVerticesInChunks; }
-  std::vector<std::vector<int>>& getNVerticesInChunks() { return mNVerticesInChunks; }
   std::vector<o2::its::TrackITSExt>& getTrackITSExt() { return mTrackITSExt; }
-  std::vector<std::vector<o2::MCCompLabel>>& getLabelsInChunks() { return mLabelsInChunks; }
-  int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones.
-  StaticTrackingParameters<nLayers>* getDeviceTrackingParameters() { return mTrackingParamsDevice; }
   Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; }
   int* getDeviceROFramesPV() { return mROFramesPVDevice; }
   unsigned char* getDeviceUsedClusters(const int);
@@ -158,16 +148,13 @@ class TimeFrameGPU : public TimeFrame
  private:
   void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
   bool mHostRegistered = false;
-  std::vector<GpuTimeFrameChunk<nLayers>> mMemChunks;
   TimeFrameGPUParameters mGpuParams;
-  StaticTrackingParameters<nLayers> mStaticTrackingParams;
 
   // Host-available device buffer sizes
   std::array<int, nLayers - 1> mNTracklets;
   std::array<int, nLayers - 2> mNCells;
 
   // Device pointers
-  StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
   IndexTableUtils* mIndexTableUtilsDevice;
 
   // Hybrid pref
@@ -216,31 +203,10 @@ class TimeFrameGPU : public TimeFrame
   size_t mAvailMemGB;
   bool mFirstInit = true;
 
-  // Output
-  std::vector<std::vector<Vertex>> mVerticesInChunks;
-  std::vector<std::vector<int>> mNVerticesInChunks;
-  std::vector<std::vector<o2::MCCompLabel>> mLabelsInChunks;
-
   // Temporary buffer for storing output tracks from GPU tracking
   std::vector<TrackITSExt> mTrackITSExt;
 };
 
-template <int nLayers>
-template <Task task>
-size_t TimeFrameGPU<nLayers>::loadChunkData(const size_t chunk, const size_t offset, const size_t maxRofs) // offset: readout frame to start from, maxRofs: to manage boundaries
-{
-  size_t nRof{0};
-
-  mMemChunks[chunk].reset(task, mGpuStreams[chunk]); // Reset chunks memory
-  if constexpr ((bool)task) {
-    nRof = mMemChunks[chunk].loadDataOnDevice(offset, maxRofs, 3, mGpuStreams[chunk]);
-  } else {
-    nRof = mMemChunks[chunk].loadDataOnDevice(offset, maxRofs, nLayers, mGpuStreams[chunk]);
-  }
-  LOGP(debug, "In chunk {}: loaded {} readout frames starting from {}", chunk, nRof, offset);
-  return nRof;
-}
-
 template <int nLayers>
 inline int TimeFrameGPU<nLayers>::getNClustersInRofSpan(const int rofIdstart, const int rofSpanSize, const int layerId) const
 {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h
index 076523261ff7e..f9583d97ca030 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h
@@ -55,7 +55,6 @@ class TrackerTraitsGPU : public TrackerTraits
  private:
   IndexTableUtils* mDeviceIndexTableUtils;
   gpu::TimeFrameGPU<7>* mTimeFrameGPU;
-  gpu::StaticTrackingParameters<nLayers>* mStaticTrkPars;
 };
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/UniquePointer.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/UniquePointer.h
deleted file mode 100644
index ce04da3dde622..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/UniquePointer.h
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-///
-/// \file UniquePointer.h
-/// \brief
-///
-
-#ifndef ITSTRACKINGGPU_UNIQUEPOINTER_H_
-#define ITSTRACKINGGPU_UNIQUEPOINTER_H_
-
-#include "Utils.h"
-
-namespace o2
-{
-namespace its
-{
-namespace gpu
-{
-
-namespace
-{
-template <typename T>
-struct UniquePointerTraits final {
-  typedef T* InternalPointer;
-
-  GPUhd() static constexpr T& getReference(const InternalPointer& internalPointer) noexcept
-  {
-    return const_cast<T&>(*internalPointer);
-  }
-
-  GPUhd() static constexpr T* getPointer(const InternalPointer& internalPointer) noexcept
-  {
-    return const_cast<T*>(internalPointer);
-  }
-};
-} // namespace
-
-template <typename T>
-class UniquePointer final
-{
-  typedef UniquePointerTraits<T> PointerTraits;
-
- public:
-  UniquePointer();
-  explicit UniquePointer(const T&);
-  ~UniquePointer();
-
-  UniquePointer(const UniquePointer&) = delete;
-  UniquePointer& operator=(const UniquePointer&) = delete;
-
-  UniquePointer(UniquePointer&&);
-  UniquePointer& operator=(UniquePointer&&);
-
-  GPUhd() T* get() noexcept;
-  GPUhd() const T* get() const noexcept;
-  GPUhd() T& operator*() noexcept;
-  GPUhd() const T& operator*() const noexcept;
-
- protected:
-  void destroy();
-
- private:
-  typename PointerTraits::InternalPointer mDevicePointer;
-};
-
-template <typename T>
-UniquePointer<T>::UniquePointer() : mDevicePointer{nullptr}
-{
-  // Nothing to do
-}
-
-template <typename T>
-UniquePointer<T>::UniquePointer(const T& ref)
-{
-  try {
-
-    utils::host::gpuMalloc(reinterpret_cast<void**>(&mDevicePointer), sizeof(T));
-    utils::host::gpuMemcpyHostToDevice(mDevicePointer, &ref, sizeof(T));
-
-  } catch (...) {
-
-    destroy();
-
-    throw;
-  }
-}
-
-template <typename T>
-UniquePointer<T>::~UniquePointer()
-{
-  destroy();
-}
-
-template <typename T>
-UniquePointer<T>::UniquePointer(UniquePointer<T>&& other) : mDevicePointer{other.mDevicePointer}
-{
-  // Nothing to do
-}
-
-template <typename T>
-UniquePointer<T>& UniquePointer<T>::operator=(UniquePointer<T>&& other)
-{
-  mDevicePointer = other.mDevicePointer;
-  other.mDevicePointer = nullptr;
-
-  return *this;
-}
-
-template <typename T>
-void UniquePointer<T>::destroy()
-{
-  if (mDevicePointer != nullptr) {
-
-    utils::host::gpuFree(mDevicePointer);
-  }
-}
-
-template <typename T>
-GPUhd() T* UniquePointer<T>::get() noexcept
-{
-  return PointerTraits::getPointer(mDevicePointer);
-}
-
-template <typename T>
-GPUhd() const T* UniquePointer<T>::get() const noexcept
-{
-  return PointerTraits::getPointer(mDevicePointer);
-}
-
-template <typename T>
-GPUhd() T& UniquePointer<T>::operator*() noexcept
-{
-  return PointerTraits::getReference(mDevicePointer);
-}
-
-template <typename T>
-GPUhd() const T& UniquePointer<T>::operator*() const noexcept
-{
-  return PointerTraits::getReference(mDevicePointer);
-}
-} // namespace gpu
-} // namespace its
-} // namespace o2
-
-#endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index a88e51742e84a..7ed70a4e0fdd9 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -31,6 +31,12 @@ struct gpuPair {
 
 namespace gpu
 {
+
+template <typename T>
+void discardResult(const T&)
+{
+}
+
 // Poor man implementation of a span-like struct. It is very limited.
 template <typename T>
 struct gpuSpan {
@@ -91,38 +97,6 @@ GPUhd() const T* getPtrFromRuler(int index, const T* src, const int* ruler, cons
 {
   return src + ruler[index] * stride;
 }
-
-GPUh() void gpuThrowOnError();
-
-namespace utils
-{
-#ifdef __CUDACC__
-void checkGPUError(const cudaError_t error, const char* file = __FILE__, const int line = __LINE__);
-#endif
-#ifdef __HIPCC__
-void checkGPUError(const hipError_t error, const char* file = __FILE__, const int line = __LINE__);
-#endif
-
-// Dump device properties
-void getDeviceProp(int, bool verbose = true);
-
-dim3 getBlockSize(const int);
-dim3 getBlockSize(const int, const int);
-dim3 getBlockSize(const int, const int, const int);
-dim3 getBlocksGrid(const dim3&, const int);
-dim3 getBlocksGrid(const dim3&, const int, const int);
-
-void gpuMalloc(void**, const int);
-void gpuFree(void*);
-void gpuMemset(void*, int, int);
-void gpuMemcpyHostToDevice(void*, const void*, int);
-void gpuMemcpyDeviceToHost(void*, const void*, int);
-void gpuMemcpyToSymbol(const void* symbol, const void* src, int size);
-void gpuMemcpyFromSymbol(void* dst, const void* symbol, int size);
-
-GPUd() int getLaneIndex();
-GPUd() int shareToWarp(const int, const int);
-} // namespace utils
 } // namespace gpu
 } // namespace its
 } // namespace o2
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Vector.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Vector.h
deleted file mode 100644
index 3912caec8449c..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Vector.h
+++ /dev/null
@@ -1,310 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-///
-/// \file Vector.h
-/// \brief
-///
-
-#ifndef ITSTRACKINGGPU_VECTOR_H_
-#define ITSTRACKINGGPU_VECTOR_H_
-
-#include <cassert>
-#include <new>
-#include <type_traits>
-#include <vector>
-
-#include "Stream.h"
-#include "Utils.h"
-
-namespace o2
-{
-namespace its
-{
-namespace gpu
-{
-
-template <typename T>
-class Vector final
-{
-  static_assert(std::is_trivially_destructible<T>::value, "Vector only supports trivially destructible objects.");
-
- public:
-  Vector();
-  explicit Vector(const size_t, const size_t = 0);
-  Vector(const T* const, const size_t, const size_t = 0);
-  GPUhd() ~Vector();
-
-  Vector(const Vector&) = delete;
-  Vector& operator=(const Vector&) = delete;
-
-  GPUhd() Vector(Vector&&);
-  Vector& operator=(Vector&&);
-
-  size_t getSizeFromDevice() const;
-
-  T getElementFromDevice(const size_t) const;
-
-  void resize(const size_t);
-  void reset(const size_t, const size_t = 0);
-  void reset(const T* const, const size_t, const size_t = 0);
-
-  void resetWithInt(const size_t, const int value = 0);
-  void copyIntoSizedVector(std::vector<T>&);
-
-  GPUhd() T* get() const;
-  GPUhd() size_t capacity() const;
-  GPUhd() Vector<T> getWeakCopy() const;
-  GPUd() T& operator[](const size_t) const;
-
-  GPUd() size_t size() const;
-  GPUhd() void dump();
-
-  template <typename... Args>
-  GPUd() void emplace(const size_t, Args&&...);
-
- protected:
-  void destroy();
-
- private:
-  GPUhd() Vector(const Vector&, const bool);
-
-  T* mArrayPtr = nullptr;
-  size_t* mDeviceSizePtr = nullptr;
-  size_t mCapacity;
-  bool mIsWeak;
-};
-
-template <typename T>
-Vector<T>::Vector() : Vector{nullptr, 0}
-{
-  // Nothing to do
-}
-
-template <typename T>
-Vector<T>::Vector(const size_t capacity, const size_t initialSize) : Vector{nullptr, capacity, initialSize}
-{
-  // Nothing to do
-}
-
-template <typename T>
-Vector<T>::Vector(const T* const source, const size_t size, const size_t initialSize) : mCapacity{size}, mIsWeak{false}
-{
-  if (size > 0) {
-    try {
-
-      utils::gpuMalloc(reinterpret_cast<void**>(&mArrayPtr), size * sizeof(T));
-      utils::gpuMalloc(reinterpret_cast<void**>(&mDeviceSizePtr), sizeof(size_t));
-
-      if (source != nullptr) {
-
-        utils::gpuMemcpyHostToDevice(mArrayPtr, source, size * sizeof(T));
-        utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(size_t));
-
-      } else {
-
-        utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &initialSize, sizeof(size_t));
-      }
-
-    } catch (...) {
-
-      destroy();
-
-      throw;
-    }
-  }
-}
-
-template <typename T>
-GPUhd() Vector<T>::Vector(const Vector& other, const bool isWeak)
-  : mArrayPtr{other.mArrayPtr},
-    mDeviceSizePtr{other.mDeviceSizePtr},
-    mCapacity{other.mCapacity},
-    mIsWeak{isWeak}
-{
-  // Nothing to do
-}
-
-template <typename T>
-GPUhd() Vector<T>::~Vector()
-{
-  if (mIsWeak) {
-    return;
-  } else {
-#if defined(TRACKINGITSU_GPU_DEVICE)
-    assert(0);
-#else
-    destroy();
-#endif
-  }
-}
-
-template <typename T>
-GPUhd() Vector<T>::Vector(Vector<T>&& other)
-  : mArrayPtr{other.mArrayPtr},
-    mDeviceSizePtr{other.mDeviceSizePtr},
-    mCapacity{other.mCapacity},
-    mIsWeak{other.mIsWeak}
-{
-  other.mArrayPtr = nullptr;
-  other.mDeviceSizePtr = nullptr;
-}
-
-template <typename T>
-Vector<T>& Vector<T>::operator=(Vector<T>&& other)
-{
-  destroy();
-
-  mArrayPtr = other.mArrayPtr;
-  mDeviceSizePtr = other.mDeviceSizePtr;
-  mCapacity = other.mCapacity;
-  mIsWeak = other.mIsWeak;
-
-  other.mArrayPtr = nullptr;
-  other.mDeviceSizePtr = nullptr;
-
-  return *this;
-}
-
-template <typename T>
-size_t Vector<T>::getSizeFromDevice() const
-{
-  size_t size;
-  utils::gpuMemcpyDeviceToHost(&size, mDeviceSizePtr, sizeof(size_t));
-
-  return size;
-}
-
-template <typename T>
-void Vector<T>::resize(const size_t size)
-{
-  utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(size_t));
-}
-
-template <typename T>
-void Vector<T>::reset(const size_t capacity, const size_t initialSize)
-{
-  reset(nullptr, capacity, initialSize);
-}
-
-template <typename T>
-void Vector<T>::reset(const T* const source, const size_t size, const size_t initialSize)
-{
-  if (size > mCapacity) {
-    if (mArrayPtr != nullptr) {
-      utils::gpuFree(mArrayPtr);
-    }
-    utils::gpuMalloc(reinterpret_cast<void**>(&mArrayPtr), size * sizeof(T));
-    mCapacity = size;
-  }
-  if (mDeviceSizePtr == nullptr) {
-    utils::gpuMalloc(reinterpret_cast<void**>(&mDeviceSizePtr), sizeof(size_t));
-  }
-
-  if (source != nullptr) {
-    utils::gpuMemcpyHostToDevice(mArrayPtr, source, size * sizeof(T));
-    utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(size_t));
-  } else {
-    utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &initialSize, sizeof(size_t));
-  }
-}
-
-template <typename T>
-void Vector<T>::resetWithInt(const size_t size, const int value)
-{
-  if (size > mCapacity) {
-    if (mArrayPtr != nullptr) {
-      utils::gpuFree(mArrayPtr);
-    }
-    utils::gpuMalloc(reinterpret_cast<void**>(&mArrayPtr), size * sizeof(int));
-    mCapacity = size;
-  }
-  if (mDeviceSizePtr == nullptr) {
-    utils::gpuMalloc(reinterpret_cast<void**>(&mDeviceSizePtr), sizeof(int));
-  }
-
-  utils::gpuMemset(mArrayPtr, value, size * sizeof(int));
-  utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(int));
-}
-
-template <typename T>
-void Vector<T>::copyIntoSizedVector(std::vector<T>& destinationVector)
-{
-  utils::gpuMemcpyDeviceToHost(destinationVector.data(), mArrayPtr, destinationVector.size() * sizeof(T));
-}
-
-template <typename T>
-inline void Vector<T>::destroy()
-{
-  if (mArrayPtr != nullptr) {
-    utils::gpuFree(mArrayPtr);
-  }
-  if (mDeviceSizePtr != nullptr) {
-    utils::gpuFree(mDeviceSizePtr);
-  }
-}
-
-template <typename T>
-GPUhd() T* Vector<T>::get() const
-{
-  return mArrayPtr;
-}
-
-template <typename T>
-GPUhd() size_t Vector<T>::capacity() const
-{
-  return mCapacity;
-}
-
-template <typename T>
-GPUhd() Vector<T> Vector<T>::getWeakCopy() const
-{
-  return Vector{*this, true};
-}
-
-template <typename T>
-GPUd() T& Vector<T>::operator[](const size_t index) const
-{
-  return mArrayPtr[index];
-}
-
-template <typename T>
-GPUh() T Vector<T>::getElementFromDevice(const size_t index) const
-{
-  T element;
-  utils::gpuMemcpyDeviceToHost(&element, mArrayPtr + index, sizeof(T));
-
-  return element;
-}
-
-template <typename T>
-GPUd() size_t Vector<T>::size() const
-{
-  return *mDeviceSizePtr;
-}
-
-template <typename T>
-template <typename... Args>
-GPUd() void Vector<T>::emplace(const size_t index, Args&&... arguments)
-{
-  new (mArrayPtr + index) T(std::forward<Args>(arguments)...);
-}
-
-template <typename T>
-GPUhd() void Vector<T>::dump()
-{
-  printf("mArrayPtr = %p\nmDeviceSize   = %p\nmCapacity     = %d\nmIsWeak       = %s\n",
-         mArrayPtr, mDeviceSizePtr, mCapacity, mIsWeak ? "true" : "false");
-}
-} // namespace gpu
-} // namespace its
-} // namespace o2
-
-#endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
index 531cf2b0dcd33..56f6e57e2f305 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
@@ -17,16 +17,13 @@ message(STATUS "Building ITS CUDA tracker")
 # add_compile_definitions(ITS_MEASURE_GPU_TIME)
 o2_add_library(ITStrackingCUDA
                SOURCES ClusterLinesGPU.cu
-                       Context.cu
                        Stream.cu
                        TrackerTraitsGPU.cxx
                        TimeFrameGPU.cu
-                       TimeFrameChunk.cu
                        TracerGPU.cu
                        TrackingKernels.cu
                        VertexingKernels.cu
                        VertexerTraitsGPU.cxx
-                       Utils.cu
                PUBLIC_INCLUDE_DIRECTORIES ../
                PUBLIC_LINK_LIBRARIES O2::ITStracking
                                      O2::SimConfig
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Context.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Context.cu
deleted file mode 100644
index f3bced9463020..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Context.cu
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-#include <cuda_runtime.h>
-#include "ITStrackingGPU/Context.h"
-#include "ITStrackingGPU/Utils.h"
-
-#include <sstream>
-#include <stdexcept>
-#include <iostream>
-
-namespace o2
-{
-namespace its
-{
-namespace gpu
-{
-
-using utils::checkGPUError;
-
-Context::Context(bool dumpDevices)
-{
-  checkGPUError(cudaGetDeviceCount(&mDevicesNum), __FILE__, __LINE__);
-
-  if (mDevicesNum == 0) {
-    throw std::runtime_error{"There are no available GPU device(s)\n"};
-  }
-
-  mDeviceProperties.resize(mDevicesNum, DeviceProperties{});
-
-  int currentDeviceIndex;
-  checkGPUError(cudaGetDevice(&currentDeviceIndex), __FILE__, __LINE__);
-
-  for (int iDevice{0}; iDevice < mDevicesNum; ++iDevice) {
-
-    cudaDeviceProp deviceProperties;
-
-    checkGPUError(cudaSetDevice(iDevice), __FILE__, __LINE__);
-    checkGPUError(cudaGetDeviceProperties(&deviceProperties, iDevice), __FILE__, __LINE__);
-
-    int major = deviceProperties.major;
-    int minor = deviceProperties.minor;
-
-    mDeviceProperties[iDevice].name = deviceProperties.name;
-    mDeviceProperties[iDevice].gpuProcessors = deviceProperties.multiProcessorCount;
-    mDeviceProperties[iDevice].gpuCores = getGPUCores(major, minor) * deviceProperties.multiProcessorCount;
-    mDeviceProperties[iDevice].globalMemorySize = deviceProperties.totalGlobalMem;
-    mDeviceProperties[iDevice].constantMemorySize = deviceProperties.totalConstMem;
-    mDeviceProperties[iDevice].sharedMemorySize = deviceProperties.sharedMemPerBlock;
-    mDeviceProperties[iDevice].maxClockRate = deviceProperties.memoryClockRate;
-    mDeviceProperties[iDevice].busWidth = deviceProperties.memoryBusWidth;
-    mDeviceProperties[iDevice].l2CacheSize = deviceProperties.l2CacheSize;
-    mDeviceProperties[iDevice].registersPerBlock = deviceProperties.regsPerBlock;
-    mDeviceProperties[iDevice].warpSize = deviceProperties.warpSize;
-    mDeviceProperties[iDevice].maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
-    mDeviceProperties[iDevice].maxBlocksPerSM = getGPUMaxThreadsPerComputingUnit();
-    mDeviceProperties[iDevice].maxThreadsDim = dim3{static_cast<unsigned int>(deviceProperties.maxThreadsDim[0]),
-                                                    static_cast<unsigned int>(deviceProperties.maxThreadsDim[1]),
-                                                    static_cast<unsigned int>(deviceProperties.maxThreadsDim[2])};
-    mDeviceProperties[iDevice].maxGridDim = dim3{static_cast<unsigned int>(deviceProperties.maxGridSize[0]),
-                                                 static_cast<unsigned int>(deviceProperties.maxGridSize[1]),
-                                                 static_cast<unsigned int>(deviceProperties.maxGridSize[2])};
-    if (dumpDevices) {
-      std::cout << "################ " << GPU_ARCH << " DEVICE " << iDevice << " ################" << std::endl;
-      std::cout << "Name " << mDeviceProperties[iDevice].name << std::endl;
-      std::cout << "minor " << minor << " major " << major << std::endl;
-      std::cout << "gpuProcessors " << mDeviceProperties[iDevice].gpuProcessors << std::endl;
-      std::cout << "gpuCores " << mDeviceProperties[iDevice].gpuCores << std::endl;
-      std::cout << "globalMemorySize " << mDeviceProperties[iDevice].globalMemorySize << std::endl;
-      std::cout << "constantMemorySize " << mDeviceProperties[iDevice].constantMemorySize << std::endl;
-      std::cout << "sharedMemorySize " << mDeviceProperties[iDevice].sharedMemorySize << std::endl;
-      std::cout << "maxClockRate " << mDeviceProperties[iDevice].maxClockRate << std::endl;
-      std::cout << "busWidth " << mDeviceProperties[iDevice].busWidth << std::endl;
-      std::cout << "l2CacheSize " << mDeviceProperties[iDevice].l2CacheSize << std::endl;
-      std::cout << "registersPerBlock " << mDeviceProperties[iDevice].registersPerBlock << std::endl;
-      std::cout << "warpSize " << mDeviceProperties[iDevice].warpSize << std::endl;
-      std::cout << "maxThreadsPerBlock " << mDeviceProperties[iDevice].maxThreadsPerBlock << std::endl;
-      std::cout << "maxBlocksPerSM " << mDeviceProperties[iDevice].maxBlocksPerSM << std::endl;
-      std::cout << "maxThreadsDim " << mDeviceProperties[iDevice].maxThreadsDim.x << ", "
-                << mDeviceProperties[iDevice].maxThreadsDim.y << ", "
-                << mDeviceProperties[iDevice].maxThreadsDim.z << std::endl;
-      std::cout << "maxGridDim " << mDeviceProperties[iDevice].maxGridDim.x << ", "
-                << mDeviceProperties[iDevice].maxGridDim.y << ", "
-                << mDeviceProperties[iDevice].maxGridDim.z << std::endl;
-      std::cout << std::endl;
-    }
-  }
-
-  checkGPUError(cudaSetDevice(currentDeviceIndex), __FILE__, __LINE__);
-}
-
-Context& Context::getInstance()
-{
-  static Context gpuContext;
-  return gpuContext;
-}
-
-const DeviceProperties& Context::getDeviceProperties()
-{
-  int currentDeviceIndex;
-  checkGPUError(cudaGetDevice(&currentDeviceIndex), __FILE__, __LINE__);
-
-  return getDeviceProperties(currentDeviceIndex);
-}
-
-const DeviceProperties& Context::getDeviceProperties(const int deviceIndex)
-{
-  return mDeviceProperties[deviceIndex];
-}
-
-} // namespace gpu
-} // namespace its
-} // namespace o2
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu
index 885587d8d4544..34bc3dc68a7a4 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu
@@ -10,10 +10,8 @@
 // or submit itself to any jurisdiction.
 ///
 
-#include <cuda_runtime.h>
 #include "ITStrackingGPU/Stream.h"
-#include "ITStrackingGPU/Utils.h"
-#include "GPUCommonLogger.h"
+#include "GPUCommonHelpers.h"
 
 namespace o2
 {
@@ -21,16 +19,15 @@ namespace its
 {
 namespace gpu
 {
-using utils::checkGPUError;
 
 Stream::Stream()
 {
-  checkGPUError(cudaStreamCreate(&mStream));
+  GPUChkErrS(cudaStreamCreate(&mStream));
 }
+
 Stream::~Stream()
 {
-  LOGP(info, "Destroying stream");
-  checkGPUError(cudaStreamDestroy(mStream));
+  GPUChkErrS(cudaStreamDestroy(mStream));
 }
 
 const GPUStream& Stream::get() const
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu
index 8353b6ff0aa8b..c8512e667aea8 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu
@@ -26,6 +26,7 @@
 #include "GPUCommonDef.h"
 #include "GPUCommonMath.h"
 #include "GPUCommonLogger.h"
+#include "GPUCommonHelpers.h"
 
 #ifndef __HIPCC__
 #define THRUST_NAMESPACE thrust::cuda
@@ -39,38 +40,37 @@ using constants::GB;
 using constants::MB;
 namespace gpu
 {
-using utils::checkGPUError;
 
 template <int nLayers>
 GpuTimeFrameChunk<nLayers>::~GpuTimeFrameChunk()
 {
   if (mAllocated) {
     for (int i = 0; i < nLayers; ++i) {
-      checkGPUError(cudaFree(mClustersDevice[i]));
-      // checkGPUError(cudaFree(mTrackingFrameInfoDevice[i]));
-      checkGPUError(cudaFree(mClusterExternalIndicesDevice[i]));
-      checkGPUError(cudaFree(mIndexTablesDevice[i]));
+      GPUChkErrS(cudaFree(mClustersDevice[i]));
+      // GPUChkErrS(cudaFree(mTrackingFrameInfoDevice[i]));
+      GPUChkErrS(cudaFree(mClusterExternalIndicesDevice[i]));
+      GPUChkErrS(cudaFree(mIndexTablesDevice[i]));
       if (i < nLayers - 1) {
-        checkGPUError(cudaFree(mTrackletsDevice[i]));
-        checkGPUError(cudaFree(mTrackletsLookupTablesDevice[i]));
+        GPUChkErrS(cudaFree(mTrackletsDevice[i]));
+        GPUChkErrS(cudaFree(mTrackletsLookupTablesDevice[i]));
         if (i < nLayers - 2) {
-          checkGPUError(cudaFree(mCellsDevice[i]));
-          checkGPUError(cudaFree(mCellsLookupTablesDevice[i]));
-          checkGPUError(cudaFree(mRoadsLookupTablesDevice[i]));
+          GPUChkErrS(cudaFree(mCellsDevice[i]));
+          GPUChkErrS(cudaFree(mCellsLookupTablesDevice[i]));
+          GPUChkErrS(cudaFree(mRoadsLookupTablesDevice[i]));
           if (i < nLayers - 3) {
-            checkGPUError(cudaFree(mNeighboursCellLookupTablesDevice[i]));
-            checkGPUError(cudaFree(mNeighboursCellDevice[i]));
+            GPUChkErrS(cudaFree(mNeighboursCellLookupTablesDevice[i]));
+            GPUChkErrS(cudaFree(mNeighboursCellDevice[i]));
           }
         }
       }
     }
-    // checkGPUError(cudaFree(mRoadsDevice));
-    checkGPUError(cudaFree(mCUBTmpBufferDevice));
-    checkGPUError(cudaFree(mFoundTrackletsDevice));
-    checkGPUError(cudaFree(mNFoundCellsDevice));
-    checkGPUError(cudaFree(mCellsDeviceArray));
-    checkGPUError(cudaFree(mNeighboursCellDeviceArray));
-    checkGPUError(cudaFree(mNeighboursCellLookupTablesDeviceArray));
+    // GPUChkErrS(cudaFree(mRoadsDevice));
+    GPUChkErrS(cudaFree(mCUBTmpBufferDevice));
+    GPUChkErrS(cudaFree(mFoundTrackletsDevice));
+    GPUChkErrS(cudaFree(mNFoundCellsDevice));
+    GPUChkErrS(cudaFree(mCellsDeviceArray));
+    GPUChkErrS(cudaFree(mNeighboursCellDeviceArray));
+    GPUChkErrS(cudaFree(mNeighboursCellLookupTablesDeviceArray));
   }
 }
 
@@ -117,9 +117,9 @@ void GpuTimeFrameChunk<nLayers>::allocate(const size_t nrof, Stream& stream)
   // static_cast<TimeFrameGPU<nLayers>*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast<void**>(&mNeighboursCellLookupTablesDeviceArray), (nLayers - 3) * sizeof(int*), &stream, true);
 
   // /// Copy pointers of allocated memory to regrouping arrays
-  // checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, stream.get()));
-  // checkGPUError(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));
-  // checkGPUError(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));
+  // GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, stream.get()));
+  // GPUChkErrS(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));
+  // GPUChkErrS(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));
 
   mAllocated = true;
 }
@@ -133,28 +133,28 @@ void GpuTimeFrameChunk<nLayers>::reset(const Task task, Stream& stream)
   //     auto thrustTrackletsBegin = thrust::device_ptr<Tracklet>(mTrackletsDevice[i]);
   //     auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof;
   //     thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{});
-  //     checkGPUError(cudaMemsetAsync(mNTrackletsPerClusterDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get()));
+  //     GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get()));
   //   }
-  //   checkGPUError(cudaMemsetAsync(mUsedTrackletsDevice, false, sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get()));
-  //   checkGPUError(cudaMemsetAsync(mClusteredLinesDevice, -1, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * mNRof, stream.get()));
+  //   GPUChkErrS(cudaMemsetAsync(mUsedTrackletsDevice, false, sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get()));
+  //   GPUChkErrS(cudaMemsetAsync(mClusteredLinesDevice, -1, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * mNRof, stream.get()));
   // } else {
   //   for (int i = 0; i < nLayers; ++i) {
   //     if (i < nLayers - 1) {
-  //       checkGPUError(cudaMemsetAsync(mTrackletsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get()));
+  //       GPUChkErrS(cudaMemsetAsync(mTrackletsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get()));
   //       auto thrustTrackletsBegin = thrust::device_ptr<Tracklet>(mTrackletsDevice[i]);
   //       auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof;
   //       thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{});
   //       if (i < nLayers - 2) {
-  //         checkGPUError(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get()));
-  //         checkGPUError(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
+  //         GPUChkErrS(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get()));
+  //         GPUChkErrS(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
   //         if (i < nLayers - 3) {
-  //           checkGPUError(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
-  //           checkGPUError(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
+  //           GPUChkErrS(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
+  //           GPUChkErrS(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
   //         }
   //       }
   //     }
   //   }
-  //   checkGPUError(cudaMemsetAsync(mNFoundCellsDevice, 0, (nLayers - 2) * sizeof(int), stream.get()));
+  //   GPUChkErrS(cudaMemsetAsync(mNFoundCellsDevice, 0, (nLayers - 2) * sizeof(int), stream.get()));
   // }
 }
 
@@ -275,12 +275,12 @@ size_t GpuTimeFrameChunk<nLayers>::loadDataOnDevice(const size_t startRof, const
   //   if (mHostClusters[i].size() > mTFGPUParams->clustersPerROfCapacity * nRofs) {
   //     LOGP(warning, "Clusters on layer {} exceed the expected value, resizing to config value: {}, will lose information!", i, mTFGPUParams->clustersPerROfCapacity * nRofs);
   //   }
-  //   checkGPUError(cudaMemcpyAsync(mClustersDevice[i],
+  //   GPUChkErrS(cudaMemcpyAsync(mClustersDevice[i],
   //                                 mHostClusters[i].data(),
   //                                 (int)std::min(mHostClusters[i].size(), mTFGPUParams->clustersPerROfCapacity * nRofs) * sizeof(Cluster),
   //                                 cudaMemcpyHostToDevice, stream.get()));
   //   if (mHostIndexTables[i].data()) {
-  //     checkGPUError(cudaMemcpyAsync(mIndexTablesDevice[i],
+  //     GPUChkErrS(cudaMemcpyAsync(mIndexTablesDevice[i],
   //                                   mHostIndexTables[i].data(),
   //                                   mHostIndexTables[i].size() * sizeof(int),
   //                                   cudaMemcpyHostToDevice, stream.get()));
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index b1aa55f533c34..55d523dcc30d0 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -26,23 +26,24 @@
 #include "GPUCommonDef.h"
 #include "GPUCommonMath.h"
 #include "GPUCommonLogger.h"
+#include "GPUCommonHelpers.h"
 
 #ifdef ITS_MEASURE_GPU_TIME
-#define START_GPU_STREAM_TIMER(stream, name)           \
-  cudaEvent_t event_start, event_stop;                 \
-  checkGPUError(cudaEventCreate(&event_start));        \
-  checkGPUError(cudaEventCreate(&event_stop));         \
-  checkGPUError(cudaEventRecord(event_start, stream)); \
+#define START_GPU_STREAM_TIMER(stream, name)        \
+  cudaEvent_t event_start, event_stop;              \
+  GPUChkErrS(cudaEventCreate(&event_start));        \
+  GPUChkErrS(cudaEventCreate(&event_stop));         \
+  GPUChkErrS(cudaEventRecord(event_start, stream)); \
   const std::string task_name = name;
 
 #define STOP_GPU_STREAM_TIMER(stream)                                                \
-  checkGPUError(cudaEventRecord(event_stop, stream));                                \
-  checkGPUError(cudaEventSynchronize(event_stop));                                   \
+  GPUChkErrS(cudaEventRecord(event_stop, stream));                                   \
+  GPUChkErrS(cudaEventSynchronize(event_stop));                                      \
   float ms;                                                                          \
-  checkGPUError(cudaEventElapsedTime(&ms, event_start, event_stop));                 \
+  GPUChkErrS(cudaEventElapsedTime(&ms, event_start, event_stop));                    \
   std::cout << "Elapsed time for " << task_name << ": " << ms << " ms" << std::endl; \
-  checkGPUError(cudaEventDestroy(event_start));                                      \
-  checkGPUError(cudaEventDestroy(event_stop));
+  GPUChkErrS(cudaEventDestroy(event_start));                                         \
+  GPUChkErrS(cudaEventDestroy(event_stop));
 #else
 #define START_GPU_STREAM_TIMER(stream, name)
 #define STOP_GPU_STREAM_TIMER(stream)
@@ -57,7 +58,6 @@ using constants::MB;
 
 namespace gpu
 {
-using utils::checkGPUError;
 
 void* DefaultGPUAllocator::allocate(size_t size)
 {
@@ -69,7 +69,6 @@ template <int nLayers>
 TimeFrameGPU<nLayers>::TimeFrameGPU()
 {
   mIsGPU = true;
-  utils::getDeviceProp(0, true);
 }
 
 template <int nLayers>
@@ -82,7 +81,7 @@ void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream* strPt
     *ptr = mAllocator->allocate(size);
   } else {
     LOGP(debug, "Calling default CUDA allocator");
-    checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(ptr), size, strPtr->get()));
+    GPUChkErrS(cudaMallocAsync(reinterpret_cast<void**>(ptr), size, strPtr->get()));
   }
 }
 
@@ -101,7 +100,7 @@ void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
     allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator());
   }
   LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
-  checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -113,12 +112,12 @@ void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", mUnsortedClusters[iLayer].size(), iLayer, mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[iLayer]), mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator());
-      checkGPUError(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
-      checkGPUError(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+      GPUChkErrS(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
+      GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator());
-    checkGPUError(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
+    GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -131,12 +130,12 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", mClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(Cluster) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[iLayer]), mClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator());
-      checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
-      checkGPUError(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+      GPUChkErrS(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
+      GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator());
-    checkGPUError(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
+    GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -149,10 +148,10 @@ void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
-      checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+      GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -165,10 +164,10 @@ void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator());
-      checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
+      GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -179,7 +178,7 @@ void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags");
   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB);
-    checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -192,10 +191,10 @@ void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
-      checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+      GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -208,12 +207,12 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator());
-      checkGPUError(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable));
-      checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+      GPUChkErrS(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable));
+      GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
     allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, getExtAllocator());
-    checkGPUError(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
+    GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -225,7 +224,7 @@ void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
     LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -237,10 +236,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
     LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
@@ -254,11 +253,11 @@ void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration)
       LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator());
     }
-    checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
   }
   if (!iteration) {
     allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, getExtAllocator());
-    checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -269,13 +268,13 @@ void TimeFrameGPU<nLayers>::createTrackletsBuffers()
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     mNTracklets[iLayer] = 0;
-    checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost));
+    GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost));
     LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator());
   }
   allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -285,8 +284,8 @@ void TimeFrameGPU<nLayers>::loadTrackletsDevice()
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB);
-    checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -297,11 +296,11 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer + 1, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB);
-    checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
+    GPUChkErrS(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
   }
-  checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
+  GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -311,12 +310,12 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
   // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
   allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
-    checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -327,7 +326,7 @@ void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const uns
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
   LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
-  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -339,11 +338,11 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
     LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator());
     allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mCells[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding.
-    checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
-    checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
-  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -354,10 +353,10 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice()
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
-    checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator());
-  checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -366,7 +365,7 @@ void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
   mNCells[layer] = 0;
-  checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
+  GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
   LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator());
 
@@ -379,8 +378,8 @@ void TimeFrameGPU<nLayers>::loadCellsLUTDevice()
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs");
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB);
-    checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
+    GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -390,8 +389,8 @@ void TimeFrameGPU<nLayers>::loadRoadsDevice()
 {
   LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", mRoads.size(), mRoads.size() * sizeof(Road<nLayers - 2>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mRoadsDevice), mRoads.size() * sizeof(Road<nLayers - 2>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaHostRegister(mRoads.data(), mRoads.size() * sizeof(Road<nLayers - 2>), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mRoadsDevice, mRoads.data(), mRoads.size() * sizeof(Road<nLayers - 2>), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaHostRegister(mRoads.data(), mRoads.size() * sizeof(Road<nLayers - 2>), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, mRoads.data(), mRoads.size() * sizeof(Road<nLayers - 2>), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
@@ -400,8 +399,8 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(std::vector<CellSeed>& seeds)
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds");
   LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -411,7 +410,7 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, con
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
   LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
@@ -425,7 +424,7 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std
   mCellsNeighbours[layer].resize(neighbours.size());
   LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
@@ -436,7 +435,7 @@ void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -448,8 +447,8 @@ void TimeFrameGPU<nLayers>::createTrackITSExtDevice(std::vector<CellSeed>& seeds
   mTrackITSExt.resize(seeds.size());
   LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get()));
-  checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
+  GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get()));
+  GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -460,7 +459,7 @@ void TimeFrameGPU<nLayers>::downloadCellsDevice()
   for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
     mCells[iLayer].resize(mNCells[iLayer]);
-    checkGPUError(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -472,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
     LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
     mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
-    checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -483,7 +482,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
   // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
-  checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
@@ -491,7 +490,7 @@ void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(std::vector<int>& lut, c
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB);
-  checkGPUError(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -500,9 +499,9 @@ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(std::vector<CellSeed>& see
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks");
   LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB);
-  checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
-  checkGPUError(cudaHostUnregister(mTrackITSExt.data()));
-  checkGPUError(cudaHostUnregister(seeds.data()));
+  GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
+  GPUChkErrS(cudaHostUnregister(seeds.data()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -511,8 +510,8 @@ void TimeFrameGPU<nLayers>::unregisterRest()
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory");
   LOGP(debug, "unregistering rest of the host memory...");
-  checkGPUError(cudaHostUnregister(mCellsDevice.data()));
-  checkGPUError(cudaHostUnregister(mTrackletsDevice.data()));
+  GPUChkErrS(cudaHostUnregister(mCellsDevice.data()));
+  GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -520,13 +519,13 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
 {
   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-    checkGPUError(cudaHostUnregister(mUnsortedClusters[iLayer].data()));
-    checkGPUError(cudaHostUnregister(mClusters[iLayer].data()));
-    checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data()));
+    GPUChkErrS(cudaHostUnregister(mUnsortedClusters[iLayer].data()));
+    GPUChkErrS(cudaHostUnregister(mClusters[iLayer].data()));
+    GPUChkErrS(cudaHostUnregister(mTrackingFrameInfo[iLayer].data()));
   }
-  checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data()));
-  checkGPUError(cudaHostUnregister(mUnsortedClustersDevice.data()));
-  checkGPUError(cudaHostUnregister(mClustersDevice.data()));
+  GPUChkErrS(cudaHostUnregister(mTrackingFrameInfoDevice.data()));
+  GPUChkErrS(cudaHostUnregister(mUnsortedClustersDevice.data()));
+  GPUChkErrS(cudaHostUnregister(mClustersDevice.data()));
 }
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 583452d0c429c..805e66675e1b9 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -36,6 +36,9 @@
 
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
 #include "ITStrackingGPU/TrackingKernels.h"
+#include "ITStrackingGPU/Utils.h"
+
+#include "GPUCommonHelpers.h"
 
 #ifndef __HIPCC__
 #define THRUST_NAMESPACE thrust::cuda
@@ -51,20 +54,6 @@
 #include "DetectorsBase/Propagator.h"
 using namespace o2::track;
 
-#define gpuCheckError(x)                \
-  {                                     \
-    gpuAssert((x), __FILE__, __LINE__); \
-  }
-inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
-{
-  if (code != cudaSuccess) {
-    LOGF(error, "GPUassert: %s %s %d", cudaGetErrorString(code), file, line);
-    if (abort) {
-      throw std::runtime_error("GPU assert failed.");
-    }
-  }
-}
-
 namespace o2::its
 {
 using namespace constants::its2;
@@ -873,20 +862,20 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils,
       mulScatAng[iLayer]);
     void* d_temp_storage = nullptr;
     size_t temp_storage_bytes = 0;
-    gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
-                                                temp_storage_bytes,        // temp_storage_bytes
-                                                trackletsLUTsHost[iLayer], // d_in
-                                                trackletsLUTsHost[iLayer], // d_out
-                                                nClusters[iLayer] + 1,     // num_items
-                                                0));                       // NOLINT: this is the offset of the sum, not a pointer
-    discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
-                                                temp_storage_bytes,        // temp_storage_bytes
-                                                trackletsLUTsHost[iLayer], // d_in
-                                                trackletsLUTsHost[iLayer], // d_out
-                                                nClusters[iLayer] + 1,     // num_items
-                                                0));                       // NOLINT: this is the offset of the sum, not a pointer
-    gpuCheckError(cudaFree(d_temp_storage));
+    GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                             temp_storage_bytes,        // temp_storage_bytes
+                                             trackletsLUTsHost[iLayer], // d_in
+                                             trackletsLUTsHost[iLayer], // d_out
+                                             nClusters[iLayer] + 1,     // num_items
+                                             0));                       // NOLINT: this is the offset of the sum, not a pointer
+    GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+    GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                             temp_storage_bytes,        // temp_storage_bytes
+                                             trackletsLUTsHost[iLayer], // d_in
+                                             trackletsLUTsHost[iLayer], // d_out
+                                             nClusters[iLayer] + 1,     // num_items
+                                             0));                       // NOLINT: this is the offset of the sum, not a pointer
+    GPUChkErrS(cudaFree(d_temp_storage));
   }
 }
 
@@ -955,24 +944,24 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
     auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
     nTracklets[iLayer] = unique_end - tracklets_ptr;
     if (iLayer > 0) {
-      gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int)));
+      GPUChkErrS(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int)));
       gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]);
       void* d_temp_storage = nullptr;
       size_t temp_storage_bytes = 0;
-      gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
-                                                  temp_storage_bytes,        // temp_storage_bytes
-                                                  trackletsLUTsHost[iLayer], // d_in
-                                                  trackletsLUTsHost[iLayer], // d_out
-                                                  nClusters[iLayer] + 1,     // num_items
-                                                  0));                       // NOLINT: this is the offset of the sum, not a pointer
-      discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-      gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
-                                                  temp_storage_bytes,        // temp_storage_bytes
-                                                  trackletsLUTsHost[iLayer], // d_in
-                                                  trackletsLUTsHost[iLayer], // d_out
-                                                  nClusters[iLayer] + 1,     // num_items
-                                                  0));                       // NOLINT: this is the offset of the sum, not a pointer
-      gpuCheckError(cudaFree(d_temp_storage));
+      GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                               temp_storage_bytes,        // temp_storage_bytes
+                                               trackletsLUTsHost[iLayer], // d_in
+                                               trackletsLUTsHost[iLayer], // d_out
+                                               nClusters[iLayer] + 1,     // num_items
+                                               0));                       // NOLINT: this is the offset of the sum, not a pointer
+      GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+      GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                               temp_storage_bytes,        // temp_storage_bytes
+                                               trackletsLUTsHost[iLayer], // d_in
+                                               trackletsLUTsHost[iLayer], // d_out
+                                               nClusters[iLayer] + 1,     // num_items
+                                               0));                       // NOLINT: this is the offset of the sum, not a pointer
+      GPUChkErrS(cudaFree(d_temp_storage));
     }
   }
 }
@@ -1011,20 +1000,20 @@ void countCellsHandler(
     nSigmaCut);               // const float
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,     // d_temp_storage
-                                              temp_storage_bytes, // temp_storage_bytes
-                                              cellsLUTsHost,      // d_in
-                                              cellsLUTsHost,      // d_out
-                                              nTracklets + 1,     // num_items
-                                              0));                // NOLINT: this is the offset of the sum, not a pointer
-  discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,     // d_temp_storage
-                                              temp_storage_bytes, // temp_storage_bytes
-                                              cellsLUTsHost,      // d_in
-                                              cellsLUTsHost,      // d_out
-                                              nTracklets + 1,     // num_items
-                                              0));                // NOLINT: this is the offset of the sum, not a pointer
-  gpuCheckError(cudaFree(d_temp_storage));
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,     // d_temp_storage
+                                           temp_storage_bytes, // temp_storage_bytes
+                                           cellsLUTsHost,      // d_in
+                                           cellsLUTsHost,      // d_out
+                                           nTracklets + 1,     // num_items
+                                           0));                // NOLINT: this is the offset of the sum, not a pointer
+  GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,     // d_temp_storage
+                                           temp_storage_bytes, // temp_storage_bytes
+                                           cellsLUTsHost,      // d_in
+                                           cellsLUTsHost,      // d_out
+                                           nTracklets + 1,     // num_items
+                                           0));                // NOLINT: this is the offset of the sum, not a pointer
+  GPUChkErrS(cudaFree(d_temp_storage));
 }
 
 void computeCellsHandler(
@@ -1089,37 +1078,37 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
 
   void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
   size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
-  gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
-                                              temp_storage_bytes, // temp_storage_bytes
-                                              neighboursLUT,      // d_in
-                                              neighboursLUT,      // d_out
-                                              nCellsNext));       // num_items
-
-  discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-  gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
-                                              temp_storage_bytes, // temp_storage_bytes
-                                              neighboursLUT,      // d_in
-                                              neighboursLUT,      // d_out
-                                              nCellsNext));       // num_items
-
-  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
-                                              temp_storage_bytes_2, // temp_storage_bytes
-                                              neighboursIndexTable, // d_in
-                                              neighboursIndexTable, // d_out
-                                              nCells + 1,           // num_items
-                                              0));                  // NOLINT: this is the offset of the sum, not a pointer
-
-  discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2));
-  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
-                                              temp_storage_bytes_2, // temp_storage_bytes
-                                              neighboursIndexTable, // d_in
-                                              neighboursIndexTable, // d_out
-                                              nCells + 1,           // num_items
-                                              0));                  // NOLINT: this is the offset of the sum, not a pointer
+  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
+                                           temp_storage_bytes, // temp_storage_bytes
+                                           neighboursLUT,      // d_in
+                                           neighboursLUT,      // d_out
+                                           nCellsNext));       // num_items
+
+  GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
+                                           temp_storage_bytes, // temp_storage_bytes
+                                           neighboursLUT,      // d_in
+                                           neighboursLUT,      // d_out
+                                           nCellsNext));       // num_items
+
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
+                                           temp_storage_bytes_2, // temp_storage_bytes
+                                           neighboursIndexTable, // d_in
+                                           neighboursIndexTable, // d_out
+                                           nCells + 1,           // num_items
+                                           0));                  // NOLINT: this is the offset of the sum, not a pointer
+
+  GPUChkErrS(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2));
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
+                                           temp_storage_bytes_2, // temp_storage_bytes
+                                           neighboursIndexTable, // d_in
+                                           neighboursIndexTable, // d_out
+                                           nCells + 1,           // num_items
+                                           0));                  // NOLINT: this is the offset of the sum, not a pointer
   unsigned int nNeighbours;
-  gpuCheckError(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaFree(d_temp_storage));
-  gpuCheckError(cudaFree(d_temp_storage_2));
+  GPUChkErrS(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost));
+  GPUChkErrS(cudaFree(d_temp_storage));
+  GPUChkErrS(cudaFree(d_temp_storage_2));
   return nNeighbours;
 }
 
@@ -1150,8 +1139,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
     layerIndex,
     nCells,
     maxCellNeighbours);
-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
+  GPUChkErrS(cudaPeekAtLastError());
+  GPUChkErrS(cudaDeviceSynchronize());
 }
 
 int filterCellNeighboursHandler(std::vector<int>& neighHost, // TODO: eventually remove this!
@@ -1172,12 +1161,12 @@ int filterCellNeighboursHandler(std::vector<int>& neighHost, // TODO: eventually
   thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()),
                thrust::make_permutation_iterator(neighVectorPairs, vals.end()),
                sortedNeigh.begin());
-  discardResult(cudaDeviceSynchronize());
+  GPUChkErrS(cudaDeviceSynchronize());
   auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair<int, int>()); // trim leading -1s
   auto trimmedSize = sortedNeigh.end() - trimmedBegin;
   neighHost.resize(trimmedSize);
   thrust::transform(trimmedBegin, sortedNeigh.end(), validNeighs, gpu::pair_to_first<int, int>());
-  gpuCheckError(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost));
+  GPUChkErrS(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost));
 
   return trimmedSize;
 }
@@ -1225,19 +1214,19 @@ void processNeighboursHandler(const int startLayer,
                                                             matCorrType);
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  gpuCheckError(cub::DeviceScan::ExclusiveSum(nullptr,                                       // d_temp_storage
-                                              temp_storage_bytes,                            // temp_storage_bytes
-                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
-                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
-                                              nCells[startLayer] + 1,                        // num_items
-                                              0));                                           // NOLINT: this is the offset of the sum, not a pointer
-  discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,                                // d_temp_storage
-                                              temp_storage_bytes,                            // temp_storage_bytes
-                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
-                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
-                                              nCells[startLayer] + 1,                        // num_items
-                                              0));                                           // NOLINT: this is the offset of the sum, not a pointer
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(nullptr,                                       // d_temp_storage
+                                           temp_storage_bytes,                            // temp_storage_bytes
+                                           thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+                                           thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+                                           nCells[startLayer] + 1,                        // num_items
+                                           0));                                           // NOLINT: this is the offset of the sum, not a pointer
+  GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,                                // d_temp_storage
+                                           temp_storage_bytes,                            // temp_storage_bytes
+                                           thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+                                           thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+                                           nCells[startLayer] + 1,                        // num_items
+                                           0));                                           // NOLINT: this is the offset of the sum, not a pointer
 
   updatedCellId.resize(foundSeedsTable.back());
   updatedCellSeed.resize(foundSeedsTable.back());
@@ -1260,7 +1249,7 @@ void processNeighboursHandler(const int startLayer,
                                                              propagator,
                                                              matCorrType);
   auto t1 = updatedCellSeed.size();
-  gpuCheckError(cudaFree(d_temp_storage));
+  GPUChkErrS(cudaFree(d_temp_storage));
   int level = startLevel;
   for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
     temp_storage_bytes = 0;
@@ -1289,19 +1278,19 @@ void processNeighboursHandler(const int startLayer,
                                                               maxChi2ClusterAttachment,
                                                               propagator,
                                                               matCorrType);
-    gpuCheckError(cub::DeviceScan::ExclusiveSum(nullptr,                                       // d_temp_storage
-                                                temp_storage_bytes,                            // temp_storage_bytes
-                                                thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
-                                                thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
-                                                nCells[iLayer] + 1,                            // num_items
-                                                0));                                           // NOLINT: this is the offset of the sum, not a pointer
-    discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,                                // d_temp_storage
-                                                temp_storage_bytes,                            // temp_storage_bytes
-                                                thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
-                                                thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
-                                                nCells[iLayer] + 1,                            // num_items
-                                                0));                                           // NOLINT: this is the offset of the sum, not a pointer
+    GPUChkErrS(cub::DeviceScan::ExclusiveSum(nullptr,                                       // d_temp_storage
+                                             temp_storage_bytes,                            // temp_storage_bytes
+                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+                                             nCells[iLayer] + 1,                            // num_items
+                                             0));                                           // NOLINT: this is the offset of the sum, not a pointer
+    GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+    GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage,                                // d_temp_storage
+                                             temp_storage_bytes,                            // temp_storage_bytes
+                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+                                             nCells[iLayer] + 1,                            // num_items
+                                             0));                                           // NOLINT: this is the offset of the sum, not a pointer
     auto foundSeeds{foundSeedsTable.back()};
     updatedCellId.resize(foundSeeds);
     thrust::fill(updatedCellId.begin(), updatedCellId.end(), 0);
@@ -1325,7 +1314,7 @@ void processNeighboursHandler(const int startLayer,
                                                                maxChi2ClusterAttachment,
                                                                propagator,
                                                                matCorrType);
-    gpuCheckError(cudaFree(d_temp_storage));
+    GPUChkErrS(cudaFree(d_temp_storage));
   }
   thrust::device_vector<CellSeed> outSeeds(updatedCellSeed.size());
   auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
@@ -1365,8 +1354,8 @@ void trackSeedHandler(CellSeed* trackSeeds,
   thrust::device_ptr<o2::its::TrackITSExt> tr_ptr(tracks);
 
   thrust::sort(tr_ptr, tr_ptr + nSeeds, gpu::compare_track_chi2());
-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
+  GPUChkErrS(cudaPeekAtLastError());
+  GPUChkErrS(cudaDeviceSynchronize());
 }
 
 template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Utils.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Utils.cu
deleted file mode 100644
index 99a24f347bd48..0000000000000
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Utils.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-#include <cuda_runtime.h>
-#include "ITStrackingGPU/Utils.h"
-#include "ITStrackingGPU/Context.h"
-#include "ITStracking/Constants.h"
-
-#include <sstream>
-#include <stdexcept>
-#include <cstdio>
-#include <iomanip>
-#include <numeric>
-#include <iostream>
-#include <cstdint>
-
-namespace
-{
-int roundUp(const int numToRound, const int multiple)
-{
-  if (multiple == 0) {
-    return numToRound;
-  }
-
-  int remainder{numToRound % multiple};
-  if (remainder == 0) {
-    return numToRound;
-  }
-  return numToRound + multiple - remainder;
-}
-
-int findNearestDivisor(const int numToRound, const int divisor)
-{
-
-  if (numToRound > divisor) {
-    return divisor;
-  }
-
-  int result = numToRound;
-  while (divisor % result != 0) {
-    ++result;
-  }
-  return result;
-}
-
-} // namespace
-
-namespace o2
-{
-namespace its
-{
-using constants::GB;
-namespace gpu
-{
-GPUh() void gpuThrowOnError()
-{
-  cudaError_t error = cudaGetLastError();
-
-  if (error != cudaSuccess) {
-    std::ostringstream errorString{};
-    errorString << GPU_ARCH << " API returned error  [" << cudaGetErrorString(error) << "] (code " << error << ")" << std::endl;
-    throw std::runtime_error{errorString.str()};
-  }
-}
-
-double bytesToconfig(size_t s) { return (double)s / (1024.0); }
-double bytesToGB(size_t s) { return (double)s / GB; }
-
-void utils::checkGPUError(const cudaError_t error, const char* file, const int line)
-{
-  if (error != cudaSuccess) {
-    std::ostringstream errorString{};
-    errorString << file << ":" << line << std::endl
-                << GPU_ARCH << " API returned error [" << cudaGetErrorString(error) << "] (code "
-                << error << ")" << std::endl;
-    throw std::runtime_error{errorString.str()};
-  }
-}
-
-void utils::getDeviceProp(int deviceId, bool print)
-{
-  const int w1 = 34;
-  std::cout << std::left;
-  std::cout << std::setw(w1)
-            << "--------------------------------------------------------------------------------"
-            << std::endl;
-  std::cout << std::setw(w1) << "device#" << deviceId << std::endl;
-
-  cudaDeviceProp props;
-  checkGPUError(cudaGetDeviceProperties(&props, deviceId));
-  if (print) {
-    std::cout << std::setw(w1) << "Name: " << props.name << std::endl;
-    std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl;
-    std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl;
-    std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl;
-    std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl;
-    std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor
-              << std::endl;
-    std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl;
-    std::cout << std::setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl;
-    std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz"
-              << std::endl;
-    std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl;
-    std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0
-              << " Mhz" << std::endl;
-    std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2)
-              << bytesToGB(props.totalGlobalMem) << " GB" << std::endl;
-#if !defined(__CUDACC__)
-    std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
-              << bytesToconfig(props.sharedMemPerMultiprocessor) << " config" << std::endl;
-#endif
-#if defined(__HIPCC__)
-    std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
-              << bytesToconfig(props.maxSharedMemoryPerMultiProcessor) << " config" << std::endl;
-#endif
-    std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl;
-    std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " config"
-              << std::endl;
-    std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl;
-    std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl;
-    std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl;
-    std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl;
-    std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl;
-    std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl;
-    std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl;
-    std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl;
-    std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl;
-    std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl;
-    std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl;
-    std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl;
-    std::cout << std::setw(w1) << "major: " << props.major << std::endl;
-    std::cout << std::setw(w1) << "minor: " << props.minor << std::endl;
-    std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl;
-    std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl;
-    std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl;
-#if defined(__HIPCC__)
-    std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl;
-    std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch
-              << std::endl;
-    std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl;
-    std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch
-              << std::endl;
-    std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl;
-    std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl;
-    std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl;
-    std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl;
-    std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl;
-    std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl;
-    std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl;
-    std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl;
-    std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl;
-    std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl;
-    std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl;
-    std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl;
-    std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl;
-    std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl;
-#endif
-    std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl;
-    std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl;
-    std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl;
-    std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl;
-    std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl;
-    std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl;
-    std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl;
-#if defined(__HIPCC__)
-    std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl;
-    std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl;
-#endif
-
-    int deviceCnt;
-    checkGPUError(cudaGetDeviceCount(&deviceCnt));
-    std::cout << std::setw(w1) << "peers: ";
-    for (int i = 0; i < deviceCnt; i++) {
-      int isPeer;
-      checkGPUError(cudaDeviceCanAccessPeer(&isPeer, i, deviceId));
-      if (isPeer) {
-        std::cout << "device#" << i << " ";
-      }
-    }
-    std::cout << std::endl;
-    std::cout << std::setw(w1) << "non-peers: ";
-    for (int i = 0; i < deviceCnt; i++) {
-      int isPeer;
-      checkGPUError(cudaDeviceCanAccessPeer(&isPeer, i, deviceId));
-      if (!isPeer) {
-        std::cout << "device#" << i << " ";
-      }
-    }
-    std::cout << std::endl;
-
-    size_t free, total;
-    checkGPUError(cudaMemGetInfo(&free, &total));
-
-    std::cout << std::fixed << std::setprecision(2);
-    std::cout << std::setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << std::endl;
-    std::cout << std::setw(w1) << "memInfo.free:  " << bytesToGB(free) << " GB (" << std::setprecision(0)
-              << (float)free / total * 100.0 << "%)" << std::endl;
-  }
-}
-
-dim3 utils::getBlockSize(const int colsNum)
-{
-  return getBlockSize(colsNum, 1);
-}
-
-dim3 utils::getBlockSize(const int colsNum, const int rowsNum)
-{
-  const DeviceProperties& deviceProperties = Context::getInstance().getDeviceProperties();
-  return getBlockSize(colsNum, rowsNum, deviceProperties.gpuCores / deviceProperties.maxBlocksPerSM);
-}
-
-dim3 utils::getBlockSize(const int colsNum, const int rowsNum, const int maxThreadsPerBlock)
-{
-  const DeviceProperties& deviceProperties = Context::getInstance().getDeviceProperties();
-  int xThreads = max(min(colsNum, deviceProperties.maxThreadsDim.x), 1);
-  int yThreads = max(min(rowsNum, deviceProperties.maxThreadsDim.y), 1);
-  const int totalThreads = roundUp(min(xThreads * yThreads, maxThreadsPerBlock),
-                                   deviceProperties.warpSize);
-
-  if (xThreads > yThreads) {
-
-    xThreads = findNearestDivisor(xThreads, totalThreads);
-    yThreads = totalThreads / xThreads;
-
-  } else {
-
-    yThreads = findNearestDivisor(yThreads, totalThreads);
-    xThreads = totalThreads / yThreads;
-  }
-
-  return dim3{static_cast<unsigned int>(xThreads), static_cast<unsigned int>(yThreads)};
-}
-
-dim3 utils::getBlocksGrid(const dim3& threadsPerBlock, const int rowsNum)
-{
-  return getBlocksGrid(threadsPerBlock, rowsNum, 1);
-}
-
-dim3 utils::getBlocksGrid(const dim3& threadsPerBlock, const int rowsNum, const int colsNum)
-{
-
-  return dim3{1 + (rowsNum - 1) / threadsPerBlock.x, 1 + (colsNum - 1) / threadsPerBlock.y};
-}
-
-void utils::gpuMalloc(void** p, const int size)
-{
-  checkGPUError(cudaMalloc(p, size), __FILE__, __LINE__);
-}
-
-void utils::gpuFree(void* p)
-{
-  checkGPUError(cudaFree(p), __FILE__, __LINE__);
-}
-
-void utils::gpuMemset(void* p, int value, int size)
-{
-  checkGPUError(cudaMemset(p, value, size), __FILE__, __LINE__);
-}
-
-void utils::gpuMemcpyHostToDevice(void* dst, const void* src, int size)
-{
-  checkGPUError(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice), __FILE__, __LINE__);
-}
-
-void utils::gpuMemcpyDeviceToHost(void* dst, const void* src, int size)
-{
-  checkGPUError(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost), __FILE__, __LINE__);
-}
-
-void utils::gpuMemcpyToSymbol(const void* symbol, const void* src, int size)
-{
-  checkGPUError(cudaMemcpyToSymbol(symbol, src, size, 0, cudaMemcpyHostToDevice), __FILE__, __LINE__);
-}
-
-void utils::gpuMemcpyFromSymbol(void* dst, const void* symbol, int size)
-{
-  checkGPUError(cudaMemcpyFromSymbol(dst, symbol, size, 0, cudaMemcpyDeviceToHost), __FILE__, __LINE__);
-}
-} // namespace gpu
-} // namespace its
-} // namespace o2
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx
index a26d52b2961c3..f630d2cdec76a 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx
@@ -54,162 +54,162 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
   if (!mTimeFrameGPU->getClusters().size()) {
     return;
   }
-  std::vector<std::thread> threads(mTimeFrameGPU->getNChunks());
-  for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
-    //   int rofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()};
-    //   mTimeFrameGPU->getVerticesInChunks()[chunkId].clear();
-    //   mTimeFrameGPU->getNVerticesInChunks()[chunkId].clear();
-    //   mTimeFrameGPU->getLabelsInChunks()[chunkId].clear();
-    //   auto doVertexReconstruction = [&, chunkId, rofPerChunk]() -> void {
-    //     auto offset = chunkId * rofPerChunk;
-    //     auto maxROF = offset + rofPerChunk;
-    //     while (offset < maxROF) {
-    //       auto rofs = mTimeFrameGPU->loadChunkData<gpu::Task::Vertexer>(chunkId, offset, maxROF);
-    //       RANGE("chunk_gpu_vertexing", 1);
-    //       // gpu::GpuTimer timer{offset, mTimeFrameGPU->getStream(chunkId).get()};
-    //       // timer.Start("vtTrackletFinder");
-    //       gpu::trackleterKernelMultipleRof<TrackletMode::Layer0Layer1><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),         // const Cluster* clustersNextLayer,    // 0 2
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
-    //         mTimeFrameGPU->getDeviceROframesClusters(0),                   // const int* sizeNextLClusters,
-    //         mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0),      // const int* nextIndexTables,
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),        // Tracklet* Tracklets,
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets,
-    //         mTimeFrameGPU->getDeviceIndexTableUtils(),                     // const IndexTableUtils* utils,
-    //         offset,                                                        // const unsigned int startRofId,
-    //         rofs,                                                          // const unsigned int rofSize,
-    //         mVrtParams.phiCut,                                             // const float phiCut,
-    //         mVrtParams.maxTrackletsPerCluster);                            // const size_t maxTrackletsPerCluster = 1e2
-
-    //       gpu::trackleterKernelMultipleRof<TrackletMode::Layer1Layer2><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2),         // const Cluster* clustersNextLayer,    // 0 2
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
-    //         mTimeFrameGPU->getDeviceROframesClusters(2),                   // const int* sizeNextLClusters,
-    //         mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2),      // const int* nextIndexTables,
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),        // Tracklet* Tracklets,
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets,
-    //         mTimeFrameGPU->getDeviceIndexTableUtils(),                     // const IndexTableUtils* utils,
-    //         offset,                                                        // const unsigned int startRofId,
-    //         rofs,                                                          // const unsigned int rofSize,
-    //         mVrtParams.phiCut,                                             // const float phiCut,
-    //         mVrtParams.maxTrackletsPerCluster);                            // const size_t maxTrackletsPerCluster = 1e2
-
-    //       gpu::trackletSelectionKernelMultipleRof<true><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
-    //         mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-    //         mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1),    // const int* nFoundTracklet12,            // Number of tracklets found on layers 1-2
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),        // unsigned char* usedTracklets,           // Used tracklets
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceLines(),                // Line* lines,                            // Lines
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),          // int* nFoundLines,                       // Number of found lines
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines,              // Number of found lines exclusive scan
-    //         offset,                                                           // const unsigned int startRofId,          // Starting ROF ID
-    //         rofs,                                                             // const unsigned int rofSize,             // Number of ROFs to consider
-    //         mVrtParams.maxTrackletsPerCluster,                                // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
-    //         mVrtParams.tanLambdaCut,                                          // const float tanLambdaCut = 0.025f,      // Cut on tan lambda
-    //         mVrtParams.phiCut);                                               // const float phiCut = 0.002f)            // Cut on phi
-
-    //       discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),
-    //                                                   mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize,
-    //                                                   mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),
-    //                                                   mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(),
-    //                                                   mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1),
-    //                                                   mTimeFrameGPU->getStream(chunkId).get()));
-
-    //       // Reset used tracklets
-    //       checkGPUError(cudaMemsetAsync(mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),
-    //                                     false,
-    //                                     sizeof(unsigned char) * mVrtParams.maxTrackletsPerCluster * mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1),
-    //                                     mTimeFrameGPU->getStream(chunkId).get()),
-    //                     __FILE__, __LINE__);
-
-    //       gpu::trackletSelectionKernelMultipleRof<false><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
-    //         mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-    //         mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1),    // const int* nFoundTracklet12,            // Number of tracklets found on layers 1-2
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),        // unsigned char* usedTracklets,           // Used tracklets
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceLines(),                // Line* lines,                            // Lines
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),          // int* nFoundLines,                       // Number of found lines
-    //         mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines,              // Number of found lines exclusive scan
-    //         offset,                                                           // const unsigned int startRofId,          // Starting ROF ID
-    //         rofs,                                                             // const unsigned int rofSize,             // Number of ROFs to consider
-    //         mVrtParams.maxTrackletsPerCluster,                                // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
-    //         mVrtParams.tanLambdaCut,                                          // const float tanLambdaCut = 0.025f,      // Cut on tan lambda
-    //         mVrtParams.phiCut);                                               // const float phiCut = 0.002f)            // Cut on phi
-
-    //       int nClusters = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1);
-    //       int lastFoundLines;
-    //       std::vector<int> exclusiveFoundLinesHost(nClusters + 1);
-
-    //       // Obtain whole exclusive sum including nCluster+1 element  (nCluster+1)th element is the total number of found lines.
-    //       checkGPUError(cudaMemcpyAsync(exclusiveFoundLinesHost.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), (nClusters) * sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-    //       checkGPUError(cudaMemcpyAsync(&lastFoundLines, mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines() + nClusters - 1, sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-    //       exclusiveFoundLinesHost[nClusters] = exclusiveFoundLinesHost[nClusters - 1] + lastFoundLines;
-
-    //       std::vector<Line> lines(exclusiveFoundLinesHost[nClusters]);
-
-    //       checkGPUError(cudaMemcpyAsync(lines.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), sizeof(Line) * lines.size(), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-    //       checkGPUError(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get()));
-
-    //       // Compute vertices
-    //       std::vector<ClusterLines> clusterLines;
-    //       std::vector<bool> usedLines;
-    //       for (int rofId{0}; rofId < rofs; ++rofId) {
-    //         auto rof = offset + rofId;
-    //         auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF
-    //         auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof];     // number of clusters for this ROF
-    //         auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof];                                                  // starting line offset for this ROF
-    //         auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof;
-    //         gsl::span<const o2::its::Line> linesInRof(lines.data() + linesOffsetRof, static_cast<gsl::span<o2::its::Line>::size_type>(nLinesRof));
-
-    //         usedLines.resize(linesInRof.size(), false);
-    //         usedLines.assign(linesInRof.size(), false);
-    //         clusterLines.clear();
-    //         clusterLines.reserve(nClustersL1Rof);
-    //         computeVerticesInRof(rof,
-    //                              linesInRof,
-    //                              usedLines,
-    //                              clusterLines,
-    //                              mTimeFrameGPU->getBeamXY(),
-    //                              mTimeFrameGPU->getVerticesInChunks()[chunkId],
-    //                              mTimeFrameGPU->getNVerticesInChunks()[chunkId],
-    //                              mTimeFrameGPU,
-    //                              mTimeFrameGPU->hasMCinformation() ? &mTimeFrameGPU->getLabelsInChunks()[chunkId] : nullptr);
-    //       }
-    //       offset += rofs;
-    //     }
-    //   };
-    //   // Do work
-    //   threads[chunkId] = std::thread(doVertexReconstruction);
-    // }
-    // for (auto& thread : threads) {
-    //   thread.join();
-    // }
-    // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
-    //   int start{0};
-    //   for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) {
-    //     gsl::span<const Vertex> rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast<gsl::span<Vertex>::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])};
-    //     mTimeFrameGPU->addPrimaryVertices(rofVerts);
-    //     if (mTimeFrameGPU->hasMCinformation()) {
-    //       mTimeFrameGPU->getVerticesLabels().emplace_back();
-    //       // TODO: add MC labels
-    //     }
-    //     start += mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId];
-    //   }
-    // }
-    // mTimeFrameGPU->wipe(3);
-  }
+  // std::vector<std::thread> threads(mTimeFrameGPU->getNChunks());
+  // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
+  //   int rofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()};
+  //   mTimeFrameGPU->getVerticesInChunks()[chunkId].clear();
+  //   mTimeFrameGPU->getNVerticesInChunks()[chunkId].clear();
+  //   mTimeFrameGPU->getLabelsInChunks()[chunkId].clear();
+  //   auto doVertexReconstruction = [&, chunkId, rofPerChunk]() -> void {
+  //     auto offset = chunkId * rofPerChunk;
+  //     auto maxROF = offset + rofPerChunk;
+  //     while (offset < maxROF) {
+  //       auto rofs = mTimeFrameGPU->loadChunkData<gpu::Task::Vertexer>(chunkId, offset, maxROF);
+  //       RANGE("chunk_gpu_vertexing", 1);
+  //       // gpu::GpuTimer timer{offset, mTimeFrameGPU->getStream(chunkId).get()};
+  //       // timer.Start("vtTrackletFinder");
+  //       gpu::trackleterKernelMultipleRof<TrackletMode::Layer0Layer1><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),         // const Cluster* clustersNextLayer,    // 0 2
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
+  //         mTimeFrameGPU->getDeviceROframesClusters(0),                   // const int* sizeNextLClusters,
+  //         mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0),      // const int* nextIndexTables,
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),        // Tracklet* Tracklets,
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets,
+  //         mTimeFrameGPU->getDeviceIndexTableUtils(),                     // const IndexTableUtils* utils,
+  //         offset,                                                        // const unsigned int startRofId,
+  //         rofs,                                                          // const unsigned int rofSize,
+  //         mVrtParams.phiCut,                                             // const float phiCut,
+  //         mVrtParams.maxTrackletsPerCluster);                            // const size_t maxTrackletsPerCluster = 1e2
+
+  //       gpu::trackleterKernelMultipleRof<TrackletMode::Layer1Layer2><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2),         // const Cluster* clustersNextLayer,    // 0 2
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
+  //         mTimeFrameGPU->getDeviceROframesClusters(2),                   // const int* sizeNextLClusters,
+  //         mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2),      // const int* nextIndexTables,
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),        // Tracklet* Tracklets,
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets,
+  //         mTimeFrameGPU->getDeviceIndexTableUtils(),                     // const IndexTableUtils* utils,
+  //         offset,                                                        // const unsigned int startRofId,
+  //         rofs,                                                          // const unsigned int rofSize,
+  //         mVrtParams.phiCut,                                             // const float phiCut,
+  //         mVrtParams.maxTrackletsPerCluster);                            // const size_t maxTrackletsPerCluster = 1e2
+
+  //       gpu::trackletSelectionKernelMultipleRof<true><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
+  //         mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
+  //         mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1),    // const int* nFoundTracklet12,            // Number of tracklets found on layers 1-2
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),        // unsigned char* usedTracklets,           // Used tracklets
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceLines(),                // Line* lines,                            // Lines
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),          // int* nFoundLines,                       // Number of found lines
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines,              // Number of found lines exclusive scan
+  //         offset,                                                           // const unsigned int startRofId,          // Starting ROF ID
+  //         rofs,                                                             // const unsigned int rofSize,             // Number of ROFs to consider
+  //         mVrtParams.maxTrackletsPerCluster,                                // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
+  //         mVrtParams.tanLambdaCut,                                          // const float tanLambdaCut = 0.025f,      // Cut on tan lambda
+  //         mVrtParams.phiCut);                                               // const float phiCut = 0.002f)            // Cut on phi
+
+  //       discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),
+  //                                                   mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize,
+  //                                                   mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),
+  //                                                   mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(),
+  //                                                   mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1),
+  //                                                   mTimeFrameGPU->getStream(chunkId).get()));
+
+  //       // Reset used tracklets
+  //       checkGPUError(cudaMemsetAsync(mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),
+  //                                     false,
+  //                                     sizeof(unsigned char) * mVrtParams.maxTrackletsPerCluster * mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1),
+  //                                     mTimeFrameGPU->getStream(chunkId).get()),
+  //                     __FILE__, __LINE__);
+
+  //       gpu::trackletSelectionKernelMultipleRof<false><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
+  //         mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
+  //         mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1),    // const int* nFoundTracklet12,            // Number of tracklets found on layers 1-2
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(),        // unsigned char* usedTracklets,           // Used tracklets
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceLines(),                // Line* lines,                            // Lines
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(),          // int* nFoundLines,                       // Number of found lines
+  //         mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines,              // Number of found lines exclusive scan
+  //         offset,                                                           // const unsigned int startRofId,          // Starting ROF ID
+  //         rofs,                                                             // const unsigned int rofSize,             // Number of ROFs to consider
+  //         mVrtParams.maxTrackletsPerCluster,                                // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster
+  //         mVrtParams.tanLambdaCut,                                          // const float tanLambdaCut = 0.025f,      // Cut on tan lambda
+  //         mVrtParams.phiCut);                                               // const float phiCut = 0.002f)            // Cut on phi
+
+  //       int nClusters = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1);
+  //       int lastFoundLines;
+  //       std::vector<int> exclusiveFoundLinesHost(nClusters + 1);
+
+  //       // Obtain whole exclusive sum including nCluster+1 element  (nCluster+1)th element is the total number of found lines.
+  //       checkGPUError(cudaMemcpyAsync(exclusiveFoundLinesHost.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), (nClusters) * sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
+  //       checkGPUError(cudaMemcpyAsync(&lastFoundLines, mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines() + nClusters - 1, sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
+  //       exclusiveFoundLinesHost[nClusters] = exclusiveFoundLinesHost[nClusters - 1] + lastFoundLines;
+
+  //       std::vector<Line> lines(exclusiveFoundLinesHost[nClusters]);
+
+  //       checkGPUError(cudaMemcpyAsync(lines.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), sizeof(Line) * lines.size(), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
+  //       checkGPUError(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get()));
+
+  //       // Compute vertices
+  //       std::vector<ClusterLines> clusterLines;
+  //       std::vector<bool> usedLines;
+  //       for (int rofId{0}; rofId < rofs; ++rofId) {
+  //         auto rof = offset + rofId;
+  //         auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF
+  //         auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof];     // number of clusters for this ROF
+  //         auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof];                                                  // starting line offset for this ROF
+  //         auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof;
+  //         gsl::span<const o2::its::Line> linesInRof(lines.data() + linesOffsetRof, static_cast<gsl::span<o2::its::Line>::size_type>(nLinesRof));
+
+  //         usedLines.resize(linesInRof.size(), false);
+  //         usedLines.assign(linesInRof.size(), false);
+  //         clusterLines.clear();
+  //         clusterLines.reserve(nClustersL1Rof);
+  //         computeVerticesInRof(rof,
+  //                              linesInRof,
+  //                              usedLines,
+  //                              clusterLines,
+  //                              mTimeFrameGPU->getBeamXY(),
+  //                              mTimeFrameGPU->getVerticesInChunks()[chunkId],
+  //                              mTimeFrameGPU->getNVerticesInChunks()[chunkId],
+  //                              mTimeFrameGPU,
+  //                              mTimeFrameGPU->hasMCinformation() ? &mTimeFrameGPU->getLabelsInChunks()[chunkId] : nullptr);
+  //       }
+  //       offset += rofs;
+  //     }
+  //   };
+  //   // Do work
+  //   threads[chunkId] = std::thread(doVertexReconstruction);
+  // }
+  // for (auto& thread : threads) {
+  //   thread.join();
+  // }
+  // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
+  //   int start{0};
+  //   for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) {
+  //     gsl::span<const Vertex> rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast<gsl::span<Vertex>::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])};
+  //     mTimeFrameGPU->addPrimaryVertices(rofVerts);
+  //     if (mTimeFrameGPU->hasMCinformation()) {
+  //       mTimeFrameGPU->getVerticesLabels().emplace_back();
+  //       // TODO: add MC labels
+  //     }
+  //     start += mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId];
+  //   }
+  // }
+  // mTimeFrameGPU->wipe(3);
+  // }
 }
 
 void VertexerTraitsGPU::computeTrackletMatching(const int iteration)
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
index 2ba4471ef61e5..3aab0624ef556 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
@@ -14,6 +14,7 @@
 #include <cub/cub.cuh>
 
 #include "ITStrackingGPU/VertexingKernels.h"
+#include "GPUCommonHelpers.h"
 
 namespace o2
 {
@@ -21,7 +22,6 @@ namespace its
 {
 using constants::its::VertexerHistogramVolume;
 using constants::math::TwoPi;
-using gpu::utils::checkGPUError;
 using math_utils::getNormalizedPhi;
 using namespace constants::its2;
 
@@ -84,17 +84,6 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde
               getPhiBinIndex(phiRangeMax)};
 }
 
-GPUh() void gpuThrowOnError()
-{
-  cudaError_t error = cudaGetLastError();
-
-  if (error != cudaSuccess) {
-    std::ostringstream errorString{};
-    errorString << GPU_ARCH << " API returned error  [" << cudaGetErrorString(error) << "] (code " << error << ")" << std::endl;
-    throw std::runtime_error{errorString.str()};
-  }
-}
-
 template <typename... Args>
 GPUd() void printOnThread(const unsigned int tId, const char* str, Args... args)
 {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
index 0b686273a159a..8015f0b20e862 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
@@ -14,16 +14,13 @@ if(HIP_ENABLED)
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fgpu-rdc")
   o2_add_hipified_library(ITStrackingHIP
                  SOURCES ../cuda/ClusterLinesGPU.cu
-                         ../cuda/Context.cu
                          ../cuda/TimeFrameGPU.cu
-                         ../cuda/TimeFrameChunk.cu
                          ../cuda/Stream.cu
                          ../cuda/TrackerTraitsGPU.cxx
                          ../cuda/TracerGPU.cu
                          ../cuda/TrackingKernels.cu
                          ../cuda/VertexingKernels.cu
                          ../cuda/VertexerTraitsGPU.cxx
-                         ../cuda/Utils.cu
                  PUBLIC_INCLUDE_DIRECTORIES ../
                  PUBLIC_LINK_LIBRARIES O2::ITStracking
                                        O2::GPUTracking
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h
index 6324b03cb8ca6..da02149fbc432 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h
@@ -23,8 +23,10 @@
 
 #include "ITStracking/Definitions.h"
 #include "CommonConstants/MathConstants.h"
+
 #include "GPUCommonMath.h"
 #include "GPUCommonDef.h"
+#include "GPUCommonArray.h"
 
 namespace o2
 {
@@ -52,9 +54,9 @@ constexpr int ClustersPerCell{3};
 constexpr int UnusedIndex{-1};
 constexpr float Resolution{0.0005f};
 
-GPUhdi() constexpr GPUArray<float, 3> VertexerHistogramVolume()
+GPUhdi() constexpr o2::gpu::gpustd::array<float, 3> VertexerHistogramVolume()
 {
-  return GPUArray<float, 3>{{1.98, 1.98, 40.f}};
+  return o2::gpu::gpustd::array<float, 3>{{1.98, 1.98, 40.f}};
 }
 } // namespace its
 
@@ -64,27 +66,29 @@ constexpr int LayersNumber{7};
 constexpr int TrackletsPerRoad{LayersNumber - 1};
 constexpr int CellsPerRoad{LayersNumber - 2};
 
-GPUhdi() constexpr GPUArray<float, LayersNumber> LayersZCoordinate()
+GPUhdi() constexpr o2::gpu::gpustd::array<float, LayersNumber> LayersZCoordinate()
 {
   constexpr double s = 1.; // safety margin
-  return GPUArray<float, LayersNumber>{{16.333f + s, 16.333f + s, 16.333f + s, 42.140f + s, 42.140f + s, 73.745f + s, 73.745f + s}};
+  return o2::gpu::gpustd::array<float, LayersNumber>{16.333f + s, 16.333f + s, 16.333f + s, 42.140f + s, 42.140f + s, 73.745f + s, 73.745f + s};
 }
-GPUhdi() constexpr GPUArray<float, LayersNumber> LayersRCoordinate()
+
+GPUhdi() constexpr o2::gpu::gpustd::array<float, LayersNumber> LayersRCoordinate()
 {
-  return GPUArray<float, LayersNumber>{{2.33959f, 3.14076f, 3.91924f, 19.6213f, 24.5597f, 34.388f, 39.3329f}};
+  return o2::gpu::gpustd::array<float, LayersNumber>{{2.33959f, 3.14076f, 3.91924f, 19.6213f, 24.5597f, 34.388f, 39.3329f}};
 }
 
 constexpr int ZBins{256};
 constexpr int PhiBins{128};
 constexpr float InversePhiBinSize{PhiBins / constants::math::TwoPi};
-GPUhdi() constexpr GPUArray<float, LayersNumber> InverseZBinSize()
+GPUhdi() constexpr o2::gpu::gpustd::array<float, LayersNumber> InverseZBinSize()
 {
   constexpr auto zSize = LayersZCoordinate();
-  return GPUArray<float, LayersNumber>{{0.5f * ZBins / (zSize[0]), 0.5f * ZBins / (zSize[1]), 0.5f * ZBins / (zSize[2]),
-                                        0.5f * ZBins / (zSize[3]), 0.5f * ZBins / (zSize[4]), 0.5f * ZBins / (zSize[5]),
-                                        0.5f * ZBins / (zSize[6])}};
+  return o2::gpu::gpustd::array<float, LayersNumber>{0.5f * ZBins / (zSize[0]), 0.5f * ZBins / (zSize[1]), 0.5f * ZBins / (zSize[2]),
+                                                     0.5f * ZBins / (zSize[3]), 0.5f * ZBins / (zSize[4]), 0.5f * ZBins / (zSize[5]),
+                                                     0.5f * ZBins / (zSize[6])};
 }
-inline float getInverseZCoordinate(const int layerIndex)
+
+GPUhdi() constexpr float getInverseZCoordinate(const int layerIndex)
 {
   return 0.5f * ZBins / LayersZCoordinate()[layerIndex];
 }
@@ -115,7 +119,7 @@ namespace pdgcodes
 constexpr int PionCode{211};
 }
 } // namespace constants
-#ifndef __OPENCL__ /// FIXME: this is for compatibility with OCL
+#ifndef GPUCA_GPUCODE_DEVICE
 typedef std::vector<std::vector<int>> index_table_t;
 #endif
 } // namespace its
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h
index a1d2fa338ba63..59d0e59eb0637 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h
@@ -15,15 +15,6 @@
 #ifndef TRACKINGITS_DEFINITIONS_H_
 #define TRACKINGITS_DEFINITIONS_H_
 
-// #define CA_DEBUG
-// #define VTX_DEBUG
-#define __USE_GPU_TRACER__
-
-template <typename T>
-void discardResult(const T&)
-{
-}
-
 #ifndef GPUCA_GPUCODE_DEVICE
 #include <array>
 #endif
@@ -36,103 +27,17 @@ void discardResult(const T&)
   } while (0)
 #endif
 
-#if defined(__CUDA_ARCH__) // ????
-#define TRACKINGITSU_GPU_DEVICE
-#endif
-
 #if defined(__CUDACC__) || defined(__HIPCC__)
-#define MATH_CEIL ceil
-
-#ifndef GPUCA_GPUCODE_DEVICE
-#include <cstddef>
-#endif
-#include "../GPU/ITStrackingGPU/Array.h"
-
-template <typename T, size_t Size>
-using GPUArray = o2::its::gpu::Array<T, Size>;
-
 #ifdef __CUDACC__
-#define GPU_ARCH "CUDA"
-
+#include <cuda_runtime.h>
 typedef cudaStream_t GPUStream;
-inline int getGPUCores(const int major, const int minor)
-{
-  // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-  typedef struct
-  {
-    int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-    int Cores;
-  } sSMtoCores;
-
-  sSMtoCores nGpuArchCoresPerSM[] =
-    {
-      {0x20, 32},  // Fermi Generation (SM 2.0) GF100 class
-      {0x21, 48},  // Fermi Generation (SM 2.1) GF10x class
-      {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
-      {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
-      {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
-      {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
-      {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
-      {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
-      {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
-      {0x60, 64},  // Pascal Generation (SM 6.0) GP100 class
-      {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
-      {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
-      {0x70, 64},  // Volta Generation (SM 7.0) GV100 class
-      {0x72, 64},  // Volta Generation (SM 7.2) GV10B class
-      {0x75, 64},  // Turing Generation (SM 7.5) TU1xx class
-      {-1, -1}};
-
-  int index = 0;
-
-  while (nGpuArchCoresPerSM[index].SM != -1) {
-    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
-      return nGpuArchCoresPerSM[index].Cores;
-    }
-
-    index++;
-  }
-
-  // If we don't find the values, we default use the previous one to run properly
-  return nGpuArchCoresPerSM[index - 1].Cores;
-}
-inline int getGPUMaxThreadsPerComputingUnit()
-{
-  return 8;
-}
-
 #else // __HIPCC__
-#define GPU_ARCH "HIP"
+#include <hip/hip_runtime.h>
 typedef hipStream_t GPUStream;
-inline int getGPUCores(const int major, const int minor)
-{
-  // Hardcoded result for AMD RADEON WX 9100, to be decided if and how determine this paramter
-  return 4096;
-}
-
-inline int getGPUMaxThreadsPerComputingUnit()
-{
-  return 8;
-}
 #endif
-
 #else
-#define MATH_CEIL std::ceil
-#ifndef __VECTOR_TYPES_H__
-#include "GPUCommonDef.h"
-#endif
-#ifndef __OPENCL__
-#include <cstddef>
-template <typename T, size_t Size>
-using GPUArray = std::array<T, Size>;
-#else
-#include "../GPU/ITStrackingGPU/Array.h"
-template <typename T, size_t Size>
-using GPUArray = o2::its::gpu::Array<T, Size>;
-#endif
-
-typedef struct _dummyStream {
+typedef struct __dummyStream {
 } GPUStream;
 #endif
 
-#endif
+#endif
\ No newline at end of file
diff --git a/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx b/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx
index 52119048b0ed8..630ad9acf59d2 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx
@@ -17,6 +17,8 @@
 #include "ITStracking/MathUtils.h"
 #include "ITStracking/IndexTableUtils.h"
 
+#include "GPUCommonArray.h"
+
 namespace o2
 {
 namespace its
@@ -90,8 +92,8 @@ bool Cluster::operator==(const Cluster& rhs) const
          this->indexTableBinIndex == rhs.indexTableBinIndex;
 }
 
-TrackingFrameInfo::TrackingFrameInfo(float x, float y, float z, float xTF, float alpha, GPUArray<float, 2>&& posTF,
-                                     GPUArray<float, 3>&& covTF)
+TrackingFrameInfo::TrackingFrameInfo(float x, float y, float z, float xTF, float alpha, o2::gpu::gpustd::array<float, 2>&& posTF,
+                                     o2::gpu::gpustd::array<float, 3>&& covTF)
   : xCoordinate{x}, yCoordinate{y}, zCoordinate{z}, xTrackingFrame{xTF}, alphaTrackingFrame{alpha}, positionTrackingFrame{posTF}, covarianceTrackingFrame{covTF}
 {
   // Nothing to do