diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h index a733f0ff99f26..5c19dda27f593 100644 --- a/GPU/Common/GPUCommonAlgorithm.h +++ b/GPU/Common/GPUCommonAlgorithm.h @@ -24,9 +24,7 @@ // ----------------------------- SORTING ----------------------------- -namespace o2 -{ -namespace gpu +namespace o2::gpu { class GPUCommonAlgorithm { @@ -43,6 +41,10 @@ class GPUCommonAlgorithm GPUd() static void sortInBlock(T* begin, T* end, const S& comp); template GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp); +#ifndef __OPENCL__ + template + GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); +#endif template GPUd() static void swap(T& a, T& b); @@ -71,13 +73,6 @@ class GPUCommonAlgorithm template GPUd() static void IterSwap(I a, I b) noexcept; }; -} // namespace gpu -} // namespace o2 - -namespace o2 -{ -namespace gpu -{ #ifndef GPUCA_ALGORITHM_STD template @@ -217,8 +212,7 @@ GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept typedef GPUCommonAlgorithm CAAlgo; -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY) @@ -226,9 +220,7 @@ typedef GPUCommonAlgorithm CAAlgo; #else -namespace o2 -{ -namespace gpu +namespace o2::gpu { template @@ -247,15 +239,12 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co GPUCommonAlgorithm::sort(begin, end, comp); } -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #endif // THRUST // sort and sortInBlock below are not taken from Thrust, since our implementations are faster -namespace o2 -{ -namespace gpu +namespace o2::gpu { template @@ -328,8 +317,7 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b) } #endif -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu // ----------------------------- WORK GROUP FUNCTIONS 
----------------------------- @@ -458,4 +446,8 @@ GPUdi() T warp_broadcast(T v, int32_t i) #endif +#ifdef GPUCA_ALGORITHM_STD +#undef GPUCA_ALGORITHM_STD +#endif + #endif diff --git a/GPU/Common/GPUCommonAlgorithmThrust.h b/GPU/Common/GPUCommonAlgorithmThrust.h index 0208c12f1cd08..049071227a58e 100644 --- a/GPU/Common/GPUCommonAlgorithmThrust.h +++ b/GPU/Common/GPUCommonAlgorithmThrust.h @@ -23,16 +23,19 @@ #pragma GCC diagnostic pop #include "GPUCommonDef.h" +#include "GPUCommonHelpers.h" -#ifdef __CUDACC__ +#ifndef __HIPCC__ // CUDA #define GPUCA_THRUST_NAMESPACE thrust::cuda -#else +#define GPUCA_CUB_NAMESPACE cub +#include +#else // HIP #define GPUCA_THRUST_NAMESPACE thrust::hip +#define GPUCA_CUB_NAMESPACE hipcub +#include #endif -namespace o2 -{ -namespace gpu +namespace o2::gpu { // - Our quicksort and bubble sort implementations are faster @@ -54,7 +57,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp) } template -GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) +GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) // TODO: Try cub::BlockMergeSort { if (get_local_id(0) == 0) { sortDeviceDynamic(begin, end); @@ -87,7 +90,24 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp); } -} // namespace gpu -} // namespace o2 +template +GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp) +{ + thrust::device_ptr p(begin); +#if 0 // Use Thrust + auto alloc = rec->getThrustVolatileDeviceAllocator(); + thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(rec->mInternals->Streams[stream]), p, p + N, comp); +#else // Use CUB + size_t tempSize = 0; + void* tempMem = nullptr; + GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream])); + tempMem = rec->AllocateVolatileDeviceMemory(tempSize); + 
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream])); +#endif +} +} // namespace o2::gpu + +#undef GPUCA_THRUST_NAMESPACE +#undef GPUCA_CUB_NAMESPACE #endif diff --git a/GPU/Common/GPUCommonHelpers.h b/GPU/Common/GPUCommonHelpers.h index 915d93c9bc791..2927ddab6bd0c 100644 --- a/GPU/Common/GPUCommonHelpers.h +++ b/GPU/Common/GPUCommonHelpers.h @@ -35,6 +35,7 @@ #include "GPUCommonDef.h" #include "GPUCommonLogger.h" #include +#include namespace o2::gpu::internal { @@ -60,4 +61,22 @@ static inline int32_t GPUReconstructionChkErr(const int64_t error, const char* f #undef GPUCOMMON_INTERNAL_CAT } // namespace o2::gpu::internal +namespace o2::gpu +{ +class GPUReconstruction; +class ThrustVolatileAllocator +{ + public: + typedef char value_type; + + char* allocate(std::ptrdiff_t n); + void deallocate(char* ptr, size_t); + + private: + ThrustVolatileAllocator(GPUReconstruction* r); + std::function mAlloc; + friend class GPUReconstruction; +}; +} // namespace o2::gpu + #endif diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h index b7a44c1df0f38..58f046161aa8b 100644 --- a/GPU/Common/GPUCommonMath.h +++ b/GPU/Common/GPUCommonMath.h @@ -42,9 +42,7 @@ #define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host #endif // clang-format on -namespace o2 -{ -namespace gpu +namespace o2::gpu { class GPUCommonMath @@ -540,7 +538,6 @@ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAt #undef GPUCA_CHOICE -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #endif // GPUCOMMONMATH_H diff --git a/GPU/Common/GPUCommonTransform3D.h b/GPU/Common/GPUCommonTransform3D.h index 4c5cca1f00ddc..2f517aded7eed 100644 --- a/GPU/Common/GPUCommonTransform3D.h +++ b/GPU/Common/GPUCommonTransform3D.h @@ -17,9 +17,7 @@ #include "GPUCommonDef.h" -namespace o2 -{ -namespace gpu +namespace o2::gpu { class Transform3D { @@ -79,7 +77,6 @@ class Transform3D kZZ = 10, 
kDZ = 11 }; }; -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #endif diff --git a/GPU/Common/GPUROOTCartesianFwd.h b/GPU/Common/GPUROOTCartesianFwd.h index 89b0aa44eb78c..c631637a3bc6e 100644 --- a/GPU/Common/GPUROOTCartesianFwd.h +++ b/GPU/Common/GPUROOTCartesianFwd.h @@ -46,9 +46,7 @@ class DefaultCoordinateSystemTag; } // namespace Math } // namespace ROOT -namespace o2 -{ -namespace math_utils +namespace o2::math_utils { namespace detail @@ -79,7 +77,6 @@ template using Vector3D = detail::GPUPoint3D; #endif -} // namespace math_utils -} // namespace o2 +} // namespace o2::math_utils #endif diff --git a/GPU/Common/GPUROOTSMatrixFwd.h b/GPU/Common/GPUROOTSMatrixFwd.h index 44b2254949df2..0159cc8922140 100644 --- a/GPU/Common/GPUROOTSMatrixFwd.h +++ b/GPU/Common/GPUROOTSMatrixFwd.h @@ -35,9 +35,7 @@ class MatRepStd; } // namespace Math } // namespace ROOT -namespace o2 -{ -namespace math_utils +namespace o2::math_utils { namespace detail @@ -72,7 +70,6 @@ template using MatRepStd = detail::MatRepStdGPU; #endif -} // namespace math_utils -} // namespace o2 +} // namespace o2::math_utils #endif diff --git a/GPU/GPUTracking/Base/GPUGeneralKernels.h b/GPU/GPUTracking/Base/GPUGeneralKernels.h index ce93e2e5eead8..eb816c91f5909 100644 --- a/GPU/GPUTracking/Base/GPUGeneralKernels.h +++ b/GPU/GPUTracking/Base/GPUGeneralKernels.h @@ -27,9 +27,9 @@ #endif #if defined(__HIPCC__) -#define GPUCA_CUB hipcub +#define GPUCA_CUB_NAMESPACE hipcub #else -#define GPUCA_CUB cub +#define GPUCA_CUB_NAMESPACE cub #endif namespace o2::gpu @@ -54,7 +54,7 @@ class GPUKernelTemplate struct GPUSharedMemoryWarpScan64 { // Provides the shared memory resources for warp wide CUB collectives #if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY) - typedef GPUCA_CUB::WarpScan WarpScan; + typedef GPUCA_CUB_NAMESPACE::WarpScan WarpScan; union { typename WarpScan::TempStorage cubWarpTmpMem; }; @@ -65,9 +65,9 @@ class 
GPUKernelTemplate struct GPUSharedMemoryScan64 { // Provides the shared memory resources for CUB collectives #if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY) - typedef GPUCA_CUB::BlockScan BlockScan; - typedef GPUCA_CUB::BlockReduce BlockReduce; - typedef GPUCA_CUB::WarpScan WarpScan; + typedef GPUCA_CUB_NAMESPACE::BlockScan BlockScan; + typedef GPUCA_CUB_NAMESPACE::BlockReduce BlockReduce; + typedef GPUCA_CUB_NAMESPACE::WarpScan WarpScan; union { typename BlockScan::TempStorage cubTmpMem; typename BlockReduce::TempStorage cubReduceTmpMem; @@ -110,6 +110,6 @@ class GPUitoa : public GPUKernelTemplate } // namespace o2::gpu -#undef GPUCA_CUB +#undef GPUCA_CUB_NAMESPACE #endif diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 5582084fd0e17..d96d5aad74622 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -30,6 +30,7 @@ #include "GPUROOTDumpCore.h" #include "GPUConfigDump.h" #include "GPUChainTracking.h" +#include "GPUCommonHelpers.h" #include "GPUMemoryResource.h" #include "GPUChain.h" @@ -1193,3 +1194,12 @@ void GPUReconstruction::SetInputControl(void* ptr, size_t size) { mInputControl.set(ptr, size); } + +ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r) +{ + mAlloc = [r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); }; +} +ThrustVolatileAllocator GPUReconstruction::getThrustVolatileDeviceAllocator() +{ + return ThrustVolatileAllocator(this); +} diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 5e03c77f08230..18098396e1349 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -47,6 +47,7 @@ struct GPUMemorySizeScalers; struct GPUReconstructionPipelineContext; struct GPUReconstructionThreading; class GPUROOTDumpCore; +class ThrustVolatileAllocator;
namespace gpu_reconstruction_kernels { @@ -165,6 +166,7 @@ class GPUReconstruction void ClearAllocatedMemory(bool clearOutputs = true); void ReturnVolatileDeviceMemory(); void ReturnVolatileMemory(); + ThrustVolatileAllocator getThrustVolatileDeviceAllocator(); void PushNonPersistentMemory(uint64_t tag); void PopNonPersistentMemory(RecoStep step, uint64_t tag); void BlockStackedMemory(GPUReconstruction* rec); diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index f8203c2dc5858..99c59afd2011a 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -18,7 +18,7 @@ endif() message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}") set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu) -set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h CUDAThrustHelpers.h) +set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) if(ALIGPU_BUILD_TYPE STREQUAL "O2") diff --git a/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h b/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h deleted file mode 100644 index fdc5c16d91f35..0000000000000 --- a/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
-// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file CUDAThrustHelpers.h -/// \author David Rohr - -#ifndef GPU_CUDATHRUSTHELPERS_H -#define GPU_CUDATHRUSTHELPERS_H - -#include "GPULogging.h" -#include -#include - -namespace o2::gpu -{ - -class ThrustVolatileAsyncAllocator -{ - public: - typedef char value_type; - - ThrustVolatileAsyncAllocator(GPUReconstruction* r) : mRec(r) {} - char* allocate(std::ptrdiff_t n) { return (char*)mRec->AllocateVolatileDeviceMemory(n); } - - void deallocate(char* ptr, size_t) {} - - private: - GPUReconstruction* mRec; -}; - -} // namespace o2::gpu - -#ifndef __HIPCC__ -// Override synchronize call at end of thrust algorithm running on stream, just don't run cudaStreamSynchronize -namespace thrust::cuda_cub -{ - -typedef thrust::cuda_cub::execution_policy thrustStreamPolicy; -template <> -__host__ __device__ inline cudaError_t synchronize(thrustStreamPolicy& policy) -{ -#ifndef GPUCA_GPUCODE_DEVICE - // Do not synchronize! 
- return cudaSuccess; -#else - return synchronize_stream(derived_cast(policy)); -#endif -} - -} // namespace thrust::cuda_cub -#endif // __HIPCC__ - -#endif // GPU_CUDATHRUSTHELPERS_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 202edd49bc44c..175fd205153ea 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -19,7 +19,6 @@ #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" -#include "CUDAThrustHelpers.h" #include "GPUReconstructionIncludes.h" #include "GPUParamRTC.h" #include "GPUReconstructionCUDAHelpers.inc" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index f78270d40146c..30bbc76d4c415 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -54,6 +54,8 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase void getRTCKernelCalls(std::vector& kernels); + template + friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); GPUReconstructionCUDAInternals* mInternals; }; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu index f341a778076b8..534f5e8606897 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu @@ -16,7 +16,6 @@ #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" -#include "CUDAThrustHelpers.h" #include diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc index a34f940a1337a..c2b6f6d05dd7f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc +++ 
b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc @@ -16,6 +16,7 @@ #define GPURECONSTRUCTIONCUDAHELPERS_INC_H #include "GPUCommonHelpers.h" +#include "GPUReconstruction.h" namespace o2::gpu::internal { @@ -28,4 +29,10 @@ int32_t __attribute__((weak)) GPUReconstructionCUDAChkErr(const int64_t error, c } } // namespace o2::gpu::internal +namespace o2::gpu +{ +char* __attribute__((weak)) ThrustVolatileAllocator::allocate(std::ptrdiff_t n) { return mAlloc(n); } +void __attribute__((weak)) ThrustVolatileAllocator::deallocate(char* ptr, size_t) {} +} // namespace o2::gpu + #endif diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 0c83223ba238a..f1f459fe021bc 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -16,7 +16,6 @@ #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" -#include "CUDAThrustHelpers.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index fee43eb6d8b0d..16e6e72d56e9a 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -24,7 +24,7 @@ message(STATUS "Building GPUTracking with HIP support ${TMP_TARGET}") if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) - set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu CUDAThrustHelpers.h GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) + set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu 
GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesHost.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) @@ -63,7 +63,7 @@ endif() set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip) set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx) -set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h ${GPUCA_HIP_SOURCE_DIR}/HIPThrustHelpers.h) +set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index 36a947dda9dc3..ffab3ba0be063 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -50,19 +50,13 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSecto gputpcgmmergertypes::GPUTPCGMBorderRange* range2 = MergerShadow.BorderRange(jSector) + *processors()->tpcTrackers[jSector].NTracks(); runKernel({{1, -WarpSize(), stream, deviceType}}, range1, n1, 0); runKernel({{1, -WarpSize(), stream, deviceType}}, range2, n2, 1); - deviceEvent* 
e = nullptr; - int32_t ne = 0; - if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel - ne = std::min(n, mRec->NStreams()); - for (int32_t j = 1; j < ne; j++) { - RecordMarker(&mEvents->sector[j], j); - } - e = &mEvents->sector[1]; - ne--; - stream = 0; - } - runKernel({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, e, ne}}, i, withinSector, mergeMode); + runKernel({GetGridAuto(stream, deviceType)}, i, withinSector, mergeMode); + } + int32_t ne = std::min(n, mRec->NStreams()) - 1; // Stream 0 must wait for all streams, Note n > 1 + for (int32_t j = 0; j < ne; j++) { + RecordMarker(&mEvents->sector[j], j + 1); } + StreamWaitForEvents(0, &mEvents->sector[0], ne); } else { for (uint32_t i = 0; i < n; i++) { runKernel(GetGridAuto(0, deviceType), i, withinSector, mergeMode); @@ -121,7 +115,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) for (uint32_t i = 0; i < NSECTORS; i++) { runKernel({{1, -WarpSize(), 0, deviceType}}, i); runKernel(GetGridAuto(0, deviceType), i); - runKernel(GetGridAuto(0, deviceType), i); + runKernel(GetGridAuto(0, deviceType), i); // TODO: Why all in stream 0? 
} if (GetProcessingSettings().deterministicGPUReconstruction) { runKernel({{1, -WarpSize(), 0, deviceType}}, NSECTORS); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index 5ede29d7fd851..c8c844eee748a 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -772,12 +772,10 @@ struct MergeBorderTracks_compMin { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) { - thrust::device_ptr p(range); - ThrustVolatileAsyncAllocator alloc(this); if (cmpMax) { - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMax()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); } else { - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMin()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin()); } } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize MergeBorderTracks<3> @@ -1877,17 +1875,13 @@ struct GPUTPCGMMergerSortTracksQPt_comp { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackOrderProcess()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), 
GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackSort()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt @@ -2110,9 +2104,7 @@ struct GPUTPCGMMergerMergeLoopers_comp { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr params(mProcessorsShadow->tpcMerger.LooperCandidates()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), params, params + processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 13f204d0f940a..8056f22484e70 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -104,9 
+104,7 @@ struct GPUTPCGMO2OutputSort_comp { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr trackSort(mProcessorsShadow->tpcMerger.TrackSortO2()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread