diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h index a733f0ff99f26..5c19dda27f593 100644 --- a/GPU/Common/GPUCommonAlgorithm.h +++ b/GPU/Common/GPUCommonAlgorithm.h @@ -24,9 +24,7 @@ // ----------------------------- SORTING ----------------------------- -namespace o2 -{ -namespace gpu +namespace o2::gpu { class GPUCommonAlgorithm { @@ -43,6 +41,10 @@ class GPUCommonAlgorithm GPUd() static void sortInBlock(T* begin, T* end, const S& comp); template GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp); +#ifndef __OPENCL__ + template + GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); +#endif template GPUd() static void swap(T& a, T& b); @@ -71,13 +73,6 @@ class GPUCommonAlgorithm template GPUd() static void IterSwap(I a, I b) noexcept; }; -} // namespace gpu -} // namespace o2 - -namespace o2 -{ -namespace gpu -{ #ifndef GPUCA_ALGORITHM_STD template @@ -217,8 +212,7 @@ GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept typedef GPUCommonAlgorithm CAAlgo; -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY) @@ -226,9 +220,7 @@ typedef GPUCommonAlgorithm CAAlgo; #else -namespace o2 -{ -namespace gpu +namespace o2::gpu { template @@ -247,15 +239,12 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co GPUCommonAlgorithm::sort(begin, end, comp); } -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #endif // THRUST // sort and sortInBlock below are not taken from Thrust, since our implementations are faster -namespace o2 -{ -namespace gpu +namespace o2::gpu { template @@ -328,8 +317,7 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b) } #endif -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu // ----------------------------- WORK GROUP FUNCTIONS 
----------------------------- @@ -458,4 +446,8 @@ GPUdi() T warp_broadcast(T v, int32_t i) #endif +#ifdef GPUCA_ALGORITHM_STD +#undef GPUCA_ALGORITHM_STD +#endif + #endif diff --git a/GPU/Common/GPUCommonAlgorithmThrust.h b/GPU/Common/GPUCommonAlgorithmThrust.h index 0208c12f1cd08..049071227a58e 100644 --- a/GPU/Common/GPUCommonAlgorithmThrust.h +++ b/GPU/Common/GPUCommonAlgorithmThrust.h @@ -23,16 +23,19 @@ #pragma GCC diagnostic pop #include "GPUCommonDef.h" +#include "GPUCommonHelpers.h" -#ifdef __CUDACC__ +#ifndef __HIPCC__ // CUDA #define GPUCA_THRUST_NAMESPACE thrust::cuda -#else +#define GPUCA_CUB_NAMESPACE cub +#include +#else // HIP #define GPUCA_THRUST_NAMESPACE thrust::hip +#define GPUCA_CUB_NAMESPACE hipcub +#include #endif -namespace o2 -{ -namespace gpu +namespace o2::gpu { // - Our quicksort and bubble sort implementations are faster @@ -54,7 +57,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp) } template -GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) +GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) // TODO: Try cub::BlockMergeSort { if (get_local_id(0) == 0) { sortDeviceDynamic(begin, end); @@ -87,7 +90,24 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp); } -} // namespace gpu -} // namespace o2 +template +GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp) +{ + thrust::device_ptr p(begin); +#if 0 // Use Thrust + auto alloc = rec->getThrustVolatileDeviceAllocator(); + thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(rec->mInternals->Streams[stream]), p, p + N, comp); +#else // Use CUB + size_t tempSize = 0; + void* tempMem = nullptr; + GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream])); + tempMem = rec->AllocateVolatileDeviceMemory(tempSize); + 
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream])); +#endif +} +} // namespace o2::gpu + +#undef GPUCA_THRUST_NAMESPACE +#undef GPUCA_CUB_NAMESPACE #endif diff --git a/GPU/Common/GPUCommonHelpers.h b/GPU/Common/GPUCommonHelpers.h index 915d93c9bc791..2927ddab6bd0c 100644 --- a/GPU/Common/GPUCommonHelpers.h +++ b/GPU/Common/GPUCommonHelpers.h @@ -35,6 +35,7 @@ #include "GPUCommonDef.h" #include "GPUCommonLogger.h" #include +#include namespace o2::gpu::internal { @@ -60,4 +61,22 @@ static inline int32_t GPUReconstructionChkErr(const int64_t error, const char* f #undef GPUCOMMON_INTERNAL_CAT } // namespace o2::gpu::internal +namespace o2::gpu +{ +class GPUReconstruction; +class ThrustVolatileAllocator +{ + public: + typedef char value_type; + + char* allocate(std::ptrdiff_t n); + void deallocate(char* ptr, size_t); + + private: + ThrustVolatileAllocator(GPUReconstruction* r); + std::function mAlloc; + friend class GPUReconstruction; +}; +} // namespace o2::gpu + #endif diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h index b7a44c1df0f38..58f046161aa8b 100644 --- a/GPU/Common/GPUCommonMath.h +++ b/GPU/Common/GPUCommonMath.h @@ -42,9 +42,7 @@ #define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host #endif // clang-format on -namespace o2 -{ -namespace gpu +namespace o2::gpu { class GPUCommonMath @@ -540,7 +538,6 @@ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAt #undef GPUCA_CHOICE -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #endif // GPUCOMMONMATH_H diff --git a/GPU/Common/GPUCommonTransform3D.h b/GPU/Common/GPUCommonTransform3D.h index 4c5cca1f00ddc..2f517aded7eed 100644 --- a/GPU/Common/GPUCommonTransform3D.h +++ b/GPU/Common/GPUCommonTransform3D.h @@ -17,9 +17,7 @@ #include "GPUCommonDef.h" -namespace o2 -{ -namespace gpu +namespace o2::gpu { class Transform3D { @@ -79,7 +77,6 @@ class Transform3D kZZ = 10, 
kDZ = 11 }; }; -} // namespace gpu -} // namespace o2 +} // namespace o2::gpu #endif diff --git a/GPU/Common/GPUROOTCartesianFwd.h b/GPU/Common/GPUROOTCartesianFwd.h index 89b0aa44eb78c..c631637a3bc6e 100644 --- a/GPU/Common/GPUROOTCartesianFwd.h +++ b/GPU/Common/GPUROOTCartesianFwd.h @@ -46,9 +46,7 @@ class DefaultCoordinateSystemTag; } // namespace Math } // namespace ROOT -namespace o2 -{ -namespace math_utils +namespace o2::math_utils { namespace detail @@ -79,7 +77,6 @@ template using Vector3D = detail::GPUPoint3D; #endif -} // namespace math_utils -} // namespace o2 +} // namespace o2::math_utils #endif diff --git a/GPU/Common/GPUROOTSMatrixFwd.h b/GPU/Common/GPUROOTSMatrixFwd.h index 44b2254949df2..0159cc8922140 100644 --- a/GPU/Common/GPUROOTSMatrixFwd.h +++ b/GPU/Common/GPUROOTSMatrixFwd.h @@ -35,9 +35,7 @@ class MatRepStd; } // namespace Math } // namespace ROOT -namespace o2 -{ -namespace math_utils +namespace o2::math_utils { namespace detail @@ -72,7 +70,6 @@ template using MatRepStd = detail::MatRepStdGPU; #endif -} // namespace math_utils -} // namespace o2 +} // namespace o2::math_utils #endif diff --git a/GPU/GPUTracking/Base/GPUGeneralKernels.h b/GPU/GPUTracking/Base/GPUGeneralKernels.h index ce93e2e5eead8..eb816c91f5909 100644 --- a/GPU/GPUTracking/Base/GPUGeneralKernels.h +++ b/GPU/GPUTracking/Base/GPUGeneralKernels.h @@ -27,9 +27,9 @@ #endif #if defined(__HIPCC__) -#define GPUCA_CUB hipcub +#define GPUCA_CUB_NAMESPACE hipcub #else -#define GPUCA_CUB cub +#define GPUCA_CUB_NAMESPACE cub #endif namespace o2::gpu @@ -54,7 +54,7 @@ class GPUKernelTemplate struct GPUSharedMemoryWarpScan64 { // Provides the shared memory resources for warp wide CUB collectives #if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY) - typedef GPUCA_CUB::WarpScan WarpScan; + typedef GPUCA_CUB_NAMESPACE::WarpScan WarpScan; union { typename WarpScan::TempStorage cubWarpTmpMem; }; @@ -65,9 +65,9 @@ class 
GPUKernelTemplate struct GPUSharedMemoryScan64 { // Provides the shared memory resources for CUB collectives #if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY) - typedef GPUCA_CUB::BlockScan BlockScan; - typedef GPUCA_CUB::BlockReduce BlockReduce; - typedef GPUCA_CUB::WarpScan WarpScan; + typedef GPUCA_CUB_NAMESPACE::BlockScan BlockScan; + typedef GPUCA_CUB_NAMESPACE::BlockReduce BlockReduce; + typedef GPUCA_CUB_NAMESPACE::WarpScan WarpScan; union { typename BlockScan::TempStorage cubTmpMem; typename BlockReduce::TempStorage cubReduceTmpMem; @@ -110,6 +110,6 @@ class GPUitoa : public GPUKernelTemplate } // namespace o2::gpu -#undef GPUCA_CUB +#undef GPUCA_CUB_NAMESPACE #endif diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 5582084fd0e17..d96d5aad74622 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -30,6 +30,7 @@ #include "GPUROOTDumpCore.h" #include "GPUConfigDump.h" #include "GPUChainTracking.h" +#include "GPUCommonHelpers.h" #include "GPUMemoryResource.h" #include "GPUChain.h" @@ -1193,3 +1194,12 @@ void GPUReconstruction::SetInputControl(void* ptr, size_t size) { mInputControl.set(ptr, size); } + +ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r) +{ + mAlloc = [r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); }; +} +ThrustVolatileAllocator GPUReconstruction::getThrustVolatileDeviceAllocator() +{ + return ThrustVolatileAllocator(this); +} diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 5e03c77f08230..18098396e1349 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -47,6 +47,7 @@ struct GPUMemorySizeScalers; struct GPUReconstructionPipelineContext; struct GPUReconstructionThreading; class GPUROOTDumpCore; +class ThrustVolatileAllocator;
namespace gpu_reconstruction_kernels { @@ -165,6 +166,7 @@ class GPUReconstruction void ClearAllocatedMemory(bool clearOutputs = true); void ReturnVolatileDeviceMemory(); void ReturnVolatileMemory(); + ThrustVolatileAllocator getThrustVolatileDeviceAllocator(); void PushNonPersistentMemory(uint64_t tag); void PopNonPersistentMemory(RecoStep step, uint64_t tag); void BlockStackedMemory(GPUReconstruction* rec); diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index f8203c2dc5858..99c59afd2011a 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -18,7 +18,7 @@ endif() message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}") set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu) -set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h CUDAThrustHelpers.h) +set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) if(ALIGPU_BUILD_TYPE STREQUAL "O2") diff --git a/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h b/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h deleted file mode 100644 index fdc5c16d91f35..0000000000000 --- a/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
-// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file CUDAThrustHelpers.h -/// \author David Rohr - -#ifndef GPU_CUDATHRUSTHELPERS_H -#define GPU_CUDATHRUSTHELPERS_H - -#include "GPULogging.h" -#include -#include - -namespace o2::gpu -{ - -class ThrustVolatileAsyncAllocator -{ - public: - typedef char value_type; - - ThrustVolatileAsyncAllocator(GPUReconstruction* r) : mRec(r) {} - char* allocate(std::ptrdiff_t n) { return (char*)mRec->AllocateVolatileDeviceMemory(n); } - - void deallocate(char* ptr, size_t) {} - - private: - GPUReconstruction* mRec; -}; - -} // namespace o2::gpu - -#ifndef __HIPCC__ -// Override synchronize call at end of thrust algorithm running on stream, just don't run cudaStreamSynchronize -namespace thrust::cuda_cub -{ - -typedef thrust::cuda_cub::execution_policy thrustStreamPolicy; -template <> -__host__ __device__ inline cudaError_t synchronize(thrustStreamPolicy& policy) -{ -#ifndef GPUCA_GPUCODE_DEVICE - // Do not synchronize! 
- return cudaSuccess; -#else - return synchronize_stream(derived_cast(policy)); -#endif -} - -} // namespace thrust::cuda_cub -#endif // __HIPCC__ - -#endif // GPU_CUDATHRUSTHELPERS_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 202edd49bc44c..175fd205153ea 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -19,7 +19,6 @@ #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" -#include "CUDAThrustHelpers.h" #include "GPUReconstructionIncludes.h" #include "GPUParamRTC.h" #include "GPUReconstructionCUDAHelpers.inc" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index f78270d40146c..30bbc76d4c415 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -54,6 +54,8 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase void getRTCKernelCalls(std::vector& kernels); + template + friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); GPUReconstructionCUDAInternals* mInternals; }; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu index f341a778076b8..534f5e8606897 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu @@ -16,7 +16,6 @@ #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" -#include "CUDAThrustHelpers.h" #include diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc index a34f940a1337a..c2b6f6d05dd7f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc +++ 
b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAHelpers.inc @@ -16,6 +16,7 @@ #define GPURECONSTRUCTIONCUDAHELPERS_INC_H #include "GPUCommonHelpers.h" +#include "GPUReconstruction.h" namespace o2::gpu::internal { @@ -28,4 +29,10 @@ int32_t __attribute__((weak)) GPUReconstructionCUDAChkErr(const int64_t error, c } } // namespace o2::gpu::internal +namespace o2::gpu +{ +char* __attribute__((weak)) ThrustVolatileAllocator::allocate(std::ptrdiff_t n) { return mAlloc(n); } +void __attribute__((weak)) ThrustVolatileAllocator::deallocate(char* ptr, size_t) {} +} // namespace o2::gpu + #endif diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 0c83223ba238a..f1f459fe021bc 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -16,7 +16,6 @@ #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" -#include "CUDAThrustHelpers.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index fee43eb6d8b0d..16e6e72d56e9a 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -24,7 +24,7 @@ message(STATUS "Building GPUTracking with HIP support ${TMP_TARGET}") if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) - set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu CUDAThrustHelpers.h GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) + set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu 
GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesHost.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) @@ -63,7 +63,7 @@ endif() set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip) set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx) -set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h ${GPUCA_HIP_SOURCE_DIR}/HIPThrustHelpers.h) +set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index 36a947dda9dc3..ffab3ba0be063 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -50,19 +50,13 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSecto gputpcgmmergertypes::GPUTPCGMBorderRange* range2 = MergerShadow.BorderRange(jSector) + *processors()->tpcTrackers[jSector].NTracks(); runKernel({{1, -WarpSize(), stream, deviceType}}, range1, n1, 0); runKernel({{1, -WarpSize(), stream, deviceType}}, range2, n2, 1); - deviceEvent* 
e = nullptr; - int32_t ne = 0; - if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel - ne = std::min(n, mRec->NStreams()); - for (int32_t j = 1; j < ne; j++) { - RecordMarker(&mEvents->sector[j], j); - } - e = &mEvents->sector[1]; - ne--; - stream = 0; - } - runKernel({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, e, ne}}, i, withinSector, mergeMode); + runKernel({GetGridAuto(stream, deviceType)}, i, withinSector, mergeMode); + } + int32_t ne = std::min(n, mRec->NStreams()) - 1; // Stream 0 must wait for all streams, Note n > 1 + for (int32_t j = 0; j < ne; j++) { + RecordMarker(&mEvents->sector[j], j + 1); } + StreamWaitForEvents(0, &mEvents->sector[0], ne); } else { for (uint32_t i = 0; i < n; i++) { runKernel(GetGridAuto(0, deviceType), i, withinSector, mergeMode); @@ -121,7 +115,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) for (uint32_t i = 0; i < NSECTORS; i++) { runKernel({{1, -WarpSize(), 0, deviceType}}, i); runKernel(GetGridAuto(0, deviceType), i); - runKernel(GetGridAuto(0, deviceType), i); + runKernel(GetGridAuto(0, deviceType), i); // TODO: Why all in stream 0? 
} if (GetProcessingSettings().deterministicGPUReconstruction) { runKernel({{1, -WarpSize(), 0, deviceType}}, NSECTORS); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index 5ede29d7fd851..c8c844eee748a 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -772,12 +772,10 @@ struct MergeBorderTracks_compMin { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) { - thrust::device_ptr p(range); - ThrustVolatileAsyncAllocator alloc(this); if (cmpMax) { - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMax()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); } else { - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMin()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin()); } } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize MergeBorderTracks<3> @@ -1877,17 +1875,13 @@ struct GPUTPCGMMergerSortTracksQPt_comp { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackOrderProcess()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), 
GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackSort()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt @@ -2110,9 +2104,7 @@ struct GPUTPCGMMergerMergeLoopers_comp { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr params(mProcessorsShadow->tpcMerger.LooperCandidates()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), params, params + processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 13f204d0f940a..8056f22484e70 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -104,9 
+104,7 @@ struct GPUTPCGMO2OutputSort_comp { template <> inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) { - thrust::device_ptr trackSort(mProcessorsShadow->tpcMerger.TrackSortO2()); - ThrustVolatileAsyncAllocator alloc(this); - thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread