diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Array.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Array.h deleted file mode 100644 index f4f73e715c305..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Array.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file Array.h -/// \brief -/// - -#ifndef ITSTRACKINGGPU_ARRAY_H_ -#define ITSTRACKINGGPU_ARRAY_H_ - -#include "GPUCommonDef.h" - -namespace o2 -{ -namespace its -{ -namespace gpu -{ -template -struct ArrayTraits final { - typedef T InternalArray[Size]; - - GPUhd() static constexpr T& getReference(const InternalArray& internalArray, size_t index) noexcept - { - return const_cast(internalArray[index]); - } - - GPUhd() static constexpr T* getPointer(const InternalArray& internalArray) noexcept - { - return const_cast(internalArray); - } -}; - -template -struct Array final { - - void copy(const Array& t) - { - memcpy(InternalArray, t.data(), Size * sizeof(T)); - } - - GPUhd() T* data() noexcept { return const_cast(InternalArray); } - GPUhd() const T* data() const noexcept { return const_cast(InternalArray); } - GPUhd() T& operator[](const int index) noexcept { return const_cast(InternalArray[index]); } - GPUhd() constexpr T& operator[](const int index) const noexcept { return const_cast(InternalArray[index]); } - GPUhd() size_t size() const noexcept { return Size; } - - T InternalArray[Size]; -}; -} // namespace gpu -} // namespace its -} // namespace o2 - -#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Context.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Context.h deleted file mode 100644 index bfc4c63756e0b..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Context.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file Context.h -/// \brief -/// - -#ifndef ITSTRACKINGGPU_CONTEXT_H_ -#define ITSTRACKINGGPU_CONTEXT_H_ - -#include -#include -#include "ITStracking/Definitions.h" - -namespace o2 -{ -namespace its -{ -namespace gpu -{ - -struct DeviceProperties final { - std::string name; - int gpuProcessors; - int gpuCores; - long globalMemorySize; - long constantMemorySize; - long sharedMemorySize; - long maxClockRate; - int busWidth; - long l2CacheSize; - long registersPerBlock; - int warpSize; - int maxThreadsPerBlock; - int maxBlocksPerSM; - dim3 maxThreadsDim; - dim3 maxGridDim; -}; - -class Context final -{ - public: - static Context& getInstance(); - - Context(const Context&); - Context& operator=(const Context&); - - const DeviceProperties& getDeviceProperties(); - const DeviceProperties& getDeviceProperties(const int); - - private: - Context(bool dumpDevices = false); - ~Context() = default; - - int mDevicesNum; - std::vector mDeviceProperties; -}; -} // namespace gpu -} // namespace its -} // namespace o2 - -#endif /* TRAKINGITSU_INCLUDE_GPU_CONTEXT_H_ */ diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/PrimaryVertexContextGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/PrimaryVertexContextGPU.h deleted file mode 100644 index a5e859475521c..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/PrimaryVertexContextGPU.h +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file PrimaryVertexContextNVNV.h -/// \brief -/// - -#ifndef ITSTRACKINGGPU_PRIMARYVERTEXCONTEXTGPU_H_ -#define ITSTRACKINGGPU_PRIMARYVERTEXCONTEXTGPU_H_ - -#include - -#include -#include - -#include "ITStracking/Configuration.h" -#include "ITStracking/Constants.h" -#include "ITStracking/Definitions.h" -#include "ITStracking/PrimaryVertexContext.h" -#include "ITStracking/Road.h" -#include "ITStracking/Tracklet.h" - -#include "DeviceStoreGPU.h" -#include "UniquePointer.h" - -namespace o2 -{ -namespace its -{ - -class PrimaryVertexContextNV final : public PrimaryVertexContext -{ - public: - PrimaryVertexContextNV() = default; - ~PrimaryVertexContextNV() override; - - void initialise(const MemoryParameters& memParam, const TrackingParameters& trkParam, - const std::vector>& cl, const std::array& pv, const int iteration) override; - - gpu::DeviceStoreNV& getDeviceContext(); - gpu::Array, constants::its2::LayersNumber>& getDeviceClusters(); - gpu::Array, constants::its2::TrackletsPerRoad>& getDeviceTracklets(); - gpu::Array, constants::its2::CellsPerRoad>& getDeviceTrackletsLookupTable(); - gpu::Array, constants::its2::CellsPerRoad>& getDeviceTrackletsPerClustersTable(); - gpu::Array, constants::its2::CellsPerRoad>& getDeviceCells(); - gpu::Array, constants::its2::CellsPerRoad - 1>& getDeviceCellsLookupTable(); - gpu::Array, constants::its2::CellsPerRoad - 1>& getDeviceCellsPerTrackletTable(); - std::array, constants::its2::CellsPerRoad>& getTempTableArray(); - std::array, constants::its2::CellsPerRoad>& getTempTrackletArray(); - std::array, constants::its2::CellsPerRoad - 1>& getTempCellArray(); - void updateDeviceContext(); - - private: - gpu::DeviceStoreNV mGPUContext; - gpu::UniquePointer mGPUContextDevicePointer; - std::array, constants::its2::CellsPerRoad> mTempTableArray; - std::array, constants::its2::CellsPerRoad> mTempTrackletArray; - std::array, constants::its2::CellsPerRoad - 1> mTempCellArray; -}; - -inline PrimaryVertexContextNV::~PrimaryVertexContextNV() = default; - -inline gpu::DeviceStoreNV& PrimaryVertexContextNV::getDeviceContext() -{ - return *mGPUContextDevicePointer; -} - -inline gpu::Array, constants::its2::LayersNumber>& PrimaryVertexContextNV::getDeviceClusters() -{ - return mGPUContext.getClusters(); -} - -inline gpu::Array, constants::its2::TrackletsPerRoad>& PrimaryVertexContextNV::getDeviceTracklets() -{ - return mGPUContext.getTracklets(); -} - -inline gpu::Array, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getDeviceTrackletsLookupTable() -{ - return mGPUContext.getTrackletsLookupTable(); -} - -inline gpu::Array, constants::its2::CellsPerRoad>& - PrimaryVertexContextNV::getDeviceTrackletsPerClustersTable() -{ - return mGPUContext.getTrackletsPerClusterTable(); -} - -inline gpu::Array, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getDeviceCells() -{ - return mGPUContext.getCells(); -} - -inline gpu::Array, constants::its2::CellsPerRoad - 1>& PrimaryVertexContextNV::getDeviceCellsLookupTable() -{ - return mGPUContext.getCellsLookupTable(); -} - -inline gpu::Array, constants::its2::CellsPerRoad - 1>& - PrimaryVertexContextNV::getDeviceCellsPerTrackletTable() -{ - return mGPUContext.getCellsPerTrackletTable(); -} - -inline std::array, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getTempTableArray() -{ - return mTempTableArray; -} - -inline std::array, constants::its2::CellsPerRoad>& PrimaryVertexContextNV::getTempTrackletArray() -{ - return mTempTrackletArray; -} - -inline std::array, constants::its2::CellsPerRoad - 1>& PrimaryVertexContextNV::getTempCellArray() -{ - return mTempCellArray; -} - -inline void PrimaryVertexContextNV::updateDeviceContext() -{ - mGPUContextDevicePointer = gpu::UniquePointer{mGPUContext}; -} - -inline void PrimaryVertexContextNV::initialise(const MemoryParameters& memParam, const TrackingParameters& trkParam, - const std::vector>& cl, const std::array& pv, const int iteration) -{ - ///TODO: to be re-enabled in the future - // this->PrimaryVertexContext::initialise(memParam, cl, pv, iteration); - // mGPUContextDevicePointer = mGPUContext.initialise(mPrimaryVertex, mClusters, mTracklets, mCells, mCellsLookupTable, mMinR, mMaxR); -} - -} // namespace its -} // namespace o2 - -#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h index 20744b47cd9b5..ba36c29addc73 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Stream.h @@ -18,16 +18,11 @@ #include "ITStracking/Definitions.h" -namespace o2 -{ -namespace its -{ -namespace gpu +namespace o2::its::gpu { class Stream final { - public: Stream(); ~Stream(); @@ -37,8 +32,5 @@ class Stream final private: GPUStream mStream; }; -} // namespace gpu -} // namespace its -} // namespace o2 - -#endif /* TRAKINGITSU_INCLUDE_GPU_STREAM_H_ */ +} // namespace o2::its::gpu +#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h index c477922e59533..ac489fc8bfee5 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h @@ -17,8 +17,8 @@ #include "ITStracking/TimeFrame.h" #include "ITStrackingGPU/ClusterLinesGPU.h" -#include "ITStrackingGPU/Array.h" -#include "ITStrackingGPU/Vector.h" +// #include "ITStrackingGPU/Array.h" +// #include "ITStrackingGPU/Vector.h" #include "ITStrackingGPU/Stream.h" #include diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 100e49def0d50..1581eea6ce81a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -17,10 +17,8 @@ #include "ITStracking/Configuration.h" #include "ITStrackingGPU/ClusterLinesGPU.h" -#include "ITStrackingGPU/Array.h" -#include "ITStrackingGPU/Vector.h" #include "ITStrackingGPU/Stream.h" -#include "ITStrackingGPU/TimeFrameChunk.h" +#include "ITStrackingGPU/Utils.h" #include @@ -39,8 +37,6 @@ class DefaultGPUAllocator : public ExternalAllocator template class TimeFrameGPU : public TimeFrame { - friend class GpuTimeFrameChunk; - public: TimeFrameGPU(); ~TimeFrameGPU(); @@ -88,24 +84,18 @@ class TimeFrameGPU : public TimeFrame void downloadCellsDevice(); void downloadCellsLUTDevice(); void unregisterRest(); - void initDeviceChunks(const int, const int); template - size_t loadChunkData(const size_t, const size_t, const size_t); - size_t getNChunks() const { return mMemChunks.size(); } - GpuTimeFrameChunk& getChunk(const int chunk) { return mMemChunks[chunk]; } - Stream& getStream(const size_t stream) { return mGpuStreams[stream]; } + Stream& getStream(const size_t stream) + { + return mGpuStreams[stream]; + } void wipe(const int); /// interface int getNClustersInRofSpan(const int, const int, const int) const; IndexTableUtils* getDeviceIndexTableUtils() { return mIndexTableUtilsDevice; } int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; } - std::vector>& getVerticesInChunks() { return mVerticesInChunks; } - std::vector>& getNVerticesInChunks() { return mNVerticesInChunks; } std::vector& getTrackITSExt() { return mTrackITSExt; } - std::vector>& getLabelsInChunks() { return mLabelsInChunks; } - int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones. - StaticTrackingParameters* getDeviceTrackingParameters() { return mTrackingParamsDevice; } Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; } int* getDeviceROFramesPV() { return mROFramesPVDevice; } unsigned char* getDeviceUsedClusters(const int); @@ -158,16 +148,13 @@ class TimeFrameGPU : public TimeFrame private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations bool mHostRegistered = false; - std::vector> mMemChunks; TimeFrameGPUParameters mGpuParams; - StaticTrackingParameters mStaticTrackingParams; // Host-available device buffer sizes std::array mNTracklets; std::array mNCells; // Device pointers - StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; // Hybrid pref @@ -216,31 +203,10 @@ class TimeFrameGPU : public TimeFrame size_t mAvailMemGB; bool mFirstInit = true; - // Output - std::vector> mVerticesInChunks; - std::vector> mNVerticesInChunks; - std::vector> mLabelsInChunks; - // Temporary buffer for storing output tracks from GPU tracking std::vector mTrackITSExt; }; -template -template -size_t TimeFrameGPU::loadChunkData(const size_t chunk, const size_t offset, const size_t maxRofs) // offset: readout frame to start from, maxRofs: to manage boundaries -{ - size_t nRof{0}; - - mMemChunks[chunk].reset(task, mGpuStreams[chunk]); // Reset chunks memory - if constexpr ((bool)task) { - nRof = mMemChunks[chunk].loadDataOnDevice(offset, maxRofs, 3, mGpuStreams[chunk]); - } else { - nRof = mMemChunks[chunk].loadDataOnDevice(offset, maxRofs, nLayers, mGpuStreams[chunk]); - } - LOGP(debug, "In chunk {}: loaded {} readout frames starting from {}", chunk, nRof, offset); - return nRof; -} - template inline int TimeFrameGPU::getNClustersInRofSpan(const int rofIdstart, const int rofSpanSize, const int layerId) const { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h index 076523261ff7e..f9583d97ca030 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h @@ -55,7 +55,6 @@ class TrackerTraitsGPU : public TrackerTraits private: IndexTableUtils* mDeviceIndexTableUtils; gpu::TimeFrameGPU<7>* mTimeFrameGPU; - gpu::StaticTrackingParameters* mStaticTrkPars; }; template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/UniquePointer.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/UniquePointer.h deleted file mode 100644 index ce04da3dde622..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/UniquePointer.h +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file UniquePointer.h -/// \brief -/// - -#ifndef ITSTRACKINGGPU_UNIQUEPOINTER_H_ -#define ITSTRACKINGGPU_UNIQUEPOINTER_H_ - -#include "Utils.h" - -namespace o2 -{ -namespace its -{ -namespace gpu -{ - -namespace -{ -template -struct UniquePointerTraits final { - typedef T* InternalPointer; - - GPUhd() static constexpr T& getReference(const InternalPointer& internalPointer) noexcept - { - return const_cast(*internalPointer); - } - - GPUhd() static constexpr T* getPointer(const InternalPointer& internalPointer) noexcept - { - return const_cast(internalPointer); - } -}; -} // namespace - -template -class UniquePointer final -{ - typedef UniquePointerTraits PointerTraits; - - public: - UniquePointer(); - explicit UniquePointer(const T&); - ~UniquePointer(); - - UniquePointer(const UniquePointer&) = delete; - UniquePointer& operator=(const UniquePointer&) = delete; - - UniquePointer(UniquePointer&&); - UniquePointer& operator=(UniquePointer&&); - - GPUhd() T* get() noexcept; - GPUhd() const T* get() const noexcept; - GPUhd() T& operator*() noexcept; - GPUhd() const T& operator*() const noexcept; - - protected: - void destroy(); - - private: - typename PointerTraits::InternalPointer mDevicePointer; -}; - -template -UniquePointer::UniquePointer() : mDevicePointer{nullptr} -{ - // Nothing to do -} - -template -UniquePointer::UniquePointer(const T& ref) -{ - try { - - utils::host::gpuMalloc(reinterpret_cast(&mDevicePointer), sizeof(T)); - utils::host::gpuMemcpyHostToDevice(mDevicePointer, &ref, sizeof(T)); - - } catch (...) { - - destroy(); - - throw; - } -} - -template -UniquePointer::~UniquePointer() -{ - destroy(); -} - -template -UniquePointer::UniquePointer(UniquePointer&& other) : mDevicePointer{other.mDevicePointer} -{ - // Nothing to do -} - -template -UniquePointer& UniquePointer::operator=(UniquePointer&& other) -{ - mDevicePointer = other.mDevicePointer; - other.mDevicePointer = nullptr; - - return *this; -} - -template -void UniquePointer::destroy() -{ - if (mDevicePointer != nullptr) { - - utils::host::gpuFree(mDevicePointer); - } -} - -template -GPUhd() T* UniquePointer::get() noexcept -{ - return PointerTraits::getPointer(mDevicePointer); -} - -template -GPUhd() const T* UniquePointer::get() const noexcept -{ - return PointerTraits::getPointer(mDevicePointer); -} - -template -GPUhd() T& UniquePointer::operator*() noexcept -{ - return PointerTraits::getReference(mDevicePointer); -} - -template -GPUhd() const T& UniquePointer::operator*() const noexcept -{ - return PointerTraits::getReference(mDevicePointer); -} -} // namespace gpu -} // namespace its -} // namespace o2 - -#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index a88e51742e84a..7ed70a4e0fdd9 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -31,6 +31,12 @@ struct gpuPair { namespace gpu { + +template +void discardResult(const T&) +{ +} + // Poor man implementation of a span-like struct. It is very limited. template struct gpuSpan { @@ -91,38 +97,6 @@ GPUhd() const T* getPtrFromRuler(int index, const T* src, const int* ruler, cons { return src + ruler[index] * stride; } - -GPUh() void gpuThrowOnError(); - -namespace utils -{ -#ifdef __CUDACC__ -void checkGPUError(const cudaError_t error, const char* file = __FILE__, const int line = __LINE__); -#endif -#ifdef __HIPCC__ -void checkGPUError(const hipError_t error, const char* file = __FILE__, const int line = __LINE__); -#endif - -// Dump device properties -void getDeviceProp(int, bool verbose = true); - -dim3 getBlockSize(const int); -dim3 getBlockSize(const int, const int); -dim3 getBlockSize(const int, const int, const int); -dim3 getBlocksGrid(const dim3&, const int); -dim3 getBlocksGrid(const dim3&, const int, const int); - -void gpuMalloc(void**, const int); -void gpuFree(void*); -void gpuMemset(void*, int, int); -void gpuMemcpyHostToDevice(void*, const void*, int); -void gpuMemcpyDeviceToHost(void*, const void*, int); -void gpuMemcpyToSymbol(const void* symbol, const void* src, int size); -void gpuMemcpyFromSymbol(void* dst, const void* symbol, int size); - -GPUd() int getLaneIndex(); -GPUd() int shareToWarp(const int, const int); -} // namespace utils } // namespace gpu } // namespace its } // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Vector.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Vector.h deleted file mode 100644 index 3912caec8449c..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Vector.h +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file Vector.h -/// \brief -/// - -#ifndef ITSTRACKINGGPU_VECTOR_H_ -#define ITSTRACKINGGPU_VECTOR_H_ - -#include -#include -#include -#include - -#include "Stream.h" -#include "Utils.h" - -namespace o2 -{ -namespace its -{ -namespace gpu -{ - -template -class Vector final -{ - static_assert(std::is_trivially_destructible::value, "Vector only supports trivially destructible objects."); - - public: - Vector(); - explicit Vector(const size_t, const size_t = 0); - Vector(const T* const, const size_t, const size_t = 0); - GPUhd() ~Vector(); - - Vector(const Vector&) = delete; - Vector& operator=(const Vector&) = delete; - - GPUhd() Vector(Vector&&); - Vector& operator=(Vector&&); - - size_t getSizeFromDevice() const; - - T getElementFromDevice(const size_t) const; - - void resize(const size_t); - void reset(const size_t, const size_t = 0); - void reset(const T* const, const size_t, const size_t = 0); - - void resetWithInt(const size_t, const int value = 0); - void copyIntoSizedVector(std::vector&); - - GPUhd() T* get() const; - GPUhd() size_t capacity() const; - GPUhd() Vector getWeakCopy() const; - GPUd() T& operator[](const size_t) const; - - GPUd() size_t size() const; - GPUhd() void dump(); - - template - GPUd() void emplace(const size_t, Args&&...); - - protected: - void destroy(); - - private: - GPUhd() Vector(const Vector&, const bool); - - T* mArrayPtr = nullptr; - size_t* mDeviceSizePtr = nullptr; - size_t mCapacity; - bool mIsWeak; -}; - -template -Vector::Vector() : Vector{nullptr, 0} -{ - // Nothing to do -} - -template -Vector::Vector(const size_t capacity, const size_t initialSize) : Vector{nullptr, capacity, initialSize} -{ - // Nothing to do -} - -template -Vector::Vector(const T* const source, const size_t size, const size_t initialSize) : mCapacity{size}, mIsWeak{false} -{ - if (size > 0) { - try { - - utils::gpuMalloc(reinterpret_cast(&mArrayPtr), size * sizeof(T)); - utils::gpuMalloc(reinterpret_cast(&mDeviceSizePtr), sizeof(size_t)); - - if (source != nullptr) { - - utils::gpuMemcpyHostToDevice(mArrayPtr, source, size * sizeof(T)); - utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(size_t)); - - } else { - - utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &initialSize, sizeof(size_t)); - } - - } catch (...) { - - destroy(); - - throw; - } - } -} - -template -GPUhd() Vector::Vector(const Vector& other, const bool isWeak) - : mArrayPtr{other.mArrayPtr}, - mDeviceSizePtr{other.mDeviceSizePtr}, - mCapacity{other.mCapacity}, - mIsWeak{isWeak} -{ - // Nothing to do -} - -template -GPUhd() Vector::~Vector() -{ - if (mIsWeak) { - return; - } else { -#if defined(TRACKINGITSU_GPU_DEVICE) - assert(0); -#else - destroy(); -#endif - } -} - -template -GPUhd() Vector::Vector(Vector&& other) - : mArrayPtr{other.mArrayPtr}, - mDeviceSizePtr{other.mDeviceSizePtr}, - mCapacity{other.mCapacity}, - mIsWeak{other.mIsWeak} -{ - other.mArrayPtr = nullptr; - other.mDeviceSizePtr = nullptr; -} - -template -Vector& Vector::operator=(Vector&& other) -{ - destroy(); - - mArrayPtr = other.mArrayPtr; - mDeviceSizePtr = other.mDeviceSizePtr; - mCapacity = other.mCapacity; - mIsWeak = other.mIsWeak; - - other.mArrayPtr = nullptr; - other.mDeviceSizePtr = nullptr; - - return *this; -} - -template -size_t Vector::getSizeFromDevice() const -{ - size_t size; - utils::gpuMemcpyDeviceToHost(&size, mDeviceSizePtr, sizeof(size_t)); - - return size; -} - -template -void Vector::resize(const size_t size) -{ - utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(size_t)); -} - -template -void Vector::reset(const size_t capacity, const size_t initialSize) -{ - reset(nullptr, capacity, initialSize); -} - -template -void Vector::reset(const T* const source, const size_t size, const size_t initialSize) -{ - if (size > mCapacity) { - if (mArrayPtr != nullptr) { - utils::gpuFree(mArrayPtr); - } - utils::gpuMalloc(reinterpret_cast(&mArrayPtr), size * sizeof(T)); - mCapacity = size; - } - if (mDeviceSizePtr == nullptr) { - utils::gpuMalloc(reinterpret_cast(&mDeviceSizePtr), sizeof(size_t)); - } - - if (source != nullptr) { - utils::gpuMemcpyHostToDevice(mArrayPtr, source, size * sizeof(T)); - utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(size_t)); - } else { - utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &initialSize, sizeof(size_t)); - } -} - -template -void Vector::resetWithInt(const size_t size, const int value) -{ - if (size > mCapacity) { - if (mArrayPtr != nullptr) { - utils::gpuFree(mArrayPtr); - } - utils::gpuMalloc(reinterpret_cast(&mArrayPtr), size * sizeof(int)); - mCapacity = size; - } - if (mDeviceSizePtr == nullptr) { - utils::gpuMalloc(reinterpret_cast(&mDeviceSizePtr), sizeof(int)); - } - - utils::gpuMemset(mArrayPtr, value, size * sizeof(int)); - utils::gpuMemcpyHostToDevice(mDeviceSizePtr, &size, sizeof(int)); -} - -template -void Vector::copyIntoSizedVector(std::vector& destinationVector) -{ - utils::gpuMemcpyDeviceToHost(destinationVector.data(), mArrayPtr, destinationVector.size() * sizeof(T)); -} - -template -inline void Vector::destroy() -{ - if (mArrayPtr != nullptr) { - utils::gpuFree(mArrayPtr); - } - if (mDeviceSizePtr != nullptr) { - utils::gpuFree(mDeviceSizePtr); - } -} - -template -GPUhd() T* Vector::get() const -{ - return mArrayPtr; -} - -template -GPUhd() size_t Vector::capacity() const -{ - return mCapacity; -} - -template -GPUhd() Vector Vector::getWeakCopy() const -{ - return Vector{*this, true}; -} - -template -GPUd() T& Vector::operator[](const size_t index) const -{ - return mArrayPtr[index]; -} - -template -GPUh() T Vector::getElementFromDevice(const size_t index) const -{ - T element; - utils::gpuMemcpyDeviceToHost(&element, mArrayPtr + index, sizeof(T)); - - return element; -} - -template -GPUd() size_t Vector::size() const -{ - return *mDeviceSizePtr; -} - -template -template -GPUd() void Vector::emplace(const size_t index, Args&&... arguments) -{ - new (mArrayPtr + index) T(std::forward(arguments)...); -} - -template -GPUhd() void Vector::dump() -{ - printf("mArrayPtr = %p\nmDeviceSize = %p\nmCapacity = %d\nmIsWeak = %s\n", - mArrayPtr, mDeviceSizePtr, mCapacity, mIsWeak ? "true" : "false"); -} -} // namespace gpu -} // namespace its -} // namespace o2 - -#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 531cf2b0dcd33..56f6e57e2f305 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -17,16 +17,13 @@ message(STATUS "Building ITS CUDA tracker") # add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu - Context.cu Stream.cu TrackerTraitsGPU.cxx TimeFrameGPU.cu - TimeFrameChunk.cu TracerGPU.cu TrackingKernels.cu VertexingKernels.cu VertexerTraitsGPU.cxx - Utils.cu PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::ITStracking O2::SimConfig diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Context.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Context.cu deleted file mode 100644 index f3bced9463020..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Context.cu +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -#include -#include "ITStrackingGPU/Context.h" -#include "ITStrackingGPU/Utils.h" - -#include -#include -#include - -namespace o2 -{ -namespace its -{ -namespace gpu -{ - -using utils::checkGPUError; - -Context::Context(bool dumpDevices) -{ - checkGPUError(cudaGetDeviceCount(&mDevicesNum), __FILE__, __LINE__); - - if (mDevicesNum == 0) { - throw std::runtime_error{"There are no available GPU device(s)\n"}; - } - - mDeviceProperties.resize(mDevicesNum, DeviceProperties{}); - - int currentDeviceIndex; - checkGPUError(cudaGetDevice(¤tDeviceIndex), __FILE__, __LINE__); - - for (int iDevice{0}; iDevice < mDevicesNum; ++iDevice) { - - cudaDeviceProp deviceProperties; - - checkGPUError(cudaSetDevice(iDevice), __FILE__, __LINE__); - checkGPUError(cudaGetDeviceProperties(&deviceProperties, iDevice), __FILE__, __LINE__); - - int major = deviceProperties.major; - int minor = deviceProperties.minor; - - mDeviceProperties[iDevice].name = deviceProperties.name; - mDeviceProperties[iDevice].gpuProcessors = deviceProperties.multiProcessorCount; - mDeviceProperties[iDevice].gpuCores = getGPUCores(major, minor) * deviceProperties.multiProcessorCount; - mDeviceProperties[iDevice].globalMemorySize = deviceProperties.totalGlobalMem; - mDeviceProperties[iDevice].constantMemorySize = deviceProperties.totalConstMem; - mDeviceProperties[iDevice].sharedMemorySize = deviceProperties.sharedMemPerBlock; - mDeviceProperties[iDevice].maxClockRate = deviceProperties.memoryClockRate; - mDeviceProperties[iDevice].busWidth = deviceProperties.memoryBusWidth; - mDeviceProperties[iDevice].l2CacheSize = deviceProperties.l2CacheSize; - mDeviceProperties[iDevice].registersPerBlock = deviceProperties.regsPerBlock; - mDeviceProperties[iDevice].warpSize = deviceProperties.warpSize; - mDeviceProperties[iDevice].maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - mDeviceProperties[iDevice].maxBlocksPerSM = getGPUMaxThreadsPerComputingUnit(); - mDeviceProperties[iDevice].maxThreadsDim = dim3{static_cast(deviceProperties.maxThreadsDim[0]), - static_cast(deviceProperties.maxThreadsDim[1]), - static_cast(deviceProperties.maxThreadsDim[2])}; - mDeviceProperties[iDevice].maxGridDim = dim3{static_cast(deviceProperties.maxGridSize[0]), - static_cast(deviceProperties.maxGridSize[1]), - static_cast(deviceProperties.maxGridSize[2])}; - if (dumpDevices) { - std::cout << "################ " << GPU_ARCH << " DEVICE " << iDevice << " ################" << std::endl; - std::cout << "Name " << mDeviceProperties[iDevice].name << std::endl; - std::cout << "minor " << minor << " major " << major << std::endl; - std::cout << "gpuProcessors " << mDeviceProperties[iDevice].gpuProcessors << std::endl; - std::cout << "gpuCores " << mDeviceProperties[iDevice].gpuCores << std::endl; - std::cout << "globalMemorySize " << mDeviceProperties[iDevice].globalMemorySize << std::endl; - std::cout << "constantMemorySize " << mDeviceProperties[iDevice].constantMemorySize << std::endl; - std::cout << "sharedMemorySize " << mDeviceProperties[iDevice].sharedMemorySize << std::endl; - std::cout << "maxClockRate " << mDeviceProperties[iDevice].maxClockRate << std::endl; - std::cout << "busWidth " << mDeviceProperties[iDevice].busWidth << std::endl; - std::cout << "l2CacheSize " << mDeviceProperties[iDevice].l2CacheSize << std::endl; - std::cout << "registersPerBlock " << mDeviceProperties[iDevice].registersPerBlock << std::endl; - std::cout << "warpSize " << mDeviceProperties[iDevice].warpSize << std::endl; - std::cout << "maxThreadsPerBlock " << mDeviceProperties[iDevice].maxThreadsPerBlock << std::endl; - std::cout << "maxBlocksPerSM " << mDeviceProperties[iDevice].maxBlocksPerSM << std::endl; - std::cout << "maxThreadsDim " << mDeviceProperties[iDevice].maxThreadsDim.x << ", " - << mDeviceProperties[iDevice].maxThreadsDim.y << ", " - << mDeviceProperties[iDevice].maxThreadsDim.z << std::endl; - std::cout << "maxGridDim " << mDeviceProperties[iDevice].maxGridDim.x << ", " - << mDeviceProperties[iDevice].maxGridDim.y << ", " - << mDeviceProperties[iDevice].maxGridDim.z << std::endl; - std::cout << std::endl; - } - } - - checkGPUError(cudaSetDevice(currentDeviceIndex), __FILE__, __LINE__); -} - -Context& Context::getInstance() -{ - static Context gpuContext; - return gpuContext; -} - -const DeviceProperties& Context::getDeviceProperties() -{ - int currentDeviceIndex; - checkGPUError(cudaGetDevice(¤tDeviceIndex), __FILE__, __LINE__); - - return getDeviceProperties(currentDeviceIndex); -} - -const DeviceProperties& Context::getDeviceProperties(const int deviceIndex) -{ - return mDeviceProperties[deviceIndex]; -} - -} // namespace gpu -} // namespace its -} // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu index 885587d8d4544..34bc3dc68a7a4 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Stream.cu @@ -10,10 +10,8 @@ // or submit itself to any jurisdiction. /// -#include #include "ITStrackingGPU/Stream.h" -#include "ITStrackingGPU/Utils.h" -#include "GPUCommonLogger.h" +#include "GPUCommonHelpers.h" namespace o2 { @@ -21,16 +19,15 @@ namespace its { namespace gpu { -using utils::checkGPUError; Stream::Stream() { - checkGPUError(cudaStreamCreate(&mStream)); + GPUChkErrS(cudaStreamCreate(&mStream)); } + Stream::~Stream() { - LOGP(info, "Destroying stream"); - checkGPUError(cudaStreamDestroy(mStream)); + GPUChkErrS(cudaStreamDestroy(mStream)); } const GPUStream& Stream::get() const diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu index 8353b6ff0aa8b..c8512e667aea8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu @@ -26,6 +26,7 @@ #include "GPUCommonDef.h" #include "GPUCommonMath.h" #include "GPUCommonLogger.h" +#include "GPUCommonHelpers.h" #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda @@ -39,38 +40,37 @@ using constants::GB; using constants::MB; namespace gpu { -using utils::checkGPUError; template GpuTimeFrameChunk::~GpuTimeFrameChunk() { if (mAllocated) { for (int i = 0; i < nLayers; ++i) { - checkGPUError(cudaFree(mClustersDevice[i])); - // checkGPUError(cudaFree(mTrackingFrameInfoDevice[i])); - checkGPUError(cudaFree(mClusterExternalIndicesDevice[i])); - checkGPUError(cudaFree(mIndexTablesDevice[i])); + GPUChkErrS(cudaFree(mClustersDevice[i])); + // GPUChkErrS(cudaFree(mTrackingFrameInfoDevice[i])); + GPUChkErrS(cudaFree(mClusterExternalIndicesDevice[i])); + GPUChkErrS(cudaFree(mIndexTablesDevice[i])); if (i < nLayers - 1) { - checkGPUError(cudaFree(mTrackletsDevice[i])); - checkGPUError(cudaFree(mTrackletsLookupTablesDevice[i])); + GPUChkErrS(cudaFree(mTrackletsDevice[i])); + GPUChkErrS(cudaFree(mTrackletsLookupTablesDevice[i])); if (i < nLayers - 2) { - checkGPUError(cudaFree(mCellsDevice[i])); - checkGPUError(cudaFree(mCellsLookupTablesDevice[i])); - checkGPUError(cudaFree(mRoadsLookupTablesDevice[i])); + GPUChkErrS(cudaFree(mCellsDevice[i])); + GPUChkErrS(cudaFree(mCellsLookupTablesDevice[i])); + GPUChkErrS(cudaFree(mRoadsLookupTablesDevice[i])); if (i < nLayers - 3) { - checkGPUError(cudaFree(mNeighboursCellLookupTablesDevice[i])); - checkGPUError(cudaFree(mNeighboursCellDevice[i])); + GPUChkErrS(cudaFree(mNeighboursCellLookupTablesDevice[i])); + GPUChkErrS(cudaFree(mNeighboursCellDevice[i])); } } } } - // checkGPUError(cudaFree(mRoadsDevice)); - checkGPUError(cudaFree(mCUBTmpBufferDevice)); - checkGPUError(cudaFree(mFoundTrackletsDevice)); - checkGPUError(cudaFree(mNFoundCellsDevice)); - checkGPUError(cudaFree(mCellsDeviceArray)); - checkGPUError(cudaFree(mNeighboursCellDeviceArray)); - checkGPUError(cudaFree(mNeighboursCellLookupTablesDeviceArray)); + // GPUChkErrS(cudaFree(mRoadsDevice)); + GPUChkErrS(cudaFree(mCUBTmpBufferDevice)); + GPUChkErrS(cudaFree(mFoundTrackletsDevice)); + GPUChkErrS(cudaFree(mNFoundCellsDevice)); + GPUChkErrS(cudaFree(mCellsDeviceArray)); + GPUChkErrS(cudaFree(mNeighboursCellDeviceArray)); + GPUChkErrS(cudaFree(mNeighboursCellLookupTablesDeviceArray)); } } @@ -117,9 +117,9 @@ void GpuTimeFrameChunk::allocate(const size_t nrof, Stream& stream) // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNeighboursCellLookupTablesDeviceArray), (nLayers - 3) * sizeof(int*), &stream, true); // /// Copy pointers of allocated memory to regrouping arrays - // checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, stream.get())); - // checkGPUError(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); - // checkGPUError(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); + // GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, stream.get())); + // GPUChkErrS(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); + // GPUChkErrS(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); mAllocated = true; } @@ -133,28 +133,28 @@ void GpuTimeFrameChunk::reset(const Task task, Stream& stream) // auto thrustTrackletsBegin = thrust::device_ptr(mTrackletsDevice[i]); // auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof; // thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{}); - // checkGPUError(cudaMemsetAsync(mNTrackletsPerClusterDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); // } - // checkGPUError(cudaMemsetAsync(mUsedTrackletsDevice, false, sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); - // checkGPUError(cudaMemsetAsync(mClusteredLinesDevice, -1, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mUsedTrackletsDevice, false, sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mClusteredLinesDevice, -1, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * mNRof, stream.get())); // } else { // for (int i = 0; i < nLayers; ++i) { // if (i < nLayers - 1) { - // checkGPUError(cudaMemsetAsync(mTrackletsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mTrackletsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); // auto thrustTrackletsBegin = thrust::device_ptr(mTrackletsDevice[i]); // auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof; // thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{}); // if (i < nLayers - 2) { - // checkGPUError(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get())); - // checkGPUError(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); // if (i < nLayers - 3) { - // checkGPUError(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); - // checkGPUError(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); + // GPUChkErrS(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); // } // } // } // } - // checkGPUError(cudaMemsetAsync(mNFoundCellsDevice, 0, (nLayers - 2) * sizeof(int), stream.get())); + // GPUChkErrS(cudaMemsetAsync(mNFoundCellsDevice, 0, (nLayers - 2) * sizeof(int), stream.get())); // } } @@ -275,12 +275,12 @@ size_t GpuTimeFrameChunk::loadDataOnDevice(const size_t startRof, const // if (mHostClusters[i].size() > mTFGPUParams->clustersPerROfCapacity * nRofs) { // LOGP(warning, "Clusters on layer {} exceed the expected value, resizing to config value: {}, will lose information!", i, mTFGPUParams->clustersPerROfCapacity * nRofs); // } - // checkGPUError(cudaMemcpyAsync(mClustersDevice[i], + // GPUChkErrS(cudaMemcpyAsync(mClustersDevice[i], // mHostClusters[i].data(), // (int)std::min(mHostClusters[i].size(), mTFGPUParams->clustersPerROfCapacity * nRofs) * sizeof(Cluster), // cudaMemcpyHostToDevice, stream.get())); // if (mHostIndexTables[i].data()) { - // checkGPUError(cudaMemcpyAsync(mIndexTablesDevice[i], + // GPUChkErrS(cudaMemcpyAsync(mIndexTablesDevice[i], // mHostIndexTables[i].data(), // mHostIndexTables[i].size() * sizeof(int), // cudaMemcpyHostToDevice, stream.get())); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index b1aa55f533c34..55d523dcc30d0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -26,23 +26,24 @@ #include "GPUCommonDef.h" #include "GPUCommonMath.h" #include "GPUCommonLogger.h" +#include "GPUCommonHelpers.h" #ifdef ITS_MEASURE_GPU_TIME -#define START_GPU_STREAM_TIMER(stream, name) \ - cudaEvent_t event_start, event_stop; \ - checkGPUError(cudaEventCreate(&event_start)); \ - checkGPUError(cudaEventCreate(&event_stop)); \ - checkGPUError(cudaEventRecord(event_start, stream)); \ +#define START_GPU_STREAM_TIMER(stream, name) \ + cudaEvent_t event_start, event_stop; \ + GPUChkErrS(cudaEventCreate(&event_start)); \ + GPUChkErrS(cudaEventCreate(&event_stop)); \ + GPUChkErrS(cudaEventRecord(event_start, stream)); \ const std::string task_name = name; #define STOP_GPU_STREAM_TIMER(stream) \ - checkGPUError(cudaEventRecord(event_stop, stream)); \ - checkGPUError(cudaEventSynchronize(event_stop)); \ + GPUChkErrS(cudaEventRecord(event_stop, stream)); \ + GPUChkErrS(cudaEventSynchronize(event_stop)); \ float ms; \ - checkGPUError(cudaEventElapsedTime(&ms, event_start, event_stop)); \ + GPUChkErrS(cudaEventElapsedTime(&ms, event_start, event_stop)); \ std::cout << "Elapsed time for " << task_name << ": " << ms << " ms" << std::endl; \ - checkGPUError(cudaEventDestroy(event_start)); \ - checkGPUError(cudaEventDestroy(event_stop)); + GPUChkErrS(cudaEventDestroy(event_start)); \ + GPUChkErrS(cudaEventDestroy(event_stop)); #else #define START_GPU_STREAM_TIMER(stream, name) #define STOP_GPU_STREAM_TIMER(stream) @@ -57,7 +58,6 @@ using constants::MB; namespace gpu { -using utils::checkGPUError; void* DefaultGPUAllocator::allocate(size_t size) { @@ -69,7 +69,6 @@ template TimeFrameGPU::TimeFrameGPU() { mIsGPU = true; - utils::getDeviceProp(0, true); } template @@ -82,7 +81,7 @@ void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream* strPt *ptr = mAllocator->allocate(size); } else { LOGP(debug, "Calling default CUDA allocator"); - checkGPUError(cudaMallocAsync(reinterpret_cast(ptr), size, strPtr->get())); + GPUChkErrS(cudaMallocAsync(reinterpret_cast(ptr), size, strPtr->get())); } } @@ -101,7 +100,7 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator()); } LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); - checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -113,12 +112,12 @@ void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", mUnsortedClusters[iLayer].size(), iLayer, mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB); allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -131,12 +130,12 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", mClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(Cluster) / MB); allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), mClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -149,10 +148,10 @@ void TimeFrameGPU::loadClustersIndexTables(const int iteration) for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -165,10 +164,10 @@ void TimeFrameGPU::createUsedClustersDevice(const int iteration) for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator()); - checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -179,7 +178,7 @@ void TimeFrameGPU::loadUsedClustersDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB); - checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -192,10 +191,10 @@ void TimeFrameGPU::loadROframeClustersDevice(const int iteration) for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -208,12 +207,12 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB); allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -225,7 +224,7 @@ void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -237,10 +236,10 @@ void TimeFrameGPU::loadVertices(const int iteration) START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -254,11 +253,11 @@ void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); } - checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); } if (!iteration) { allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -269,13 +268,13 @@ void TimeFrameGPU::createTrackletsBuffers() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { mNTracklets[iLayer] = 0; - checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); } allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -285,8 +284,8 @@ void TimeFrameGPU::loadTrackletsDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB); - checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -297,11 +296,11 @@ void TimeFrameGPU::loadTrackletsLUTDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer + 1, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); - checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); + GPUChkErrS(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); } - checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); + GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -311,12 +310,12 @@ void TimeFrameGPU::createNeighboursIndexTablesDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours"); // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -327,7 +326,7 @@ void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const uns START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc - checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -339,11 +338,11 @@ void TimeFrameGPU::loadCellsDevice() LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator()); allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mCells[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding. - checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); - checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -354,10 +353,10 @@ void TimeFrameGPU::createCellsLUTDevice() for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -366,7 +365,7 @@ void TimeFrameGPU::createCellsBuffers(const int layer) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); mNCells[layer] = 0; - checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); + GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator()); @@ -379,8 +378,8 @@ void TimeFrameGPU::loadCellsLUTDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB); - checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -390,8 +389,8 @@ void TimeFrameGPU::loadRoadsDevice() { LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", mRoads.size(), mRoads.size() * sizeof(Road) / MB); allocMemAsync(reinterpret_cast(&mRoadsDevice), mRoads.size() * sizeof(Road), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaHostRegister(mRoads.data(), mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mRoadsDevice, mRoads.data(), mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mRoads.data(), mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, mRoads.data(), mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -400,8 +399,8 @@ void TimeFrameGPU::loadTrackSeedsDevice(std::vector& seeds) START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds"); LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -411,7 +410,7 @@ void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, con START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), &(mGpuStreams[0]), getExtAllocator()); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); @@ -425,7 +424,7 @@ void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, std mCellsNeighbours[layer].resize(neighbours.size()); LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator()); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); @@ -436,7 +435,7 @@ void TimeFrameGPU::createNeighboursDeviceArray() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -448,8 +447,8 @@ void TimeFrameGPU::createTrackITSExtDevice(std::vector& seeds mTrackITSExt.resize(seeds.size()); LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB); allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); - checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -460,7 +459,7 @@ void TimeFrameGPU::downloadCellsDevice() for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); mCells[iLayer].resize(mNCells[iLayer]); - checkGPUError(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -472,7 +471,7 @@ void TimeFrameGPU::downloadCellsLUTDevice() for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1)); mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1); - checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -483,7 +482,7 @@ void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector) / MB); // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :) - checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } template @@ -491,7 +490,7 @@ void TimeFrameGPU::downloadNeighboursLUTDevice(std::vector& lut, c { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer)); LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB); - checkGPUError(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -500,9 +499,9 @@ void TimeFrameGPU::downloadTrackITSExtDevice(std::vector& see { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks"); LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB); - checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); - checkGPUError(cudaHostUnregister(mTrackITSExt.data())); - checkGPUError(cudaHostUnregister(seeds.data())); + GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaHostUnregister(mTrackITSExt.data())); + GPUChkErrS(cudaHostUnregister(seeds.data())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -511,8 +510,8 @@ void TimeFrameGPU::unregisterRest() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory"); LOGP(debug, "unregistering rest of the host memory..."); - checkGPUError(cudaHostUnregister(mCellsDevice.data())); - checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); + GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); + GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -520,13 +519,13 @@ template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - checkGPUError(cudaHostUnregister(mUnsortedClusters[iLayer].data())); - checkGPUError(cudaHostUnregister(mClusters[iLayer].data())); - checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data())); + GPUChkErrS(cudaHostUnregister(mUnsortedClusters[iLayer].data())); + GPUChkErrS(cudaHostUnregister(mClusters[iLayer].data())); + GPUChkErrS(cudaHostUnregister(mTrackingFrameInfo[iLayer].data())); } - checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data())); - checkGPUError(cudaHostUnregister(mUnsortedClustersDevice.data())); - checkGPUError(cudaHostUnregister(mClustersDevice.data())); + GPUChkErrS(cudaHostUnregister(mTrackingFrameInfoDevice.data())); + GPUChkErrS(cudaHostUnregister(mUnsortedClustersDevice.data())); + GPUChkErrS(cudaHostUnregister(mClustersDevice.data())); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 583452d0c429c..805e66675e1b9 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -36,6 +36,9 @@ #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" +#include "ITStrackingGPU/Utils.h" + +#include "GPUCommonHelpers.h" #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda @@ -51,20 +54,6 @@ #include "DetectorsBase/Propagator.h" using namespace o2::track; -#define gpuCheckError(x) \ - { \ - gpuAssert((x), __FILE__, __LINE__); \ - } -inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) -{ - if (code != cudaSuccess) { - LOGF(error, "GPUassert: %s %s %d", cudaGetErrorString(code), file, line); - if (abort) { - throw std::runtime_error("GPU assert failed."); - } - } -} - namespace o2::its { using namespace constants::its2; @@ -873,20 +862,20 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, mulScatAng[iLayer]); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - gpuCheckError(cudaFree(d_temp_storage)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaFree(d_temp_storage)); } } @@ -955,24 +944,24 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); nTracklets[iLayer] = unique_end - tracklets_ptr; if (iLayer > 0) { - gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + GPUChkErrS(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - gpuCheckError(cudaFree(d_temp_storage)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaFree(d_temp_storage)); } } } @@ -1011,20 +1000,20 @@ void countCellsHandler( nSigmaCut); // const float void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - cellsLUTsHost, // d_in - cellsLUTsHost, // d_out - nTracklets + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - cellsLUTsHost, // d_in - cellsLUTsHost, // d_out - nTracklets + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - gpuCheckError(cudaFree(d_temp_storage)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + cellsLUTsHost, // d_in + cellsLUTsHost, // d_out + nTracklets + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + cellsLUTsHost, // d_in + cellsLUTsHost, // d_out + nTracklets + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaFree(d_temp_storage)); } void computeCellsHandler( @@ -1089,37 +1078,37 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; - gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - neighboursLUT, // d_in - neighboursLUT, // d_out - nCellsNext)); // num_items - - discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - neighboursLUT, // d_in - neighboursLUT, // d_out - nCellsNext)); // num_items - - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage - temp_storage_bytes_2, // temp_storage_bytes - neighboursIndexTable, // d_in - neighboursIndexTable, // d_out - nCells + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - - discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2)); - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage - temp_storage_bytes_2, // temp_storage_bytes - neighboursIndexTable, // d_in - neighboursIndexTable, // d_out - nCells + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + neighboursLUT, // d_in + neighboursLUT, // d_out + nCellsNext)); // num_items + + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + neighboursLUT, // d_in + neighboursLUT, // d_out + nCellsNext)); // num_items + + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + temp_storage_bytes_2, // temp_storage_bytes + neighboursIndexTable, // d_in + neighboursIndexTable, // d_out + nCells + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + + GPUChkErrS(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + temp_storage_bytes_2, // temp_storage_bytes + neighboursIndexTable, // d_in + neighboursIndexTable, // d_out + nCells + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer unsigned int nNeighbours; - gpuCheckError(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)); - gpuCheckError(cudaFree(d_temp_storage)); - gpuCheckError(cudaFree(d_temp_storage_2)); + GPUChkErrS(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)); + GPUChkErrS(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFree(d_temp_storage_2)); return nNeighbours; } @@ -1150,8 +1139,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + GPUChkErrS(cudaPeekAtLastError()); + GPUChkErrS(cudaDeviceSynchronize()); } int filterCellNeighboursHandler(std::vector& neighHost, // TODO: eventually remove this! @@ -1172,12 +1161,12 @@ int filterCellNeighboursHandler(std::vector& neighHost, // TODO: eventually thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()), thrust::make_permutation_iterator(neighVectorPairs, vals.end()), sortedNeigh.begin()); - discardResult(cudaDeviceSynchronize()); + GPUChkErrS(cudaDeviceSynchronize()); auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair()); // trim leading -1s auto trimmedSize = sortedNeigh.end() - trimmedBegin; neighHost.resize(trimmedSize); thrust::transform(trimmedBegin, sortedNeigh.end(), validNeighs, gpu::pair_to_first()); - gpuCheckError(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); + GPUChkErrS(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); return trimmedSize; } @@ -1225,19 +1214,19 @@ void processNeighboursHandler(const int startLayer, matCorrType); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - gpuCheckError(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[startLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[startLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCells[startLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCells[startLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer updatedCellId.resize(foundSeedsTable.back()); updatedCellSeed.resize(foundSeedsTable.back()); @@ -1260,7 +1249,7 @@ void processNeighboursHandler(const int startLayer, propagator, matCorrType); auto t1 = updatedCellSeed.size(); - gpuCheckError(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFree(d_temp_storage)); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { temp_storage_bytes = 0; @@ -1289,19 +1278,19 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpuCheckError(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCells[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCells[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); thrust::fill(updatedCellId.begin(), updatedCellId.end(), 0); @@ -1325,7 +1314,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpuCheckError(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFree(d_temp_storage)); } thrust::device_vector outSeeds(updatedCellSeed.size()); auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); @@ -1365,8 +1354,8 @@ void trackSeedHandler(CellSeed* trackSeeds, thrust::device_ptr tr_ptr(tracks); thrust::sort(tr_ptr, tr_ptr + nSeeds, gpu::compare_track_chi2()); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + GPUChkErrS(cudaPeekAtLastError()); + GPUChkErrS(cudaDeviceSynchronize()); } template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Utils.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Utils.cu deleted file mode 100644 index 99a24f347bd48..0000000000000 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/Utils.cu +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -#include -#include "ITStrackingGPU/Utils.h" -#include "ITStrackingGPU/Context.h" -#include "ITStracking/Constants.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace -{ -int roundUp(const int numToRound, const int multiple) -{ - if (multiple == 0) { - return numToRound; - } - - int remainder{numToRound % multiple}; - if (remainder == 0) { - return numToRound; - } - return numToRound + multiple - remainder; -} - -int findNearestDivisor(const int numToRound, const int divisor) -{ - - if (numToRound > divisor) { - return divisor; - } - - int result = numToRound; - while (divisor % result != 0) { - ++result; - } - return result; -} - -} // namespace - -namespace o2 -{ -namespace its -{ -using constants::GB; -namespace gpu -{ -GPUh() void gpuThrowOnError() -{ - cudaError_t error = cudaGetLastError(); - - if (error != cudaSuccess) { - std::ostringstream errorString{}; - errorString << GPU_ARCH << " API returned error [" << cudaGetErrorString(error) << "] (code " << error << ")" << std::endl; - throw std::runtime_error{errorString.str()}; - } -} - -double bytesToconfig(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / GB; } - -void utils::checkGPUError(const cudaError_t error, const char* file, const int line) -{ - if (error != cudaSuccess) { - std::ostringstream errorString{}; - errorString << file << ":" << line << std::endl - << GPU_ARCH << " API returned error [" << cudaGetErrorString(error) << "] (code " - << error << ")" << std::endl; - throw std::runtime_error{errorString.str()}; - } -} - -void utils::getDeviceProp(int deviceId, bool print) -{ - const int w1 = 34; - std::cout << std::left; - std::cout << std::setw(w1) - << "--------------------------------------------------------------------------------" - << std::endl; - std::cout << std::setw(w1) << "device#" << deviceId << std::endl; - - cudaDeviceProp props; - checkGPUError(cudaGetDeviceProperties(&props, deviceId)); - if (print) { - std::cout << std::setw(w1) << "Name: " << props.name << std::endl; - std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl; - std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl; - std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl; - std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl; - std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor - << std::endl; - std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl; - std::cout << std::setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl; - std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" - << std::endl; - std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl; - std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 - << " Mhz" << std::endl; - std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2) - << bytesToGB(props.totalGlobalMem) << " GB" << std::endl; -#if !defined(__CUDACC__) - std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) - << bytesToconfig(props.sharedMemPerMultiprocessor) << " config" << std::endl; -#endif -#if defined(__HIPCC__) - std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) - << bytesToconfig(props.maxSharedMemoryPerMultiProcessor) << " config" << std::endl; -#endif - std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl; - std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " config" - << std::endl; - std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl; - std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl; - std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl; - std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl; - std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl; - std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl; - std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl; - std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl; - std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl; - std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl; - std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl; - std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl; - std::cout << std::setw(w1) << "major: " << props.major << std::endl; - std::cout << std::setw(w1) << "minor: " << props.minor << std::endl; - std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl; - std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl; - std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl; -#if defined(__HIPCC__) - std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl; - std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch - << std::endl; - std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl; - std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch - << std::endl; - std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl; - std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl; - std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl; - std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl; - std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl; - std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl; - std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl; - std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl; - std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl; - std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl; - std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl; - std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl; - std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl; - std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl; -#endif - std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl; - std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl; - std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl; - std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl; - std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl; - std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl; - std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl; -#if defined(__HIPCC__) - std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl; - std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl; -#endif - - int deviceCnt; - checkGPUError(cudaGetDeviceCount(&deviceCnt)); - std::cout << std::setw(w1) << "peers: "; - for (int i = 0; i < deviceCnt; i++) { - int isPeer; - checkGPUError(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); - if (isPeer) { - std::cout << "device#" << i << " "; - } - } - std::cout << std::endl; - std::cout << std::setw(w1) << "non-peers: "; - for (int i = 0; i < deviceCnt; i++) { - int isPeer; - checkGPUError(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); - if (!isPeer) { - std::cout << "device#" << i << " "; - } - } - std::cout << std::endl; - - size_t free, total; - checkGPUError(cudaMemGetInfo(&free, &total)); - - std::cout << std::fixed << std::setprecision(2); - std::cout << std::setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << std::endl; - std::cout << std::setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << std::setprecision(0) - << (float)free / total * 100.0 << "%)" << std::endl; - } -} - -dim3 utils::getBlockSize(const int colsNum) -{ - return getBlockSize(colsNum, 1); -} - -dim3 utils::getBlockSize(const int colsNum, const int rowsNum) -{ - const DeviceProperties& deviceProperties = Context::getInstance().getDeviceProperties(); - return getBlockSize(colsNum, rowsNum, deviceProperties.gpuCores / deviceProperties.maxBlocksPerSM); -} - -dim3 utils::getBlockSize(const int colsNum, const int rowsNum, const int maxThreadsPerBlock) -{ - const DeviceProperties& deviceProperties = Context::getInstance().getDeviceProperties(); - int xThreads = max(min(colsNum, deviceProperties.maxThreadsDim.x), 1); - int yThreads = max(min(rowsNum, deviceProperties.maxThreadsDim.y), 1); - const int totalThreads = roundUp(min(xThreads * yThreads, maxThreadsPerBlock), - deviceProperties.warpSize); - - if (xThreads > yThreads) { - - xThreads = findNearestDivisor(xThreads, totalThreads); - yThreads = totalThreads / xThreads; - - } else { - - yThreads = findNearestDivisor(yThreads, totalThreads); - xThreads = totalThreads / yThreads; - } - - return dim3{static_cast(xThreads), static_cast(yThreads)}; -} - -dim3 utils::getBlocksGrid(const dim3& threadsPerBlock, const int rowsNum) -{ - return getBlocksGrid(threadsPerBlock, rowsNum, 1); -} - -dim3 utils::getBlocksGrid(const dim3& threadsPerBlock, const int rowsNum, const int colsNum) -{ - - return dim3{1 + (rowsNum - 1) / threadsPerBlock.x, 1 + (colsNum - 1) / threadsPerBlock.y}; -} - -void utils::gpuMalloc(void** p, const int size) -{ - checkGPUError(cudaMalloc(p, size), __FILE__, __LINE__); -} - -void utils::gpuFree(void* p) -{ - checkGPUError(cudaFree(p), __FILE__, __LINE__); -} - -void utils::gpuMemset(void* p, int value, int size) -{ - checkGPUError(cudaMemset(p, value, size), __FILE__, __LINE__); -} - -void utils::gpuMemcpyHostToDevice(void* dst, const void* src, int size) -{ - checkGPUError(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice), __FILE__, __LINE__); -} - -void utils::gpuMemcpyDeviceToHost(void* dst, const void* src, int size) -{ - checkGPUError(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost), __FILE__, __LINE__); -} - -void utils::gpuMemcpyToSymbol(const void* symbol, const void* src, int size) -{ - checkGPUError(cudaMemcpyToSymbol(symbol, src, size, 0, cudaMemcpyHostToDevice), __FILE__, __LINE__); -} - -void utils::gpuMemcpyFromSymbol(void* dst, const void* symbol, int size) -{ - checkGPUError(cudaMemcpyFromSymbol(dst, symbol, size, 0, cudaMemcpyDeviceToHost), __FILE__, __LINE__); -} -} // namespace gpu -} // namespace its -} // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx index a26d52b2961c3..f630d2cdec76a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx @@ -54,162 +54,162 @@ void VertexerTraitsGPU::computeTracklets(const int iteration) if (!mTimeFrameGPU->getClusters().size()) { return; } - std::vector threads(mTimeFrameGPU->getNChunks()); - for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int rofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()}; - // mTimeFrameGPU->getVerticesInChunks()[chunkId].clear(); - // mTimeFrameGPU->getNVerticesInChunks()[chunkId].clear(); - // mTimeFrameGPU->getLabelsInChunks()[chunkId].clear(); - // auto doVertexReconstruction = [&, chunkId, rofPerChunk]() -> void { - // auto offset = chunkId * rofPerChunk; - // auto maxROF = offset + rofPerChunk; - // while (offset < maxROF) { - // auto rofs = mTimeFrameGPU->loadChunkData(chunkId, offset, maxROF); - // RANGE("chunk_gpu_vertexing", 1); - // // gpu::GpuTimer timer{offset, mTimeFrameGPU->getStream(chunkId).get()}; - // // timer.Start("vtTrackletFinder"); - // gpu::trackleterKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clustersNextLayer, // 0 2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clustersCurrentLayer, // 1 1 - // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeNextLClusters, - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeCurrentLClusters, - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0), // const int* nextIndexTables, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* Tracklets, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // mVrtParams.phiCut, // const float phiCut, - // mVrtParams.maxTrackletsPerCluster); // const size_t maxTrackletsPerCluster = 1e2 - - // gpu::trackleterKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2), // const Cluster* clustersNextLayer, // 0 2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clustersCurrentLayer, // 1 1 - // mTimeFrameGPU->getDeviceROframesClusters(2), // const int* sizeNextLClusters, - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeCurrentLClusters, - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2), // const int* nextIndexTables, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* Tracklets, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // mVrtParams.phiCut, // const float phiCut, - // mVrtParams.maxTrackletsPerCluster); // const size_t maxTrackletsPerCluster = 1e2 - - // gpu::trackletSelectionKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clusters0, // Clusters on layer 0 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clusters1, // Clusters on layer 1 - // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeClustersL0, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeClustersL1, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* tracklets01, // Tracklets on layer 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* tracklets12, // Tracklets on layer 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // const int* nFoundTracklet12, // Number of tracklets found on layers 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), // unsigned char* usedTracklets, // Used tracklets - // mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), // Line* lines, // Lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), // int* nFoundLines, // Number of found lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines, // Number of found lines exclusive scan - // offset, // const unsigned int startRofId, // Starting ROF ID - // rofs, // const unsigned int rofSize, // Number of ROFs to consider - // mVrtParams.maxTrackletsPerCluster, // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster - // mVrtParams.tanLambdaCut, // const float tanLambdaCut = 0.025f, // Cut on tan lambda - // mVrtParams.phiCut); // const float phiCut = 0.002f) // Cut on phi - - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), - // mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1), - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Reset used tracklets - // checkGPUError(cudaMemsetAsync(mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), - // false, - // sizeof(unsigned char) * mVrtParams.maxTrackletsPerCluster * mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1), - // mTimeFrameGPU->getStream(chunkId).get()), - // __FILE__, __LINE__); - - // gpu::trackletSelectionKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clusters0, // Clusters on layer 0 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clusters1, // Clusters on layer 1 - // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeClustersL0, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeClustersL1, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* tracklets01, // Tracklets on layer 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* tracklets12, // Tracklets on layer 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // const int* nFoundTracklet12, // Number of tracklets found on layers 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), // unsigned char* usedTracklets, // Used tracklets - // mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), // Line* lines, // Lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), // int* nFoundLines, // Number of found lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines, // Number of found lines exclusive scan - // offset, // const unsigned int startRofId, // Starting ROF ID - // rofs, // const unsigned int rofSize, // Number of ROFs to consider - // mVrtParams.maxTrackletsPerCluster, // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster - // mVrtParams.tanLambdaCut, // const float tanLambdaCut = 0.025f, // Cut on tan lambda - // mVrtParams.phiCut); // const float phiCut = 0.002f) // Cut on phi - - // int nClusters = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1); - // int lastFoundLines; - // std::vector exclusiveFoundLinesHost(nClusters + 1); - - // // Obtain whole exclusive sum including nCluster+1 element (nCluster+1)th element is the total number of found lines. - // checkGPUError(cudaMemcpyAsync(exclusiveFoundLinesHost.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), (nClusters) * sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // checkGPUError(cudaMemcpyAsync(&lastFoundLines, mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines() + nClusters - 1, sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // exclusiveFoundLinesHost[nClusters] = exclusiveFoundLinesHost[nClusters - 1] + lastFoundLines; - - // std::vector lines(exclusiveFoundLinesHost[nClusters]); - - // checkGPUError(cudaMemcpyAsync(lines.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), sizeof(Line) * lines.size(), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // checkGPUError(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get())); - - // // Compute vertices - // std::vector clusterLines; - // std::vector usedLines; - // for (int rofId{0}; rofId < rofs; ++rofId) { - // auto rof = offset + rofId; - // auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF - // auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof]; // number of clusters for this ROF - // auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof]; // starting line offset for this ROF - // auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof; - // gsl::span linesInRof(lines.data() + linesOffsetRof, static_cast::size_type>(nLinesRof)); - - // usedLines.resize(linesInRof.size(), false); - // usedLines.assign(linesInRof.size(), false); - // clusterLines.clear(); - // clusterLines.reserve(nClustersL1Rof); - // computeVerticesInRof(rof, - // linesInRof, - // usedLines, - // clusterLines, - // mTimeFrameGPU->getBeamXY(), - // mTimeFrameGPU->getVerticesInChunks()[chunkId], - // mTimeFrameGPU->getNVerticesInChunks()[chunkId], - // mTimeFrameGPU, - // mTimeFrameGPU->hasMCinformation() ? &mTimeFrameGPU->getLabelsInChunks()[chunkId] : nullptr); - // } - // offset += rofs; - // } - // }; - // // Do work - // threads[chunkId] = std::thread(doVertexReconstruction); - // } - // for (auto& thread : threads) { - // thread.join(); - // } - // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int start{0}; - // for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) { - // gsl::span rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])}; - // mTimeFrameGPU->addPrimaryVertices(rofVerts); - // if (mTimeFrameGPU->hasMCinformation()) { - // mTimeFrameGPU->getVerticesLabels().emplace_back(); - // // TODO: add MC labels - // } - // start += mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId]; - // } - // } - // mTimeFrameGPU->wipe(3); - } + // std::vector threads(mTimeFrameGPU->getNChunks()); + // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { + // int rofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()}; + // mTimeFrameGPU->getVerticesInChunks()[chunkId].clear(); + // mTimeFrameGPU->getNVerticesInChunks()[chunkId].clear(); + // mTimeFrameGPU->getLabelsInChunks()[chunkId].clear(); + // auto doVertexReconstruction = [&, chunkId, rofPerChunk]() -> void { + // auto offset = chunkId * rofPerChunk; + // auto maxROF = offset + rofPerChunk; + // while (offset < maxROF) { + // auto rofs = mTimeFrameGPU->loadChunkData(chunkId, offset, maxROF); + // RANGE("chunk_gpu_vertexing", 1); + // // gpu::GpuTimer timer{offset, mTimeFrameGPU->getStream(chunkId).get()}; + // // timer.Start("vtTrackletFinder"); + // gpu::trackleterKernelMultipleRof<<getStream(chunkId).get()>>>( + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clustersNextLayer, // 0 2 + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clustersCurrentLayer, // 1 1 + // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeNextLClusters, + // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeCurrentLClusters, + // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0), // const int* nextIndexTables, + // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* Tracklets, + // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets, + // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils, + // offset, // const unsigned int startRofId, + // rofs, // const unsigned int rofSize, + // mVrtParams.phiCut, // const float phiCut, + // mVrtParams.maxTrackletsPerCluster); // const size_t maxTrackletsPerCluster = 1e2 + + // gpu::trackleterKernelMultipleRof<<getStream(chunkId).get()>>>( + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2), // const Cluster* clustersNextLayer, // 0 2 + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clustersCurrentLayer, // 1 1 + // mTimeFrameGPU->getDeviceROframesClusters(2), // const int* sizeNextLClusters, + // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeCurrentLClusters, + // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2), // const int* nextIndexTables, + // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* Tracklets, + // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets, + // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils, + // offset, // const unsigned int startRofId, + // rofs, // const unsigned int rofSize, + // mVrtParams.phiCut, // const float phiCut, + // mVrtParams.maxTrackletsPerCluster); // const size_t maxTrackletsPerCluster = 1e2 + + // gpu::trackletSelectionKernelMultipleRof<<getStream(chunkId).get()>>>( + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clusters0, // Clusters on layer 0 + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clusters1, // Clusters on layer 1 + // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeClustersL0, // Number of clusters on layer 0 per ROF + // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeClustersL1, // Number of clusters on layer 1 per ROF + // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* tracklets01, // Tracklets on layer 0-1 + // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* tracklets12, // Tracklets on layer 1-2 + // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 + // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // const int* nFoundTracklet12, // Number of tracklets found on layers 1-2 + // mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), // unsigned char* usedTracklets, // Used tracklets + // mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), // Line* lines, // Lines + // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), // int* nFoundLines, // Number of found lines + // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines, // Number of found lines exclusive scan + // offset, // const unsigned int startRofId, // Starting ROF ID + // rofs, // const unsigned int rofSize, // Number of ROFs to consider + // mVrtParams.maxTrackletsPerCluster, // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster + // mVrtParams.tanLambdaCut, // const float tanLambdaCut = 0.025f, // Cut on tan lambda + // mVrtParams.phiCut); // const float phiCut = 0.002f) // Cut on phi + + // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), + // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, + // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), + // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), + // mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1), + // mTimeFrameGPU->getStream(chunkId).get())); + + // // Reset used tracklets + // checkGPUError(cudaMemsetAsync(mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), + // false, + // sizeof(unsigned char) * mVrtParams.maxTrackletsPerCluster * mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1), + // mTimeFrameGPU->getStream(chunkId).get()), + // __FILE__, __LINE__); + + // gpu::trackletSelectionKernelMultipleRof<<getStream(chunkId).get()>>>( + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clusters0, // Clusters on layer 0 + // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clusters1, // Clusters on layer 1 + // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeClustersL0, // Number of clusters on layer 0 per ROF + // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeClustersL1, // Number of clusters on layer 1 per ROF + // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* tracklets01, // Tracklets on layer 0-1 + // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* tracklets12, // Tracklets on layer 1-2 + // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 + // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // const int* nFoundTracklet12, // Number of tracklets found on layers 1-2 + // mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), // unsigned char* usedTracklets, // Used tracklets + // mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), // Line* lines, // Lines + // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), // int* nFoundLines, // Number of found lines + // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines, // Number of found lines exclusive scan + // offset, // const unsigned int startRofId, // Starting ROF ID + // rofs, // const unsigned int rofSize, // Number of ROFs to consider + // mVrtParams.maxTrackletsPerCluster, // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster + // mVrtParams.tanLambdaCut, // const float tanLambdaCut = 0.025f, // Cut on tan lambda + // mVrtParams.phiCut); // const float phiCut = 0.002f) // Cut on phi + + // int nClusters = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1); + // int lastFoundLines; + // std::vector exclusiveFoundLinesHost(nClusters + 1); + + // // Obtain whole exclusive sum including nCluster+1 element (nCluster+1)th element is the total number of found lines. + // checkGPUError(cudaMemcpyAsync(exclusiveFoundLinesHost.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), (nClusters) * sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); + // checkGPUError(cudaMemcpyAsync(&lastFoundLines, mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines() + nClusters - 1, sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); + // exclusiveFoundLinesHost[nClusters] = exclusiveFoundLinesHost[nClusters - 1] + lastFoundLines; + + // std::vector lines(exclusiveFoundLinesHost[nClusters]); + + // checkGPUError(cudaMemcpyAsync(lines.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), sizeof(Line) * lines.size(), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); + // checkGPUError(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get())); + + // // Compute vertices + // std::vector clusterLines; + // std::vector usedLines; + // for (int rofId{0}; rofId < rofs; ++rofId) { + // auto rof = offset + rofId; + // auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF + // auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof]; // number of clusters for this ROF + // auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof]; // starting line offset for this ROF + // auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof; + // gsl::span linesInRof(lines.data() + linesOffsetRof, static_cast::size_type>(nLinesRof)); + + // usedLines.resize(linesInRof.size(), false); + // usedLines.assign(linesInRof.size(), false); + // clusterLines.clear(); + // clusterLines.reserve(nClustersL1Rof); + // computeVerticesInRof(rof, + // linesInRof, + // usedLines, + // clusterLines, + // mTimeFrameGPU->getBeamXY(), + // mTimeFrameGPU->getVerticesInChunks()[chunkId], + // mTimeFrameGPU->getNVerticesInChunks()[chunkId], + // mTimeFrameGPU, + // mTimeFrameGPU->hasMCinformation() ? &mTimeFrameGPU->getLabelsInChunks()[chunkId] : nullptr); + // } + // offset += rofs; + // } + // }; + // // Do work + // threads[chunkId] = std::thread(doVertexReconstruction); + // } + // for (auto& thread : threads) { + // thread.join(); + // } + // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { + // int start{0}; + // for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) { + // gsl::span rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])}; + // mTimeFrameGPU->addPrimaryVertices(rofVerts); + // if (mTimeFrameGPU->hasMCinformation()) { + // mTimeFrameGPU->getVerticesLabels().emplace_back(); + // // TODO: add MC labels + // } + // start += mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId]; + // } + // } + // mTimeFrameGPU->wipe(3); + // } } void VertexerTraitsGPU::computeTrackletMatching(const int iteration) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu index 2ba4471ef61e5..3aab0624ef556 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu @@ -14,6 +14,7 @@ #include #include "ITStrackingGPU/VertexingKernels.h" +#include "GPUCommonHelpers.h" namespace o2 { @@ -21,7 +22,6 @@ namespace its { using constants::its::VertexerHistogramVolume; using constants::math::TwoPi; -using gpu::utils::checkGPUError; using math_utils::getNormalizedPhi; using namespace constants::its2; @@ -84,17 +84,6 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde getPhiBinIndex(phiRangeMax)}; } -GPUh() void gpuThrowOnError() -{ - cudaError_t error = cudaGetLastError(); - - if (error != cudaSuccess) { - std::ostringstream errorString{}; - errorString << GPU_ARCH << " API returned error [" << cudaGetErrorString(error) << "] (code " << error << ")" << std::endl; - throw std::runtime_error{errorString.str()}; - } -} - template GPUd() void printOnThread(const unsigned int tId, const char* str, Args... args) { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt index 0b686273a159a..8015f0b20e862 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt @@ -14,16 +14,13 @@ if(HIP_ENABLED) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fgpu-rdc") o2_add_hipified_library(ITStrackingHIP SOURCES ../cuda/ClusterLinesGPU.cu - ../cuda/Context.cu ../cuda/TimeFrameGPU.cu - ../cuda/TimeFrameChunk.cu ../cuda/Stream.cu ../cuda/TrackerTraitsGPU.cxx ../cuda/TracerGPU.cu ../cuda/TrackingKernels.cu ../cuda/VertexingKernels.cu ../cuda/VertexerTraitsGPU.cxx - ../cuda/Utils.cu PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::ITStracking O2::GPUTracking diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h index 6324b03cb8ca6..da02149fbc432 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h @@ -23,8 +23,10 @@ #include "ITStracking/Definitions.h" #include "CommonConstants/MathConstants.h" + #include "GPUCommonMath.h" #include "GPUCommonDef.h" +#include "GPUCommonArray.h" namespace o2 { @@ -52,9 +54,9 @@ constexpr int ClustersPerCell{3}; constexpr int UnusedIndex{-1}; constexpr float Resolution{0.0005f}; -GPUhdi() constexpr GPUArray VertexerHistogramVolume() +GPUhdi() constexpr o2::gpu::gpustd::array VertexerHistogramVolume() { - return GPUArray{{1.98, 1.98, 40.f}}; + return o2::gpu::gpustd::array{{1.98, 1.98, 40.f}}; } } // namespace its @@ -64,27 +66,29 @@ constexpr int LayersNumber{7}; constexpr int TrackletsPerRoad{LayersNumber - 1}; constexpr int CellsPerRoad{LayersNumber - 2}; -GPUhdi() constexpr GPUArray LayersZCoordinate() +GPUhdi() constexpr o2::gpu::gpustd::array LayersZCoordinate() { constexpr double s = 1.; // safety margin - return GPUArray{{16.333f + s, 16.333f + s, 16.333f + s, 42.140f + s, 42.140f + s, 73.745f + s, 73.745f + s}}; + return o2::gpu::gpustd::array{16.333f + s, 16.333f + s, 16.333f + s, 42.140f + s, 42.140f + s, 73.745f + s, 73.745f + s}; } -GPUhdi() constexpr GPUArray LayersRCoordinate() + +GPUhdi() constexpr o2::gpu::gpustd::array LayersRCoordinate() { - return GPUArray{{2.33959f, 3.14076f, 3.91924f, 19.6213f, 24.5597f, 34.388f, 39.3329f}}; + return o2::gpu::gpustd::array{{2.33959f, 3.14076f, 3.91924f, 19.6213f, 24.5597f, 34.388f, 39.3329f}}; } constexpr int ZBins{256}; constexpr int PhiBins{128}; constexpr float InversePhiBinSize{PhiBins / constants::math::TwoPi}; -GPUhdi() constexpr GPUArray InverseZBinSize() +GPUhdi() constexpr o2::gpu::gpustd::array InverseZBinSize() { constexpr auto zSize = LayersZCoordinate(); - return GPUArray{{0.5f * ZBins / (zSize[0]), 0.5f * ZBins / (zSize[1]), 0.5f * ZBins / (zSize[2]), - 0.5f * ZBins / (zSize[3]), 0.5f * ZBins / (zSize[4]), 0.5f * ZBins / (zSize[5]), - 0.5f * ZBins / (zSize[6])}}; + return o2::gpu::gpustd::array{0.5f * ZBins / (zSize[0]), 0.5f * ZBins / (zSize[1]), 0.5f * ZBins / (zSize[2]), + 0.5f * ZBins / (zSize[3]), 0.5f * ZBins / (zSize[4]), 0.5f * ZBins / (zSize[5]), + 0.5f * ZBins / (zSize[6])}; } -inline float getInverseZCoordinate(const int layerIndex) + +GPUhdi() constexpr float getInverseZCoordinate(const int layerIndex) { return 0.5f * ZBins / LayersZCoordinate()[layerIndex]; } @@ -115,7 +119,7 @@ namespace pdgcodes constexpr int PionCode{211}; } } // namespace constants -#ifndef __OPENCL__ /// FIXME: this is for compatibility with OCL +#ifndef GPUCA_GPUCODE_DEVICE typedef std::vector> index_table_t; #endif } // namespace its diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h index a1d2fa338ba63..59d0e59eb0637 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h @@ -15,15 +15,6 @@ #ifndef TRACKINGITS_DEFINITIONS_H_ #define TRACKINGITS_DEFINITIONS_H_ -// #define CA_DEBUG -// #define VTX_DEBUG -#define __USE_GPU_TRACER__ - -template -void discardResult(const T&) -{ -} - #ifndef GPUCA_GPUCODE_DEVICE #include #endif @@ -36,103 +27,17 @@ void discardResult(const T&) } while (0) #endif -#if defined(__CUDA_ARCH__) // ???? -#define TRACKINGITSU_GPU_DEVICE -#endif - #if defined(__CUDACC__) || defined(__HIPCC__) -#define MATH_CEIL ceil - -#ifndef GPUCA_GPUCODE_DEVICE -#include -#endif -#include "../GPU/ITStrackingGPU/Array.h" - -template -using GPUArray = o2::its::gpu::Array; - #ifdef __CUDACC__ -#define GPU_ARCH "CUDA" - +#include typedef cudaStream_t GPUStream; -inline int getGPUCores(const int major, const int minor) -{ - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct - { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = - { - {0x20, 32}, // Fermi Generation (SM 2.0) GF100 class - {0x21, 48}, // Fermi Generation (SM 2.1) GF10x class - {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class - {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class - {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class - {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class - {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class - {0x70, 64}, // Volta Generation (SM 7.0) GV100 class - {0x72, 64}, // Volta Generation (SM 7.2) GV10B class - {0x75, 64}, // Turing Generation (SM 7.5) TU1xx class - {-1, -1}}; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; - } - - index++; - } - - // If we don't find the values, we default use the previous one to run properly - return nGpuArchCoresPerSM[index - 1].Cores; -} -inline int getGPUMaxThreadsPerComputingUnit() -{ - return 8; -} - #else // __HIPCC__ -#define GPU_ARCH "HIP" +#include typedef hipStream_t GPUStream; -inline int getGPUCores(const int major, const int minor) -{ - // Hardcoded result for AMD RADEON WX 9100, to be decided if and how determine this paramter - return 4096; -} - -inline int getGPUMaxThreadsPerComputingUnit() -{ - return 8; -} #endif - #else -#define MATH_CEIL std::ceil -#ifndef __VECTOR_TYPES_H__ -#include "GPUCommonDef.h" -#endif -#ifndef __OPENCL__ -#include -template -using GPUArray = std::array; -#else -#include "../GPU/ITStrackingGPU/Array.h" -template -using GPUArray = o2::its::gpu::Array; -#endif - -typedef struct _dummyStream { +typedef struct __dummyStream { } GPUStream; #endif -#endif +#endif \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx b/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx index 52119048b0ed8..630ad9acf59d2 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Cluster.cxx @@ -17,6 +17,8 @@ #include "ITStracking/MathUtils.h" #include "ITStracking/IndexTableUtils.h" +#include "GPUCommonArray.h" + namespace o2 { namespace its @@ -90,8 +92,8 @@ bool Cluster::operator==(const Cluster& rhs) const this->indexTableBinIndex == rhs.indexTableBinIndex; } -TrackingFrameInfo::TrackingFrameInfo(float x, float y, float z, float xTF, float alpha, GPUArray&& posTF, - GPUArray&& covTF) +TrackingFrameInfo::TrackingFrameInfo(float x, float y, float z, float xTF, float alpha, o2::gpu::gpustd::array&& posTF, + o2::gpu::gpustd::array&& covTF) : xCoordinate{x}, yCoordinate{y}, zCoordinate{z}, xTrackingFrame{xTF}, alphaTrackingFrame{alpha}, positionTrackingFrame{posTF}, covarianceTrackingFrame{covTF} { // Nothing to do