From a1efad8577bbaf0004842c408c69e6dc1d8f86df Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 28 Jan 2025 12:58:56 +0100 Subject: [PATCH 1/2] Cleanup --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 3 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 26 +++++----- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 14 +++++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 48 ++++++++----------- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 46 +++++++++--------- 5 files changed, 72 insertions(+), 65 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 066bef7631415..f6282786ccfa0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -77,7 +77,8 @@ class TimeFrameGPU : public TimeFrame void createCellsDevice(); void createCellsLUTDevice(); void createNeighboursIndexTablesDevice(); - void createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours); + void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours); + void createNeighboursDevice(const unsigned int layer, std::vector>& neighbours); void createNeighboursLUTDevice(const int, const unsigned int); void createNeighboursDeviceArray(); void createTrackITSExtDevice(std::vector&); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 78636d00788bf..720867ddaba29 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -148,19 +148,19 @@ void computeCellsHandler(const Cluster** sortedClusters, const int nBlocks, const int nThreads); -void countCellNeighboursHandler(CellSeed** cellsLayersDevice, - int* neighboursLUTs, - int** cellsLUTs, - gpuPair* cellNeighbours, - int* neighboursIndexTable, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const unsigned int nCells, - const unsigned int nCellsNext, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads); +unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUTs, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads); void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int* neighboursLUTs, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index fd067b9930fd0..b1aa55f533c34 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -406,7 +406,19 @@ void TimeFrameGPU::loadTrackSeedsDevice(std::vector& seeds) } template -void TimeFrameGPU::createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours) +void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); + LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); + allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0].get())); + LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), &(mGpuStreams[0]), getExtAllocator()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, std::vector>& neighbours) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); mCellsNeighbours[layer].clear(); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 4821ebb636f54..3c65faddcff71 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -212,36 +212,30 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) { mTimeFrameGPU->createNeighboursIndexTablesDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; - mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); - mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); - // if (mTimeFrameGPU->getCells()[iLayer + 1].empty() || - // mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) { - // mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); - // continue; - // } + if (!nextLayerCellsNum) { + continue; + } mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); - countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), - mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. - mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), - mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), - mTrkParams[0].MaxChi2ClusterAttachment, - mBz, - iLayer, - mTimeFrameGPU->getNCells()[iLayer], - nextLayerCellsNum, - 1e2, - conf.nBlocks, - conf.nThreads); - mTimeFrameGPU->downloadNeighboursLUTDevice(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer); - // Get the number of found cells from LUT - cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back()); - mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]); + unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. + mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), + mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), + mTrkParams[0].MaxChi2ClusterAttachment, + mBz, + iLayer, + mTimeFrameGPU->getNCells()[iLayer], + nextLayerCellsNum, + 1e2, + conf.nBlocks, + conf.nThreads); + + mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh); + computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -255,13 +249,11 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) 1e2, conf.nBlocks, conf.nThreads); - mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); - mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size()); filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer], mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighbours(iLayer), - cellsNeighboursLayer[iLayer].size()); + nNeigh); } mTimeFrameGPU->createNeighboursDeviceArray(); mTimeFrameGPU->unregisterRest(); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 10459cf800b6c..4fa7913c10e82 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -1066,19 +1066,19 @@ void computeCellsHandler( nSigmaCut); // const float } -void countCellNeighboursHandler(CellSeed** cellsLayersDevice, - int* neighboursLUT, - int** cellsLUTs, - gpuPair* cellNeighbours, - int* neighboursIndexTable, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const unsigned int nCells, - const unsigned int nCellsNext, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads) +unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUT, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads) { gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, @@ -1091,8 +1091,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - // gpuCheckError(cudaPeekAtLastError()); - // gpuCheckError(cudaDeviceSynchronize()); + void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage @@ -1102,17 +1101,19 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, nCellsNext)); // num_items discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - neighboursLUT, // d_in - neighboursLUT, // d_out - nCellsNext)); // num_items + gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + neighboursLUT, // d_in + neighboursLUT, // d_out + nCellsNext)); // num_items + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage temp_storage_bytes_2, // temp_storage_bytes neighboursIndexTable, // d_in neighboursIndexTable, // d_out nCells + 1, // num_items 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2)); gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage temp_storage_bytes_2, // temp_storage_bytes @@ -1120,10 +1121,11 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, neighboursIndexTable, // d_out nCells + 1, // num_items 0)); // NOLINT: this is the offset of the sum, not a pointer + unsigned int nNeighbours; + gpuCheckError(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)); gpuCheckError(cudaFree(d_temp_storage)); gpuCheckError(cudaFree(d_temp_storage_2)); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + return nNeighbours; } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, From 77390f9d02c5d9adf1005b356600e0600098d51b Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 28 Jan 2025 13:39:02 +0100 Subject: [PATCH 2/2] Fix nCells printout --- .../ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index f6282786ccfa0..100e49def0d50 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -152,6 +152,9 @@ class TimeFrameGPU : public TimeFrame gsl::span getDeviceTracklet() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } + // Overridden getters + int getNumberOfCells() const; + private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations bool mHostRegistered = false; @@ -253,6 +256,12 @@ inline std::vector TimeFrameGPU::getClusterSizes() return sizes; } +template +inline int TimeFrameGPU::getNumberOfCells() const +{ + return std::accumulate(mNCells.begin(), mNCells.end(), 0); +} + } // namespace gpu } // namespace its } // namespace o2