From 14a816a1212282f604299b1d84c89d4cc9059857 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 19 Dec 2024 18:24:56 +0100 Subject: [PATCH] ITS::gpu: Update track selection logics to the state of the art (#13816) Add processNeighbours GPU kernel and handler Update Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt Fix second iteration Move the whole processNeighbours on GPU --- .../TrackParametrization.h | 4 +- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 2 + .../GPU/ITStrackingGPU/TrackingKernels.h | 10 +- .../ITS/tracking/GPU/cuda/CMakeLists.txt | 2 +- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 63 +---- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 253 ++++++++++-------- 6 files changed, 157 insertions(+), 177 deletions(-) diff --git a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrization.h b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrization.h index a988c96168170..a51ec3b7010a7 100644 --- a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrization.h +++ b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrization.h @@ -160,7 +160,7 @@ class TrackParametrization GPUd() value_t getZ() const; GPUd() value_t getSnp() const; GPUd() value_t getTgl() const; - GPUd() value_t getQ2Pt() const; + GPUhd() value_t getQ2Pt() const; GPUd() value_t getCharge2Pt() const; GPUd() int getAbsCharge() const; GPUd() PID getPID() const; @@ -357,7 +357,7 @@ GPUdi() auto TrackParametrization::getTgl() const -> value_t //____________________________________________________________ template -GPUdi() auto TrackParametrization::getQ2Pt() const -> value_t +GPUhdi() auto TrackParametrization::getQ2Pt() const -> value_t { return mP[kQ2Pt]; } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 4ac22607a580b..066bef7631415 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -116,6 +116,7 @@ class TimeFrameGPU : public TimeFrame int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; } gsl::span getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; } gpuPair* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; } + std::array& getDeviceNeighboursAll() { return mNeighboursDevice; } int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); @@ -142,6 +143,7 @@ class TimeFrameGPU : public TimeFrame // Host-specific getters gsl::span getNTracklets() { return mNTracklets; } gsl::span getNCells() { return mNCells; } + std::array& getArrayNCells() { return mNCells; } // Host-available device getters gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index f50a11a83805f..78636d00788bf 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -186,19 +186,17 @@ void processNeighboursHandler(const int startLayer, const int startLevel, CellSeed** allCellSeeds, CellSeed* currentCellSeeds, - const unsigned int nCurrentCells, + std::array& nCells, const unsigned char** usedClusters, - int* neighbours, + std::array& neighbours, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, + std::vector& seedsHost, const float bz, const float MaxChi2ClusterAttachment, + const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, - const std::vector& lastCellIdHost, // temporary host vector - const std::vector& lastCellSeedHost, // temporary host vector - std::vector& updatedCellIdHost, // temporary host vector - std::vector& updatedCellSeedHost, // temporary host vector const int nBlocks, const int nThreads); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 3cdb107e07438..e2fc1f1388ad0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) +add_compile_options(-O0 -g -lineinfo -fPIC) # add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 395aab3a470ac..4821ebb636f54 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -205,9 +205,6 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) conf.nBlocks, conf.nThreads); } - // Needed for processNeighbours() which is still on CPU. - mTimeFrameGPU->downloadCellsDevice(); - mTimeFrameGPU->downloadCellsLUTDevice(); } template @@ -221,11 +218,11 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); - if (mTimeFrameGPU->getCells()[iLayer + 1].empty() || - mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) { - mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); - continue; - } + // if (mTimeFrameGPU->getCells()[iLayer + 1].empty() || + // mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) { + // mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); + // continue; + // } mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), @@ -267,7 +264,6 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) cellsNeighboursLayer[iLayer].size()); } mTimeFrameGPU->createNeighboursDeviceArray(); - mTimeFrameGPU->downloadCellsDevice(); mTimeFrameGPU->unregisterRest(); }; @@ -289,55 +285,21 @@ void TrackerTraitsGPU::findRoads(const int iteration) startLevel, mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceCells()[startLayer], - mTimeFrameGPU->getNCells()[startLayer], + mTimeFrameGPU->getArrayNCells(), mTimeFrameGPU->getDeviceArrayUsedClusters(), - mTimeFrameGPU->getDeviceNeighbours(startLayer - 1), + mTimeFrameGPU->getDeviceNeighboursAll(), mTimeFrameGPU->getDeviceNeighboursLUTs(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), + trackSeeds, mBz, - mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment + mTrkParams[0].MaxChi2ClusterAttachment, + mTrkParams[0].MaxChi2NDF, mTimeFrameGPU->getDevicePropagator(), mCorrType, - lastCellId, // temporary host vector - lastCellSeed, // temporary host vector - updatedCellId, // temporary host vectors - updatedCellSeed, // temporary host vectors conf.nBlocks, conf.nThreads); - - int level = startLevel; - for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { - lastCellSeed.swap(updatedCellSeed); - lastCellId.swap(updatedCellId); - std::vector().swap(updatedCellSeed); /// tame the memory peaks - updatedCellId.clear(); - processNeighboursHandler(iLayer, - --level, - mTimeFrameGPU->getDeviceArrayCells(), - mTimeFrameGPU->getDeviceCells()[iLayer], - mTimeFrameGPU->getNCells()[iLayer], - mTimeFrameGPU->getDeviceArrayUsedClusters(), - mTimeFrameGPU->getDeviceNeighbours(iLayer - 1), - mTimeFrameGPU->getDeviceNeighboursLUTs(), - mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), - mBz, - mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment - mTimeFrameGPU->getDevicePropagator(), - mCorrType, - lastCellId, // temporary host vector - lastCellSeed, // temporary host vector - updatedCellId, // temporary host vectors - updatedCellSeed, // temporary host vectors - conf.nBlocks, - conf.nThreads); - } - for (auto& seed : updatedCellSeed) { - if (seed.getQ2Pt() > 1.e3 || seed.getChi2() > mTrkParams[0].MaxChi2NDF * ((startLevel + 2) * 2 - 5)) { - continue; - } - trackSeeds.push_back(seed); - } } + // fixme: I don't want to move tracks back and forth, but I need a way to use a thrust::allocator that is aware of our managed memory. if (!trackSeeds.size()) { LOGP(info, "No track seeds found, skipping track finding"); continue; @@ -362,9 +324,6 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds); auto& tracks = mTimeFrameGPU->getTrackITSExt(); - std::sort(tracks.begin(), tracks.end(), [](const TrackITSExt& a, const TrackITSExt& b) { - return a.getChi2() < b.getChi2(); - }); for (auto& track : tracks) { if (!track.getChi2()) { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 19edef6c40346..10459cf800b6c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -232,6 +232,24 @@ struct is_valid_pair { } }; +struct seed_selector { + float maxQ2Pt; + float maxChi2; + + GPUhd() seed_selector(float maxQ2Pt, float maxChi2) : maxQ2Pt(maxQ2Pt), maxChi2(maxChi2) {} + GPUhd() bool operator()(const CellSeed& seed) const + { + return !(seed.getQ2Pt() > maxQ2Pt || seed.getChi2() > maxChi2); + } +}; + +struct compare_track_chi2 { + GPUhd() bool operator()(const TrackITSExt& a, const TrackITSExt& b) const + { + return a.getChi2() < b.getChi2(); + } +}; + GPUd() gpuSpan getPrimaryVertices(const int rof, const int* roframesPV, const int nROF, @@ -596,7 +614,7 @@ GPUg() void processNeighboursKernel(const int layer, int* neighboursLUT, const TrackingFrameInfo** foundTrackingFrameInfo, const float bz, - const float MaxChi2ClusterAttachment, + const float maxChi2ClusterAttachment, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType) { @@ -650,7 +668,7 @@ GPUg() void processNeighboursKernel(const int layer, } auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)}; - if ((predChi2 > MaxChi2ClusterAttachment) || predChi2 < 0.f) { + if ((predChi2 > maxChi2ClusterAttachment) || predChi2 < 0.f) { continue; } seed.setChi2(seed.getChi2() + predChi2); @@ -1172,149 +1190,152 @@ void processNeighboursHandler(const int startLayer, const int startLevel, CellSeed** allCellSeeds, CellSeed* currentCellSeeds, - const unsigned int nCurrentCells, + std::array& nCells, const unsigned char** usedClusters, - int* neighbours, + std::array& neighbours, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, + std::vector& seedsHost, const float bz, - const float MaxChi2ClusterAttachment, + const float maxChi2ClusterAttachment, + const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, - const std::vector& lastCellIdHost, // temporary host vector - const std::vector& lastCellSeedHost, // temporary host vector - std::vector& updatedCellIdHost, // temporary host vector - std::vector& updatedCellSeedHost, // temporary host vector const int nBlocks, const int nThreads) { - thrust::device_vector foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this. - thrust::device_vector lastCellIds(lastCellIdHost); - thrust::device_vector lastCellSeed(lastCellSeedHost); + thrust::device_vector foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this. + // thrust::device_vector lastCellIds(lastCellIdHost); + // thrust::device_vector lastCellSeed(lastCellSeedHost); + thrust::device_vector lastCellId, updatedCellId; + thrust::device_vector lastCellSeed, updatedCellSeed; gpu::processNeighboursKernel<<>>(startLayer, startLevel, allCellSeeds, - lastCellIdHost.empty() ? currentCellSeeds : thrust::raw_pointer_cast(&lastCellSeed[0]), // lastCellSeeds - lastCellIdHost.empty() ? nullptr : thrust::raw_pointer_cast(&lastCellIds[0]), // lastCellIds, - lastCellIdHost.empty() ? nCurrentCells : lastCellSeedHost.size(), - nullptr, // updatedCellSeeds, - nullptr, // updatedCellsIds, - thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration - usedClusters, // Used clusters - neighbours, + currentCellSeeds, + nullptr, + nCells[startLayer], + nullptr, + nullptr, + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[startLayer - 1], neighboursDeviceLUTs[startLayer - 1], foundTrackingFrameInfo, bz, - MaxChi2ClusterAttachment, + maxChi2ClusterAttachment, propagator, matCorrType); - void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; - size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; - gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage temp_storage_bytes, // temp_storage_bytes thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCurrentCells + 1, // num_items - 0)); // NOLINT: failure in clang-tidy + nCells[startLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage temp_storage_bytes, // temp_storage_bytes thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCurrentCells + 1, // num_items - 0)); // NOLINT: failure in clang-tidy + nCells[startLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer - thrust::device_vector updatedCellIds(foundSeedsTable.back()) /*, lastCellIds(foundSeedsTable.back())*/; - thrust::device_vector updatedCellSeeds(foundSeedsTable.back()) /*, lastCellSeeds(foundSeedsTable.back())*/; + updatedCellId.resize(foundSeedsTable.back()); + updatedCellSeed.resize(foundSeedsTable.back()); gpu::processNeighboursKernel<<>>(startLayer, startLevel, allCellSeeds, - lastCellIdHost.empty() ? currentCellSeeds : thrust::raw_pointer_cast(&lastCellSeed[0]), // lastCellSeeds - lastCellIdHost.empty() ? nullptr : thrust::raw_pointer_cast(&lastCellIds[0]), // lastCellIds, - lastCellIdHost.empty() ? nCurrentCells : lastCellSeedHost.size(), - thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds - thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds - thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration - usedClusters, // Used clusters - neighbours, + currentCellSeeds, + nullptr, + nCells[startLayer], + thrust::raw_pointer_cast(&updatedCellSeed[0]), + thrust::raw_pointer_cast(&updatedCellId[0]), + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[startLayer - 1], neighboursDeviceLUTs[startLayer - 1], foundTrackingFrameInfo, bz, - MaxChi2ClusterAttachment, + maxChi2ClusterAttachment, propagator, matCorrType); - - // Temporary copyback to host to validate the kernel - updatedCellIdHost.resize(updatedCellIds.size()); - updatedCellSeedHost.resize(updatedCellSeeds.size()); - thrust::copy(updatedCellIds.begin(), updatedCellIds.end(), updatedCellIdHost.begin()); - thrust::copy(updatedCellSeeds.begin(), updatedCellSeeds.end(), updatedCellSeedHost.begin()); - - // int level = startLevel; - // for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { - // --level; - // lastCellSeeds.swap(updatedCellSeeds); - // lastCellIds.swap(updatedCellIds); - // foundSeedsTable.resize(lastCellSeeds.size() + 1); - // thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); - - // gpu::processNeighboursKernel<<<1, 1>>>(iLayer, - // level, - // allCellSeeds, - // thrust::raw_pointer_cast(&lastCellSeeds[0]), - // thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds, - // lastCellSeeds.size(), - // nullptr, // updatedCellSeeds, - // nullptr, // updatedCellsIds, - // thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration - // usedClusters, // Used clusters - // neighbours, - // neighboursDeviceLUTs[iLayer - 1], - // foundTrackingFrameInfo, - // bz, - // MaxChi2ClusterAttachment, - // propagator, - // matCorrType); - - // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage - // temp_storage_bytes, // temp_storage_bytes - // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - // foundSeedsTable.size(), // num_items - // 0)); - // discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage - // temp_storage_bytes_2, // temp_storage_bytes - // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - // foundSeedsTable.size(), // num_items - // 0)); - // updatedCellIds.resize(foundSeedsTable.back(), 0); - // updatedCellSeeds.resize(foundSeedsTable.back(), CellSeed()); - - // gpu::processNeighboursKernel<<<1, 1>>>(iLayer, - // level, - // allCellSeeds, - // thrust::raw_pointer_cast(&lastCellSeeds[0]), - // thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds, - // lastCellSeeds.size(), - // thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds - // thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds - // thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration - // usedClusters, // Used clusters - // neighbours, - // neighboursDeviceLUTs[iLayer - 1], - // foundTrackingFrameInfo, - // bz, - // MaxChi2ClusterAttachment, - // propagator, - // matCorrType); - // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&updatedCellSeeds[0]), updatedCellSeeds.size()); - // } - + auto t1 = updatedCellSeed.size(); gpuCheckError(cudaFree(d_temp_storage)); - gpuCheckError(cudaFree(d_temp_storage_2)); + int level = startLevel; + for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { + temp_storage_bytes = 0; + lastCellSeed.swap(updatedCellSeed); + lastCellId.swap(updatedCellId); + thrust::device_vector().swap(updatedCellSeed); + thrust::device_vector().swap(updatedCellId); + auto lastCellSeedSize{lastCellSeed.size()}; + foundSeedsTable.resize(nCells[iLayer] + 1); + thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); + --level; + gpu::processNeighboursKernel<<>>(iLayer, + level, + allCellSeeds, + thrust::raw_pointer_cast(&lastCellSeed[0]), + thrust::raw_pointer_cast(&lastCellId[0]), + lastCellSeedSize, + nullptr, + nullptr, + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[iLayer - 1], + neighboursDeviceLUTs[iLayer - 1], + foundTrackingFrameInfo, + bz, + maxChi2ClusterAttachment, + propagator, + matCorrType); + gpuCheckError(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCells[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCells[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + auto foundSeeds{foundSeedsTable.back()}; + updatedCellId.resize(foundSeeds); + thrust::fill(updatedCellId.begin(), updatedCellId.end(), 0); + updatedCellSeed.resize(foundSeeds); + thrust::fill(updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); + + gpu::processNeighboursKernel<<>>(iLayer, + level, + allCellSeeds, + thrust::raw_pointer_cast(&lastCellSeed[0]), + thrust::raw_pointer_cast(&lastCellId[0]), + lastCellSeedSize, + thrust::raw_pointer_cast(&updatedCellSeed[0]), + thrust::raw_pointer_cast(&updatedCellId[0]), + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[iLayer - 1], + neighboursDeviceLUTs[iLayer - 1], + foundTrackingFrameInfo, + bz, + maxChi2ClusterAttachment, + propagator, + matCorrType); + gpuCheckError(cudaFree(d_temp_storage)); + } + thrust::device_vector outSeeds(updatedCellSeed.size()); + auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); + auto s{end - outSeeds.begin()}; + std::vector outSeedsHost(s); + thrust::copy(updatedCellSeed.begin(), updatedCellSeed.begin() + s, outSeedsHost.begin()); + seedsHost.insert(seedsHost.end(), outSeedsHost.begin(), outSeedsHost.end()); } void trackSeedHandler(CellSeed* trackSeeds, @@ -1344,7 +1365,9 @@ void trackSeedHandler(CellSeed* trackSeeds, maxChi2NDF, // float propagator, // const o2::base::Propagator* matCorrType); // o2::base::PropagatorF::MatCorrType + thrust::device_ptr tr_ptr(tracks); + thrust::sort(tr_ptr, tr_ptr + nSeeds, gpu::compare_track_chi2()); gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); } @@ -1414,19 +1437,17 @@ template void processNeighboursHandler<7>(const int startLayer, const int startLevel, CellSeed** allCellSeeds, CellSeed* currentCellSeeds, - const unsigned int nCurrentCells, + std::array& nCells, const unsigned char** usedClusters, - int* neighbours, + std::array& neighbours, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, + std::vector& seedsHost, const float bz, - const float MaxChi2ClusterAttachment, + const float maxChi2ClusterAttachment, + const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, - const std::vector& lastCellIdHost, // temporary host vector - const std::vector& lastCellSeedHost, // temporary host vector - std::vector& updatedCellIdHost, // temporary host vector - std::vector& updatedCellSeedHost, // temporary host vector const int nBlocks, const int nThreads); } // namespace o2::its \ No newline at end of file