Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@
#include "DetectorsBase/Propagator.h"
#include "GPUCommonDef.h"

namespace o2::its
namespace o2
{
namespace its
{
class CellSeed;
class ExternalAllocator;
namespace gpu
{
#ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler
Expand Down Expand Up @@ -178,7 +181,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,

int filterCellNeighboursHandler(gpuPair<int, int>*,
int*,
unsigned int);
unsigned int,
o2::its::ExternalAllocator* = nullptr);

template <int nLayers = 7>
void processNeighboursHandler(const int startLayer,
Expand All @@ -191,6 +195,7 @@ void processNeighboursHandler(const int startLayer,
gsl::span<int*> neighboursDeviceLUTs,
const TrackingFrameInfo** foundTrackingFrameInfo,
bounded_vector<CellSeed>& seedsHost,
o2::its::ExternalAllocator*,
const float bz,
const float MaxChi2ClusterAttachment,
const float maxChi2NDF,
Expand All @@ -212,5 +217,6 @@ void trackSeedHandler(CellSeed* trackSeeds,
const o2::base::PropagatorF::MatCorrType matCorrType,
const int nBlocks,
const int nThreads);
} // namespace o2::its
} // namespace its
} // namespace o2
#endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_
2 changes: 1 addition & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
if(CUDA_ENABLED)
find_package(CUDAToolkit)
message(STATUS "Building ITS CUDA tracker")
# add_compile_options(-O0 -g -lineinfo -fPIC)
add_compile_options(-O0 -g -lineinfo -fPIC)
# add_compile_definitions(ITS_MEASURE_GPU_TIME)
o2_add_library(ITStrackingCUDA
SOURCES ClusterLinesGPU.cu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "ITStrackingGPU/TrackerTraitsGPU.h"
#include "ITStrackingGPU/TrackingKernels.h"
#include "ITStracking/TrackingConfigParam.h"

namespace o2::its
{
constexpr int UnusedIndex{-1};
Expand Down Expand Up @@ -209,7 +210,8 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)

filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighbours(iLayer),
nNeigh);
nNeigh,
mTimeFrameGPU->getExternalAllocator());
}
mTimeFrameGPU->createNeighboursDeviceArray();
mTimeFrameGPU->unregisterRest();
Expand All @@ -236,6 +238,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
mTimeFrameGPU->getDeviceNeighboursLUTs(),
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
trackSeeds,
mTimeFrameGPU->getExternalAllocator(),
this->mBz,
this->mTrkParams[0].MaxChi2ClusterAttachment,
this->mTrkParams[0].MaxChi2NDF,
Expand Down
60 changes: 47 additions & 13 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,14 @@
#include "ITStracking/Constants.h"
#include "ITStracking/IndexTableUtils.h"
#include "ITStracking/MathUtils.h"
#include "ITStracking/ExternalAllocator.h"
#include "DataFormatsITS/TrackITS.h"
#include "ReconstructionDataFormats/Vertex.h"

#include "ITStrackingGPU/TrackerTraitsGPU.h"
#include "ITStrackingGPU/TrackingKernels.h"
#include "ITStrackingGPU/Utils.h"

#include "GPUCommonHelpers.h"

#ifndef __HIPCC__
#define THRUST_NAMESPACE thrust::cuda
#else
Expand Down Expand Up @@ -64,6 +63,37 @@ GPUdii() float Sq(float v)
namespace gpu
{

template <typename T>
class TypedAllocator : public thrust::device_allocator<T>
{
public:
using value_type = T;
using pointer = T*;

template <typename U>
struct rebind {
using other = TypedAllocator<U>;
};

explicit TypedAllocator(ExternalAllocator* allocPtr)
: mInternalAllocator(allocPtr) {}

T* allocate(size_t n)
{
return reinterpret_cast<T*>(mInternalAllocator->allocate(n * sizeof(T)));
}

void deallocate(T* p, size_t n)
{
char* raw_ptr = reinterpret_cast<char*>(p);
size_t bytes = n * sizeof(T);
mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op.
}

private:
ExternalAllocator* mInternalAllocator;
};

GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
const o2::its::IndexTableUtils& utils,
const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
Expand Down Expand Up @@ -1117,7 +1147,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,

int filterCellNeighboursHandler(gpuPair<int, int>* cellNeighbourPairs,
int* cellNeighbours,
unsigned int nNeigh)
unsigned int nNeigh,
o2::its::ExternalAllocator* allocator)
{
thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
thrust::device_ptr<int> validNeighs(cellNeighbours);
Expand All @@ -1140,6 +1171,7 @@ void processNeighboursHandler(const int startLayer,
gsl::span<int*> neighboursDeviceLUTs,
const TrackingFrameInfo** foundTrackingFrameInfo,
bounded_vector<CellSeed>& seedsHost,
o2::its::ExternalAllocator* allocator,
const float bz,
const float maxChi2ClusterAttachment,
const float maxChi2NDF,
Expand All @@ -1148,8 +1180,10 @@ void processNeighboursHandler(const int startLayer,
const int nBlocks,
const int nThreads)
{
thrust::device_vector<int> foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
// TODO: fix this.
auto allocInt = gpu::TypedAllocator<int>(allocator);
auto allocCellSeed = gpu::TypedAllocator<CellSeed>(allocator);
thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
// TODO: fix this.

gpu::processNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
Expand All @@ -1172,8 +1206,8 @@ void processNeighboursHandler(const int startLayer,
matCorrType);
gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1);

thrust::device_vector<int> updatedCellId(foundSeedsTable.back());
thrust::device_vector<CellSeed> updatedCellSeed(foundSeedsTable.back());
thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt);
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed);
gpu::processNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
startLayer,
Expand All @@ -1195,13 +1229,13 @@ void processNeighboursHandler(const int startLayer,
matCorrType);

int level = startLevel;
thrust::device_vector<int> lastCellId;
thrust::device_vector<CellSeed> lastCellSeed;
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt);
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> lastCellSeed(allocCellSeed);
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
lastCellSeed.swap(updatedCellSeed);
lastCellId.swap(updatedCellId);
thrust::device_vector<CellSeed>().swap(updatedCellSeed);
thrust::device_vector<int>().swap(updatedCellId);
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>>(allocCellSeed).swap(updatedCellSeed);
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(updatedCellId);
auto lastCellSeedSize{lastCellSeed.size()};
foundSeedsTable.resize(lastCellSeedSize + 1);
thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
Expand Down Expand Up @@ -1253,8 +1287,7 @@ void processNeighboursHandler(const int startLayer,
propagator,
matCorrType);
}

thrust::device_vector<CellSeed> outSeeds(updatedCellSeed.size());
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> outSeeds(updatedCellSeed.size(), allocCellSeed);
auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
auto s{end - outSeeds.begin()};
seedsHost.reserve(seedsHost.size() + s);
Expand Down Expand Up @@ -1367,6 +1400,7 @@ template void processNeighboursHandler<7>(const int startLayer,
gsl::span<int*> neighboursDeviceLUTs,
const TrackingFrameInfo** foundTrackingFrameInfo,
bounded_vector<CellSeed>& seedsHost,
o2::its::ExternalAllocator*,
const float bz,
const float maxChi2ClusterAttachment,
const float maxChi2NDF,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ class ExternalAllocator
{
public:
virtual void* allocate(size_t) = 0;
virtual void deallocate(char*, size_t) = 0;
};

} // namespace o2::its

#endif
29 changes: 17 additions & 12 deletions Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,23 @@ struct TimeFrame {
void setBz(float bz) { mBz = bz; }
float getBz() const { return mBz; }

virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*) { return; }
void setExternalAllocator(ExternalAllocator* allocator)
{
if (mIsGPU) {
LOGP(debug, "Setting timeFrame allocator to external");
mAllocator = allocator;
mExtAllocator = true; // to be removed
} else {
LOGP(debug, "External allocator is currently only supported for GPU");
}
}

ExternalAllocator* getExternalAllocator() { return mAllocator; }

virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*)
{
return;
};
const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }

template <typename... T>
Expand Down Expand Up @@ -277,17 +293,6 @@ struct TimeFrame {
// State if memory will be externally managed.
bool mExtAllocator = false;
ExternalAllocator* mAllocator = nullptr;
void setExternalAllocator(ExternalAllocator* allocator)
{
if (mIsGPU) {
LOGP(debug, "Setting timeFrame allocator to external");
mAllocator = allocator;
mExtAllocator = true; // to be removed
} else {
LOGP(fatal, "External allocator is currently only supported for GPU");
}
}
void setExtAllocator(bool ext) { mExtAllocator = ext; }
bool getExtAllocator() const { return mExtAllocator; }

std::array<bounded_vector<Cluster>, nLayers> mUnsortedClusters;
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Global/GPUChainITS.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
{
return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);
}

void deallocate(char* ptr, size_t) {}
void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }

private:
Expand Down