Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
~TimeFrameGPU() = default;

/// Most relevant operations
void pushMemoryStack(const int);
void popMemoryStack(const int);
void registerHostMemory(const int);
void unregisterHostMemory(const int);
void initialise(const int, const TrackingParameters&, const int, IndexTableUtilsN* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
Expand Down Expand Up @@ -177,8 +179,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
int getNumberOfNeighbours() const final;

private:
void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations on specific stream
void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream
void allocMemAsync(void**, size_t, Stream&, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on specific stream
void allocMem(void**, size_t, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on default stream
TimeFrameGPUParameters mGpuParams;

// Host-available device buffer sizes
Expand Down
2 changes: 1 addition & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ struct TypedAllocator {

pointer allocate(size_type n)
{
void* raw = mInternalAllocator->allocate(n * sizeof(T));
void* raw = mInternalAllocator->allocateStack(n * sizeof(T));
return thrust::device_pointer_cast(static_cast<T*>(raw));
}

Expand Down
129 changes: 74 additions & 55 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Large diffs are not rendered by default.

17 changes: 10 additions & 7 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ template <int nLayers>
void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
{
mTimeFrameGPU->initialise(iteration, this->mTrkParams[iteration], nLayers);

// on default stream
mTimeFrameGPU->loadVertices(iteration);
mTimeFrameGPU->loadIndexTableUtils(iteration);
Expand All @@ -45,6 +44,8 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
mTimeFrameGPU->createTrackletsBuffersArray(iteration);
mTimeFrameGPU->createCellsBuffersArray(iteration);
mTimeFrameGPU->createCellsLUTDeviceArray(iteration);
// push every create artefact on the stack
mTimeFrameGPU->pushMemoryStack(iteration);
}

template <int nLayers>
Expand Down Expand Up @@ -108,7 +109,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
mTimeFrameGPU->getPositionResolutions(),
this->mTrkParams[iteration].LayerRadii,
mTimeFrameGPU->getMSangles(),
mTimeFrameGPU->getExternalDeviceAllocator(),
mTimeFrameGPU->getFrameworkAllocator(),
conf.nBlocksLayerTracklets[iteration],
conf.nThreadsLayerTracklets[iteration],
mTimeFrameGPU->getStreams());
Expand Down Expand Up @@ -146,7 +147,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
mTimeFrameGPU->getPositionResolutions(),
this->mTrkParams[iteration].LayerRadii,
mTimeFrameGPU->getMSangles(),
mTimeFrameGPU->getExternalDeviceAllocator(),
mTimeFrameGPU->getFrameworkAllocator(),
conf.nBlocksLayerTracklets[iteration],
conf.nThreadsLayerTracklets[iteration],
mTimeFrameGPU->getStreams());
Expand Down Expand Up @@ -197,7 +198,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
this->mTrkParams[iteration].MaxChi2ClusterAttachment,
this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
this->mTrkParams[iteration].NSigmaCut,
mTimeFrameGPU->getExternalDeviceAllocator(),
mTimeFrameGPU->getFrameworkAllocator(),
conf.nBlocksLayerCells[iteration],
conf.nThreadsLayerCells[iteration],
mTimeFrameGPU->getStreams());
Expand Down Expand Up @@ -253,7 +254,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
currentLayerCellsNum,
nextLayerCellsNum,
1e2,
mTimeFrameGPU->getExternalDeviceAllocator(),
mTimeFrameGPU->getFrameworkAllocator(),
conf.nBlocksFindNeighbours[iteration],
conf.nThreadsFindNeighbours[iteration],
mTimeFrameGPU->getStream(iLayer));
Expand Down Expand Up @@ -281,7 +282,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
mTimeFrameGPU->getDeviceNeighbours(iLayer),
mTimeFrameGPU->getArrayNNeighbours()[iLayer],
mTimeFrameGPU->getStream(iLayer),
mTimeFrameGPU->getExternalDeviceAllocator());
mTimeFrameGPU->getFrameworkAllocator());
}
mTimeFrameGPU->syncStreams(false);
}
Expand Down Expand Up @@ -312,7 +313,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
this->mTrkParams[0].MaxChi2NDF,
mTimeFrameGPU->getDevicePropagator(),
this->mTrkParams[0].CorrType,
mTimeFrameGPU->getExternalDeviceAllocator(),
mTimeFrameGPU->getFrameworkAllocator(),
conf.nBlocksProcessNeighbours[iteration],
conf.nThreadsProcessNeighbours[iteration]);
}
Expand Down Expand Up @@ -386,6 +387,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
}
mTimeFrameGPU->loadUsedClustersDevice();
}
// wipe the artefact memory
mTimeFrameGPU->popMemoryStack(iteration);
};

template <int nLayers>
Expand Down
17 changes: 0 additions & 17 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -644,23 +644,6 @@ GPUg() void __launch_bounds__(256, 1) processNeighboursKernel(
}
}

GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
{
if (alloc) {
*p = alloc->allocate(bytes);
} else {
GPUChkErrS(cudaMallocAsync(p, bytes, stream));
}
}

GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
{
if (alloc) {
alloc->deallocate(reinterpret_cast<char*>(p), bytes);
} else {
GPUChkErrS(cudaFreeAsync(p, stream));
}
}
} // namespace gpu

template <int nLayers>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,39 @@
#define TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_

#include <memory_resource>
#include "GPUO2ExternalUser.h"
#include "Base/GPUMemoryResource.h"

namespace o2::its
{

class ExternalAllocator
{
using Type = std::underlying_type_t<o2::gpu::GPUMemoryResource::MemoryType>;

public:
virtual void* allocate(size_t) = 0;
virtual void deallocate(char*, size_t) = 0;
virtual void* allocate(size_t) = 0;
void* allocate(size_t s, Type type)
{
auto old = mType;
mType = type;
void* p = allocate(s);
mType = old;
return p;
}
void* allocateStack(size_t s)
{
return allocate(s, (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
}
virtual void pushTagOnStack(uint64_t) = 0;
virtual void popTagOffStack(uint64_t) = 0;

void setType(Type t) noexcept { mType = t; }
Type getType() const noexcept { return mType; }

protected:
Type mType;
};

class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
Expand All @@ -36,7 +60,7 @@ class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
protected:
void* do_allocate(size_t bytes, size_t alignment) override
{
void* p = mAlloc->allocate(bytes);
void* p = mAlloc->allocate(bytes, o2::gpu::GPUMemoryResource::MemoryType::MEMORY_HOST);
if (!p) {
throw std::bad_alloc();
}
Expand Down
26 changes: 8 additions & 18 deletions Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
#include "ITStracking/IndexTableUtils.h"
#include "ITStracking/ExternalAllocator.h"
#include "ITStracking/BoundedAllocator.h"

#include "SimulationDataFormat/MCCompLabel.h"
#include "SimulationDataFormat/MCTruthContainer.h"

Expand Down Expand Up @@ -235,23 +234,14 @@ struct TimeFrame {
void setBz(float bz) { mBz = bz; }
float getBz() const { return mBz; }

/// State if memory will be externally managed.
// device
ExternalAllocator* mExtDeviceAllocator{nullptr};
void setExternalDeviceAllocator(ExternalAllocator* allocator) { mExtDeviceAllocator = allocator; }
ExternalAllocator* getExternalDeviceAllocator() { return mExtDeviceAllocator; }
bool hasExternalDeviceAllocator() const noexcept { return mExtDeviceAllocator != nullptr; }
// host
ExternalAllocator* mExtHostAllocator{nullptr};
void setExternalHostAllocator(ExternalAllocator* allocator)
{
mExtHostAllocator = allocator;
mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExtHostAllocator);
}
ExternalAllocator* getExternalHostAllocator() { return mExtHostAllocator; }
bool hasExternalHostAllocator() const noexcept { return mExtHostAllocator != nullptr; }
std::shared_ptr<BoundedMemoryResource> mExtMemoryPool;
std::pmr::memory_resource* getMaybeExternalHostResource(bool forceHost = false) { return (hasExternalHostAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); }
/// State if memory will be externally managed by the GPU framework
ExternalAllocator* mExternalAllocator{nullptr};
std::shared_ptr<BoundedMemoryResource> mExtMemoryPool; // host memory pool managed by the framework
auto getFrameworkAllocator() { return mExternalAllocator; };
void setFrameworkAllocator(ExternalAllocator* ext);
bool hasFrameworkAllocator() const noexcept { return mExternalAllocator != nullptr; }
std::pmr::memory_resource* getMaybeFrameworkHostResource(bool forceHost = false) { return (hasFrameworkAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); }

// Propagator
const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*) {};
Expand Down
34 changes: 21 additions & 13 deletions Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,9 @@ template <int nLayers>
void TimeFrame<nLayers>::resetROFrameData(size_t nRofs)
{
for (int iLayer{0}; iLayer < nLayers; ++iLayer) {
deepVectorClear(mUnsortedClusters[iLayer], getMaybeExternalHostResource());
deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeExternalHostResource());
clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeExternalHostResource());
deepVectorClear(mUnsortedClusters[iLayer], getMaybeFrameworkHostResource());
deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeFrameworkHostResource());
clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeFrameworkHostResource());
deepVectorClear(mClusterExternalIndices[iLayer], mMemoryPool.get());

if (iLayer < 2) {
Expand Down Expand Up @@ -302,11 +302,11 @@ void TimeFrame<nLayers>::initialise(const int iteration, const TrackingParameter
clearResizeBoundedVector(mBogusClusters, trkParam.NLayers, mMemoryPool.get());
deepVectorClear(mTrackletClusters);
for (unsigned int iLayer{0}; iLayer < std::min((int)mClusters.size(), maxLayers); ++iLayer) {
clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeFrameworkHostResource(maxLayers != nLayers));
clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeFrameworkHostResource(maxLayers != nLayers));
mPositionResolution[iLayer] = o2::gpu::CAMath::Sqrt(0.5f * (trkParam.SystErrorZ2[iLayer] + trkParam.SystErrorY2[iLayer]) + trkParam.LayerResolution[iLayer] * trkParam.LayerResolution[iLayer]);
}
clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeExternalHostResource(maxLayers != nLayers));
clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeFrameworkHostResource(maxLayers != nLayers));
clearResizeBoundedVector(mLines, mNrof, mMemoryPool.get());
clearResizeBoundedVector(mTrackletClusters, mNrof, mMemoryPool.get());

Expand Down Expand Up @@ -574,6 +574,7 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
initVector(v, useExternal);
}
};

// these will only reside on the host for the cpu part
initVector(mTotVertPerIteration);
initContainers(mClusterExternalIndices);
Expand Down Expand Up @@ -603,12 +604,19 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
initVector(mRoadLabels);
initContainers(mTracksLabel);
// these will use possibly an externally provided allocator
initContainers(mClusters, hasExternalHostAllocator());
initContainers(mUsedClusters, hasExternalHostAllocator());
initContainers(mUnsortedClusters, hasExternalHostAllocator());
initContainers(mIndexTables, hasExternalHostAllocator());
initContainers(mTrackingFrameInfo, hasExternalHostAllocator());
initContainers(mROFramesClusters, hasExternalHostAllocator());
initContainers(mClusters, hasFrameworkAllocator());
initContainers(mUsedClusters, hasFrameworkAllocator());
initContainers(mUnsortedClusters, hasFrameworkAllocator());
initContainers(mIndexTables, hasFrameworkAllocator());
initContainers(mTrackingFrameInfo, hasFrameworkAllocator());
initContainers(mROFramesClusters, hasFrameworkAllocator());
}

template <int nLayers>
void TimeFrame<nLayers>::setFrameworkAllocator(ExternalAllocator* ext)
{
mExternalAllocator = ext;
mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExternalAllocator);
}

template <int nLayers>
Expand Down Expand Up @@ -639,7 +647,7 @@ void TimeFrame<nLayers>::wipe()
deepVectorClear(mLines);
// if we use the external host allocator then the assumption is that we
// don't clear the memory ourself
if (!hasExternalHostAllocator()) {
if (!hasFrameworkAllocator()) {
deepVectorClear(mClusters);
deepVectorClear(mUsedClusters);
deepVectorClear(mUnsortedClusters);
Expand Down
24 changes: 13 additions & 11 deletions GPU/GPUTracking/Global/GPUChainITS.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,23 @@ namespace o2::its
class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
{
public:
GPUFrameworkExternalAllocator(GPUMemoryResource::MemoryType type) : mType(type) {}

void* allocate(size_t size) override
void* allocate(size_t size) final
{
return mFWReco->AllocateDirectMemory(size, mType);
}
void deallocate(char* ptr, size_t size) override {}
void deallocate(char* ptr, size_t size) final {} // this is a simple no-op
void pushTagOnStack(uint64_t tag)
{
mFWReco->PushNonPersistentMemory(tag);
}
void popTagOffStack(uint64_t tag)
{
mFWReco->PopNonPersistentMemory(GPUDataTypes::RecoStep::ITSTracking, tag);
}
void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }

private:
o2::gpu::GPUReconstruction* mFWReco;
GPUMemoryResource::MemoryType mType;
};
} // namespace o2::its

Expand Down Expand Up @@ -73,12 +78,9 @@ o2::its::TimeFrame<7>* GPUChainITS::GetITSTimeframe()
}
#if !defined(GPUCA_STANDALONE)
if (mITSTimeFrame->isGPU()) {
mFrameworkDeviceAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_GPU));
mFrameworkDeviceAllocator->setReconstructionFramework(rec());
mITSTimeFrame->setExternalDeviceAllocator(mFrameworkDeviceAllocator.get());
mFrameworkHostAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_HOST));
mFrameworkHostAllocator->setReconstructionFramework(rec());
mITSTimeFrame->setExternalHostAllocator(mFrameworkHostAllocator.get());
mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator());
mFrameworkAllocator->setReconstructionFramework(rec());
mITSTimeFrame->setFrameworkAllocator(mFrameworkAllocator.get());
}
#endif
return mITSTimeFrame.get();
Expand Down
3 changes: 1 addition & 2 deletions GPU/GPUTracking/Global/GPUChainITS.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ class GPUChainITS final : public GPUChain
std::unique_ptr<o2::its::TrackerTraits<7>> mITSTrackerTraits;
std::unique_ptr<o2::its::VertexerTraits<7>> mITSVertexerTraits;
std::unique_ptr<o2::its::TimeFrame<7>> mITSTimeFrame;
std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkDeviceAllocator;
std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkHostAllocator;
std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkAllocator;
};
} // namespace o2::gpu

Expand Down