Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
}
AllocateRegisteredMemory(clustererNN.mMemoryId);
// nnApplications[lane].createBoundary(clustererNNShadow);
// nnApplications[lane].createIndexLookup(clustererNNShadow);
});
if (doGPU) {
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
Expand Down
27 changes: 0 additions & 27 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -65,33 +65,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
return mem;
}

// std::vector<int32_t> GPUTPCNNClusterizer::pointerSizes() {
// std::vector<int32_t> sizes(7, -1);
// if (mNnClusterizerBatchedMode > 0) {
// if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) {
// sizes[0] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData16
// } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) {
// sizes[1] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData32
// }
// sizes[2] = 2 * mNnClusterizerBatchedMode; // mClusterFlags
// if (mNnClusterizerModelClassNumOutputNodes > 0) {
// sizes[3] = mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes; // modelProbabilities
// }
// if (!mNnClusterizerUseCfRegression) {
// if (mNnClusterizerModelReg1NumOutputNodes > 0) {
// sizes[4] = mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes; // outputDataReg1
// }
// if (mNnClusterizerModelReg2NumOutputNodes > 0) {
// sizes[5] = mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes; // outputDataReg2
// }
// }
// }
// if (mNnClusterizerTotalClusters > 0) {
// sizes[6] = mNnClusterizerTotalClusters; // mOutputDataClass
// }
// return sizes;
// }

void GPUTPCNNClusterizer::RegisterMemoryAllocation()
{
AllocateAndInitializeLate();
Expand Down
62 changes: 41 additions & 21 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,31 +37,51 @@ class GPUTPCNNClusterizer : public GPUProcessor

// Neural network clusterization

int mNnClusterizerSizeInputRow = 3;
int mNnClusterizerSizeInputPad = 3;
int mNnClusterizerSizeInputTime = 3;
int mNnClusterizerElementSize = -1;
bool mNnClusterizerAddIndexData = true;
int32_t mNnClusterizerSizeInputRow = 3;
int32_t mNnClusterizerSizeInputPad = 3;
int32_t mNnClusterizerSizeInputTime = 3;
int32_t mNnClusterizerChargeArraySize = -1;
int32_t mNnClusterizerElementSize = -1;
int8_t mNnClusterizerAddIndexData = 1;
float mNnClassThreshold = 0.01;
bool mNnSigmoidTrafoClassThreshold = 1;
bool mNnClusterizerSetDeconvolutionFlags = true;
int mNnClusterizerUseCfRegression = 0;
int mNnClusterizerBatchedMode = 1;
int mNnClusterizerTotalClusters = 1;
int mNnClusterizerVerbosity = 0;
int mNnClusterizerBoundaryFillValue = -1;
int mNnClusterizerModelClassNumOutputNodes = -1;
int mNnClusterizerModelReg1NumOutputNodes = -1;
int mNnClusterizerModelReg2NumOutputNodes = -1;
int mNnInferenceInputDType = 0; // 0: float16, 1: float32
int mNnInferenceOutputDType = 0; // 0: float16, 1: float32
int mISector = -1;
int mDeviceId = -1;
int8_t mNnSigmoidTrafoClassThreshold = 1;
int8_t mNnClusterizerSetDeconvolutionFlags = 1;
int32_t mNnClusterizerUseCfRegression = 0;
int32_t mNnClusterizerBatchedMode = 1;
int32_t mNnClusterizerTotalClusters = 1;
int32_t mNnClusterizerVerbosity = 0;
int32_t mNnClusterizerBoundaryFillValue = -1;
int32_t mNnClusterizerModelClassNumOutputNodes = -1;
int32_t mNnClusterizerModelReg1NumOutputNodes = -1;
int32_t mNnClusterizerModelReg2NumOutputNodes = -1;
int32_t mNnInferenceInputDType = 0; // 0: float16, 1: float32
int32_t mNnInferenceOutputDType = 0; // 0: float16, 1: float32
int32_t mISector = -1;
int32_t mDeviceId = -1;

// GPU optimizations
uint32_t mNnClusterizerFullRowSize = 0;
uint32_t mNnClusterizerFullPadSize = 0;
uint32_t mNnClusterizerFullTimeSize = 0;
uint32_t mNnClusterizerPadTimeSize = 0;
uint32_t mNnClusterizerRowTimeSize = 0;
uint32_t mNnClusterizerRowTimeSizeFull = 0;

// Boundary lookup table
// int32_t mBoundaryMapSizeRow = 0;
// int32_t mBoundaryMapSizePadsPerRow = 0;
// int32_t mBoundaryMapSize = 0;
// int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset
// int8_t* mIsBoundary = nullptr;

// Index lookup table
// int32_t mIndexLookupSize = 0;
// int32_t* mIndexLookup = nullptr;

// Memory allocation for neural network

bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cx=nullptr
int* mOutputDataClass = nullptr;
int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cx=nullptr
int32_t* mOutputDataClass = nullptr;

// FP32
float* mInputData_32 = nullptr;
Expand Down
49 changes: 48 additions & 1 deletion GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "GPUSettings.h"
#include "ML/3rdparty/GPUORTFloat16.h"
#include "GPUReconstruction.h"
#include "GPUTPCGeometry.h"
#include "DataFormatsTPC/Constants.h"

#ifdef GPUCA_HAS_ONNX
#include <onnxruntime_cxx_api.h>
Expand Down Expand Up @@ -87,8 +89,20 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize;
clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
// clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
// clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
// clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
// clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
// clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0);
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
Expand Down Expand Up @@ -116,6 +130,39 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
}
}

// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
// {
// // Call after init of the clustererNN elements
// for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
// int8_t skipCheckInRow = 0;
// for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
// int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
// clustererNN.mIsBoundary[i] = 1;
// if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
// if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
// } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
// }
// skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
// }
// }
// }
// }

// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
// {
// for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
// int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
// int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
// int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
// int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
// clustererNN.mIndexLookup[3 * i] = r;
// clustererNN.mIndexLookup[3 * i + 1] = p;
// clustererNN.mIndexLookup[3 * i + 2] = t;
// }
// }

// MockedOrtAllocator implementation to be able to use volatile assignment
struct MockedOrtAllocator : OrtAllocator {
MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr);
Expand Down
2 changes: 2 additions & 0 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class GPUTPCNNClusterizerHost

void init(const GPUSettingsProcessingNNclusterizer&);
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
void createBoundary(GPUTPCNNClusterizer&);
void createIndexLookup(GPUTPCNNClusterizer&);

// ONNX
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
Expand Down
Loading