Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Common/ML/src/OrtInterface.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM

// Load from options map
if (!optionsMap.contains("model-path")) {
LOG(fatal) << "(ORT) Model path cannot be empty!";
LOG(fatal) << "(ORT) Model path must be contained in options map!";
}

if (!optionsMap["model-path"].empty()) {
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,15 +259,15 @@ AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables gr
AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime where possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
AddOption(nnInferenceVerbosity, int, 2, "", 0, "0: All debugs; 1: Warnings + major debugs; 2: Warnings; >=3: No messages")
AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed. Higher number = higher verbosity")
AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF")
Expand Down
44 changes: 41 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
#include "DataFormatsTPC/Digit.h"
#include "DataFormatsTPC/Constants.h"
#include "TPCBase/RDHUtils.h"
#include "GPULogging.h"

#ifdef GPUCA_HAS_ONNX
#include "GPUTPCNNClusterizerKernels.h"
Expand Down Expand Up @@ -706,7 +705,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
(nnApplications[lane].mModelReg2).initSession();
}
if (nn_settings.nnClusterizerVerbosity < 3) {
if (nn_settings.nnClusterizerVerbosity > 0) {
LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
}
});
Expand All @@ -724,12 +723,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
}
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
AllocateRegisteredMemory(clustererNN.mMemoryId);
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
// nnApplications[lane].createBoundary(clustererNNShadow);
// nnApplications[lane].createIndexLookup(clustererNNShadow);
}
if (doGPU) {
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
}
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
}
}
}
#endif
Expand Down Expand Up @@ -1010,7 +1021,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}

// float time_clusterizer = 0, time_fill = 0, time_networks = 0;
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << ", numLoops=" << std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode) << ", numClusters=" << clusterer.mPmemory->counters.nClusters << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

Expand All @@ -1022,9 +1039,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
// Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
}
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}

if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
}

// NN evaluations
Expand All @@ -1044,6 +1067,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
}
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
}
if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
Expand Down Expand Up @@ -1078,9 +1104,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
}
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
}

// Publishing kernels for class labels and regression results
// In case classification should not be used, this kernel should still be executed to fill the mOutputDataClass array with default values
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
} else {
Expand All @@ -1092,6 +1122,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
}
}
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
}

if (clustererNNShadow.mNnClusterizerUseCfRegression) {
Expand All @@ -1100,6 +1133,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
}
}
#else
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
Expand Down Expand Up @@ -1203,7 +1239,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
#ifdef GPUCA_HAS_ONNX
if (GetProcessingSettings().nn.applyNNclusterizer) {
LOG(info) << "(ORT) Environment releasing...";
if (GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
LOG(info) << "(ORT) Environment releasing...";
}
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
nnApplication.mModelClass.release(true);
nnApplication.mModelReg1.release(true);
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,

tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;

GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, true);
}
2 changes: 1 addition & 1 deletion GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate
template <int32_t iKernel = defaultKernel>
GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t);

static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D<PackedCharge>&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*);
static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D<PackedCharge>&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*, int8_t);

static GPUd() void buildCluster(const GPUSettingsRec&, const CfArray2D<PackedCharge>&, CfChargePos, CfChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);

Expand Down
6 changes: 5 additions & 1 deletion GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t
uint32_t maxClusterPerRow,
uint32_t* clusterInRow,
tpc::ClusterNative* clusterByRow,
uint32_t* clusterPosInRow)
uint32_t* clusterPosInRow,
int8_t isAccepted)
{
uint32_t idx = get_global_id(0);

Expand Down Expand Up @@ -62,6 +63,9 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t
tpc::ClusterNative myCluster;
pc.finalize(pos, charge, fragment.start);
bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), chargeMap);
if (!isAccepted) {
rejectCluster = true;
}

if (rejectCluster) {
if (clusterPosInRow) {
Expand Down
Loading