Merged
22 commits
5ef448c
Adding first version of kernel timers
ChSonnabend Jun 28, 2025
520de51
Merge branch 'dev' into onnx_gpu_timer
ChSonnabend Jul 2, 2025
fcf46d9
Removing GPU_CONFIG_KEY from dpl-workflow.sh to set my own values
ChSonnabend Jul 2, 2025
f235071
Merge branch 'dev' into onnx_gpu_timer
ChSonnabend Jul 6, 2025
4d64371
Merge branch 'dev' into onnx_gpu_timer
ChSonnabend Jul 7, 2025
a221eb4
Merge branch 'dev' into onnx_gpu_timer
ChSonnabend Jul 18, 2025
7f70002
Bug fixes
ChSonnabend Jul 19, 2025
ff34a9d
undoing changes in dpl-workflow.sh
ChSonnabend Jul 19, 2025
05a64bb
Further fixes and beautifications
ChSonnabend Jul 21, 2025
8f06331
Please consider the following formatting changes
alibuild Jul 21, 2025
27e6ead
Merge pull request #30 from alibuild/alibot-cleanup-14530
ChSonnabend Jul 21, 2025
152f459
Removing unused timers
ChSonnabend Jul 21, 2025
5801e3a
Moving Stop() of classification timer
ChSonnabend Jul 21, 2025
f4dcbaa
Adding force method to fill input like it is done on GPU
ChSonnabend Jul 21, 2025
e6482ab
Removing unnecessary static asserts
ChSonnabend Jul 23, 2025
f2d2b86
Adding deterministic mode (unfortunately that did not make it determi…
ChSonnabend Jul 25, 2025
f03fdc3
Please consider the following formatting changes
alibuild Jul 25, 2025
38c3856
Merge pull request #31 from alibuild/alibot-cleanup-14530
ChSonnabend Jul 25, 2025
52235a3
Adjusting for comment
ChSonnabend Jul 27, 2025
8c87d37
Adding deterministic mode
ChSonnabend Jul 28, 2025
f8139a7
Please consider the following formatting changes
alibuild Jul 28, 2025
7bec287
Merge pull request #32 from alibuild/alibot-cleanup-14530
ChSonnabend Jul 28, 2025
2 changes: 1 addition & 1 deletion Common/ML/include/ML/OrtInterface.h
@@ -116,7 +116,7 @@ class OrtModel
int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs

// Environment settings
bool mInitialized = false;
bool mInitialized = false, mDeterministicMode = false;
std::string mModelPath, mEnvName = "", mDeviceType = "CPU", mThreadAffinity = ""; // device options should be cpu, rocm, migraphx, cuda
int32_t mIntraOpNumThreads = 1, mInterOpNumThreads = 1, mDeviceId = -1, mEnableProfiling = 0, mLoggingLevel = 0, mAllocateDeviceMemory = 0, mEnableOptimizations = 0;

5 changes: 5 additions & 0 deletions Common/ML/src/OrtInterface.cxx
@@ -68,6 +68,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
mEnableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0);
mEnableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0);
mEnvName = (optionsMap.contains("onnx-environment-name") ? optionsMap["onnx-environment-name"] : "onnx_model_inference");
mDeterministicMode = (optionsMap.contains("deterministic-compute") ? std::stoi(optionsMap["deterministic-compute"]) : 0);
Collaborator
Why don't you use the existing flag for the deterministic mode? GPU_proc.deterministicGPUReconstruction

Collaborator Author
Because from what I could see it actually doesn't do what it's supposed to do (it's not deterministic even when setting that flag to 1, which is also reported in several other forum entries). On the other hand, one can switch it on manually like this. But I can make it an or-statement that sets it to 1 if GPU_proc.deterministicGPUReconstruction is set.

Collaborator Author
Done. I did it in the clusterizer code though, not directly in the OrtInterface class.

Collaborator
Well, the GPU_proc.deterministicGPUReconstruction flag should guarantee that we get deterministic results. If we are not there yet in the NN clusterization, then we have to work to understand what is non-deterministic.
But anything that is needed for deterministic results should automatically be enabled by this flag.
If you want more fine-grained settings to enable partial determinism, you can add them in the NN settings. But the global flag should automatically enable all of them.
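
A minimal sketch of the agreed-on combination (names are illustrative, not the actual O2 code; the real wiring is in GPUTPCNNClusterizerHost::init further down in this diff):

// Minimal sketch of the flag resolution discussed above: the global
// deterministicGPUReconstruction flag forces deterministic compute,
// while the NN-specific option can still enable it on its own.
inline bool resolveDeterministicCompute(bool deterministicGPUReconstruction,
                                        int nnInferenceUseDeterministicCompute)
{
  return deterministicGPUReconstruction || nnInferenceUseDeterministicCompute != 0;
}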


if (mDeviceType == "CPU") {
(mPImplOrt->sessionOptions).SetIntraOpNumThreads(mIntraOpNumThreads);
@@ -99,6 +100,10 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
(mPImplOrt->sessionOptions).DisableProfiling();
}

if (mDeterministicMode > 0) {
(mPImplOrt->sessionOptions).AddConfigEntry("session_options.use_deterministic_compute", "1");
}

(mPImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(mEnableOptimizations));
(mPImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(mLoggingLevel));

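For orientation, a hedged usage sketch of the options map consumed by initOptions, using only option keys visible in this PR (everything omitted keeps its default; the OrtModel construction itself is assumed from OrtInterface.h above):

#include <string>
#include <unordered_map>

// Usage sketch: option keys taken from the diff above.
std::unordered_map<std::string, std::string> options{
  {"intra-op-num-threads", "1"},
  {"inter-op-num-threads", "1"},
  {"enable-optimizations", "99"},  // GraphOptimizationLevel, see GPUSettingsList.h
  {"enable-profiling", "0"},
  {"logging-level", "1"},
  {"deterministic-compute", "1"},  // sets session_options.use_deterministic_compute
};
// OrtModel model;
// model.initOptions(options);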
3 changes: 3 additions & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -256,6 +256,7 @@ AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Spe
AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime where possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
@@ -275,6 +276,8 @@ AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters wil
AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatenation of modes, e.g. c1:r1 (classification class 1, regression class 1)")
AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters, else only the regression output is used and no clusters are rejected by classification")
AddOption(nnClusterizerForceGpuInputFill, int, 0, "", 0, "Forces use of the fillInputNNGPU function")
// CCDB
AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1 networks are fetched from ccdb, else locally")
AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched")
53 changes: 39 additions & 14 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -641,13 +641,30 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];

// Maximum of 4 lanes supported
HighResTimer* nnTimers[12];
if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().debugLevel >= 1) {
nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
}
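
The unrolled block above reflects that getTimer takes the timer index as a compile-time template argument, so a runtime loop over indices is not possible; the runtime convention is three timers per lane. A standalone sketch of that layout:

#include <cstdio>

// Standalone sketch of the timer layout used above: 3 timers per lane
// (classification, regression pass 1, regression pass 2) for up to
// 4 lanes, hence the fixed array of 12.
int main()
{
  constexpr int kTimersPerLane = 3;
  constexpr int kMaxLanes = 4;
  const char* names[kTimersPerLane] = {"ONNXClassification", "ONNXRegression", "ONNXRegression2"};
  for (int lane = 0; lane < kMaxLanes; ++lane) {
    for (int t = 0; t < kTimersPerLane; ++t) {
      std::printf("nnTimers[%2d] -> lane %d, %s\n", kTimersPerLane * lane + t, lane, names[t]);
    }
  }
  return 0;
}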

if (GetProcessingSettings().nn.applyNNclusterizer) {
int32_t deviceId = -1;
int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
int32_t maxThreads = mRec->getNKernelHostThreads(true);
// bool recreateMemoryAllocator = false;
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
nnApplications[lane].init(nn_settings);
nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
if (nnApplications[lane].mModelsUsed[0]) {
SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelClass).setDeviceId(deviceId);
@@ -993,9 +1010,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

// Filling the data
if (mRec->IsGPU()) {
if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
// Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
} else {
// Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
@@ -1006,20 +1023,25 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
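
As an aside, a schematic contrast of the two fill strategies described in the comments above (plain C++ loops standing in for the actual kernels; names and the placeholder value are illustrative):

// Schematic contrast of the two fill strategies.
void fillElementWise(float* input, int nClusters, int elementSize)
{
  // GPU-style: one work item per matrix element -> grid size
  // nClusters * elementSize, maximal parallelism, but extra index
  // arithmetic per element.
  for (int i = 0; i < nClusters * elementSize; ++i) {
    input[i] = 0.f; // placeholder for the charge / index datum of element i
  }
}

void fillPerCluster(float* input, int nClusters, int elementSize)
{
  // CPU-style: one work item per cluster fills its whole input row,
  // amortizing the index arithmetic -> fewer work items, better on CPU.
  for (int c = 0; c < nClusters; ++c) {
    for (int e = 0; e < elementSize; ++e) {
      input[c * elementSize + e] = 0.f; // placeholder
    }
  }
}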

// NN evaluations
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
}
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
if(clustererNNShadow.mNnClusterizerUseClassification) {
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); }
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
}
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
}
}
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
}
if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1033,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
}
}
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); }
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); }
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1047,6 +1071,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
}
}
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
}
}

1 change: 1 addition & 0 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -43,6 +43,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
int32_t mNnClusterizerChargeArraySize = -1;
int32_t mNnClusterizerElementSize = -1;
int8_t mNnClusterizerAddIndexData = 1;
int8_t mNnClusterizerUseClassification = 1;
float mNnClassThreshold = 0.01;
int8_t mNnSigmoidTrafoClassThreshold = 1;
int8_t mNnClusterizerSetDeconvolutionFlags = 1;
GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -28,7 +28,7 @@

using namespace o2::gpu;

void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings)
void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode)
{
std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
std::vector<std::string> reg_model_paths_local;
@@ -54,6 +54,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
{"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
{"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
{"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
{"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025)
{"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
{"profiling-output-path", settings.nnInferenceOrtProfilingPath},
{"logging-level", std::to_string(settings.nnInferenceVerbosity)},
@@ -106,6 +107,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
if (clustererNN.mNnSigmoidTrafoClassThreshold) {
clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
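The threshold transform at the end of this hunk maps the configured probability cut to logit space, so the cut can be applied directly to the network's raw (pre-sigmoid) output. A standalone illustration:

#include <cmath>
#include <cstdio>

// The inverse-sigmoid (logit) transform used above: a probability
// threshold p becomes log(p / (1 - p)).
int main()
{
  for (float p : {0.1f, 0.5f, 0.9f}) {
    std::printf("p = %.1f -> logit = %+.4f\n", p, std::log(p / (1.f - p)));
  }
  return 0;
}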
4 changes: 2 additions & 2 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -45,9 +45,9 @@ class GPUTPCNNClusterizerHost
{
public:
GPUTPCNNClusterizerHost() = default;
GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); }
GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); }

void init(const GPUSettingsProcessingNNclusterizer&);
void init(const GPUSettingsProcessingNNclusterizer&, bool = false);
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
void createBoundary(GPUTPCNNClusterizer&);
void createIndexLookup(GPUTPCNNClusterizer&);
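A short usage sketch of the extended interface (compiles only against the O2 headers; the settings object is assumed to be populated from the GPUSettingsList options above):

// The second argument forwards the global deterministicGPUReconstruction flag.
GPUSettingsProcessingNNclusterizer settings{};
GPUTPCNNClusterizerHost nnHost(settings, /*useDeterministicMode=*/true);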
26 changes: 11 additions & 15 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -117,18 +117,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
}

if (clustererNN.mNnClusterizerAddIndexData) {
float sector_norm = sector / 36.f;
float row_norm = row / 152.f;
float pad_norm = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);

if (dtype == 0) {
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm;
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm;
clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm;
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
} else {
clustererNN.mInputData_32[write_idx] = sector_norm;
clustererNN.mInputData_32[write_idx + 1] = row_norm;
clustererNN.mInputData_32[write_idx + 2] = pad_norm;
clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
}
}
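
The replacement expressions are numerically identical to the literals they replace, since o2::tpc::constants defines MAXSECTOR = 36 and MAXGLOBALPADROW = 152; a standalone check:

#include <cassert>

// Values as defined in o2::tpc::constants.
constexpr int MAXSECTOR = 36;
constexpr int MAXGLOBALPADROW = 152;

int main()
{
  int sector = 18, row = 76;
  assert(static_cast<float>(sector) / MAXSECTOR == sector / 36.f);
  assert(static_cast<float>(row) / MAXGLOBALPADROW == row / 152.f);
  return 0;
}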

@@ -178,8 +174,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;

float index_values[3] = {
sector / 36.f,
row / 152.f,
static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};

if (dtype == 0) {
Expand Down Expand Up @@ -335,11 +331,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
return;
}

tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;

// LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();

if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes != -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerUseClassification <= 0)) {

ClusterAccumulator pc;

@@ -451,7 +447,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub

uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;

if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
if ((clustererNN.mOutputDataClass[full_glo_idx] > 0) || (clustererNN.mNnClusterizerUseClassification <= 0)) {

ClusterAccumulator pc;

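A compact restatement of the two publish conditions changed above (illustrative helpers, not O2 code): when classification is disabled, every candidate is published.

// Class-1 publishing requires the classifier label 1 exactly;
// class-2 publishing accepts any positive label.
bool publishClass1(int classOutput, int useClassification)
{
  return classOutput == 1 || useClassification <= 0;
}
bool publishClass2(int classOutput, int useClassification)
{
  return classOutput > 0 || useClassification <= 0;
}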
GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -38,12 +38,6 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
{
public:
// Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer);
struct GPUSharedMemory {
// Regular cluster finder
Expand Down