71 changes: 15 additions & 56 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -977,20 +977,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];

// // bool recreateMemoryAllocator = false;
// if (lane == 0) {
// (nnApplications[lane].mModelClass).initEnvironment();
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
// }
// // recreateMemoryAllocator = true;
// (nnApplications[lane].mModelClass).initSession();
// (nnApplications[lane].mModelReg1).initSession();

int withMC = (doGPU && propagateMCLabels);

if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
// int withMC = (doGPU && propagateMCLabels);

if (nn_settings.nnClusterizerApplyCfDeconvolution) {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
} else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false);
}
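
A minimal C++ sketch (not part of the PR) of the control flow this hunk puts in place. The names applyCfDeconvolution, setDeconvolutionFlags, runDeconvolutionKernel and dumpChargeMap are hypothetical stand-ins, and the meaning of the boolean kernel argument is inferred from the call sites, not confirmed by the diff:

#include <cstdio>

// Hypothetical stubs standing in for the kernel launch and the debug dump above.
static void runDeconvolutionKernel(bool fullDeconvolution) { std::printf("deconvolution, full=%d\n", fullDeconvolution); }
static void dumpChargeMap(const char* tag) { std::printf("dump charge map: %s\n", tag); }

// Sketch only: either run the full CF deconvolution and dump the split charge map,
// or run the kernel once more with the flag cleared when only the per-cluster
// deconvolution flags are needed; never both in the same pass.
void deconvolutionStep(bool applyCfDeconvolution, bool setDeconvolutionFlags)
{
  if (applyCfDeconvolution) {
    runDeconvolutionKernel(true);
    dumpChargeMap("Split Charges");
  } else if (setDeconvolutionFlags) {
    runDeconvolutionKernel(false);
  }
}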
@@ -1001,15 +991,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

// auto start0 = std::chrono::high_resolution_clock::now();
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Filling the data

if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Filling the regression data
}

// auto stop0 = std::chrono::high_resolution_clock::now();
// auto start1 = std::chrono::high_resolution_clock::now();

// NN evaluations
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
@@ -1055,55 +1042,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
}

// auto stopNNs = std::chrono::high_resolution_clock::now();

// Publishing kernels
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
} else {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
}
if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 1 regression results
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
}
}

// for(int i = 0; i < iSize; ++i) {
// if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) {
// LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.mModelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.mOutputDataClass[i + batchStart] << " thresh " << clustererNNShadow.mNnClassThreshold << " instead of 0 or 1. Please check the model and the input data.";
// // std::string input = "[";
// // for(int j = 0; j < clustererNNShadow.mNnClusterizerElementSize; j++){
// // input += std::to_string(clustererNNShadow.mInputData_16[i * clustererNNShadow.mNnClusterizerElementSize + j].ToFloat()) + ", ";
// // }
// // input += "]";
// // LOG(info) << "Input is: " << input;
// }
// }

// auto stop1 = std::chrono::high_resolution_clock::now();

// time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
// time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
// time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
}

if (clustererNNShadow.mNnClusterizerUseCfRegression) {
// auto start1 = std::chrono::high_resolution_clock::now();
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
// auto stop1 = std::chrono::high_resolution_clock::now();
// time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
if (!nn_settings.nnClusterizerApplyCfDeconvolution) {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
}
DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
}
// if (clustererNNShadow.mNnClusterizerVerbosity < 3) {
// int acceptedClusters = 0;
// for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
// if(clustererNNShadow.mOutputDataClass[i] > 1 || clustererNNShadow.mOutputDataClass[i] < 0) {
// LOG(info) << "WARNING ORT 2: " << clustererNNShadow.mOutputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters;
// }
// acceptedClusters += clustererNNShadow.mOutputDataClass[i];
// }
// LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
// }
#else
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
#endif
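
Across these hunks the pre-computed flag withMC = (doGPU && propagateMCLabels) is dropped and the kernels receive propagateMCLabels directly, while the CF-regression path now runs the deconvolution and charge-map dump before the runCfClusterizer kernel. A compact sketch (not part of the PR) of the flag change, with a hypothetical stand-in for the kernel launch:

#include <cstdio>

// Hypothetical stand-in for a clusterizer kernel launch; only the MC-flag argument matters here.
static void launchClusterizerKernel(int sector, int dtype, int withMC, int batchStart)
{
  std::printf("sector=%d dtype=%d withMC=%d batchStart=%d\n", sector, dtype, withMC, batchStart);
}

void mcFlagSketch(bool doGPU, bool propagateMCLabels)
{
  // Old: the flag also required doGPU, so the two settings were coupled.
  int withMC = (doGPU && propagateMCLabels);
  launchClusterizerKernel(0, 0, withMC, 0);

  // New: the setting is forwarded unchanged, independent of doGPU.
  launchClusterizerKernel(0, 0, propagateMCLabels, 0);
}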
@@ -127,6 +127,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set
clustererNN.mClusterFlags[2 * base_idx] = 0;
clustererNN.mClusterFlags[2 * base_idx + 1] = 0;
for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags
Delta2 d = cfconsts::InnerNeighbors[i];
CfChargePos tmp_pos = peak.delta(d);
@@ -470,6 +472,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];

clustererNN.mClusterFlags[2 * idx] = 0;
clustererNN.mClusterFlags[2 * idx + 1] = 0;
for (int i = 0; i < 8; i++) {
Delta2 d = cfconsts::InnerNeighbors[i];
CfChargePos tmp_pos = peak.delta(d);
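
Both hunks in this file add the same zero-initialization of the two per-cluster flags before the 8-neighbour loop. A minimal sketch (not part of the PR) of the pattern with hypothetical types; what the loop accumulates is an assumption, since the loop body is not shown in the diff:

#include <cstdint>
#include <vector>

// Sketch only: clear both flag slots for this cluster before the neighbour loop,
// so values left over from a previous batch or an earlier code path cannot leak in.
void fillClusterFlags(std::vector<uint8_t>& clusterFlags, uint32_t idx)
{
  clusterFlags[2 * idx] = 0;
  clusterFlags[2 * idx + 1] = 0;
  for (int i = 0; i < 8; ++i) {
    // ... inspect the i-th inner neighbour of the peak and accumulate into the two flags ...
  }
}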