diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
index b4f40f3f5c694..7224645425856 100644
--- a/Common/ML/include/ML/OrtInterface.h
+++ b/Common/ML/include/ML/OrtInterface.h
@@ -45,14 +45,10 @@ class OrtModel
  public:
   // Constructors & destructors
-  OrtModel() = default;
-  OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
-  void init(std::unordered_map<std::string, std::string> optionsMap)
-  {
-    initOptions(optionsMap);
-    initEnvironment();
-  }
-  virtual ~OrtModel() = default;
+  OrtModel();
+  OrtModel(std::unordered_map<std::string, std::string> optionsMap);
+  void init(std::unordered_map<std::string, std::string> optionsMap);
+  virtual ~OrtModel();
 
   // General purpose
   void initOptions(std::unordered_map<std::string, std::string> optionsMap);
@@ -113,7 +109,7 @@ class OrtModel
  private:
   // ORT variables -> need to be hidden as pImpl
   struct OrtVariables;
-  OrtVariables* mPImplOrt;
+  std::unique_ptr<OrtVariables> mPImplOrt;
 
   // Input & Output specifications of the loaded network
   std::vector<char*> mInputNamesChar, mOutputNamesChar;
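Note on the header change above: with `std::unique_ptr<OrtVariables>` as the pImpl handle, the constructors and the destructor can no longer be defaulted in the header, because the `std::unique_ptr` deleter needs the complete `OrtVariables` type, which only the .cxx sees. A minimal sketch of that pattern (hypothetical `Widget`/`Impl` names, not taken from the patch):

```cpp
#include <memory>

// --- header: the Impl type is only forward-declared ---
class Widget
{
 public:
  Widget();   // defined in the source file
  ~Widget();  // must not be defaulted here: Impl is still incomplete
  void doWork();

 private:
  struct Impl;                 // forward declaration, like OrtModel::OrtVariables
  std::unique_ptr<Impl> mImpl; // same idea as OrtModel::mPImplOrt
};

// --- source file: Impl is complete, so the special members can be generated ---
struct Widget::Impl {
  int state = 0;
};

Widget::Widget() : mImpl(std::make_unique<Impl>()) {}
Widget::~Widget() = default; // the unique_ptr deleter is instantiated here
void Widget::doWork() { ++mImpl->state; }
```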
"ORT" : mEnvName.c_str()), // Integrate ORT logging into Fairlogger @@ -129,7 +138,7 @@ void OrtModel::initSession() if (mAllocateDeviceMemory) { memoryOnDevice(mDeviceId); } - mPImplOrt->session = std::make_shared(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions); + mPImplOrt->session = std::make_unique(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions); mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); setIO(); @@ -147,12 +156,12 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) (mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time (mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost - /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; - // (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + // For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; + (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 std::string dev_mem_str = ""; if (mDeviceType == "ROCM") { - dev_mem_str = "Hip"; + dev_mem_str = "HipPinned"; } if (mDeviceType == "CUDA") { dev_mem_str = "Cuda"; @@ -166,7 +175,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) void OrtModel::resetSession() { - mPImplOrt->session = std::make_shared(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions); + mPImplOrt->session = std::make_unique(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions); } // Getters @@ -252,7 +261,7 @@ void OrtModel::setIO() void OrtModel::setEnv(Ort::Env* env) { - mPImplOrt->env = std::shared_ptr(env); + mPImplOrt->env.reset(env); } // Inference @@ -308,6 +317,14 @@ void OrtModel::inference(I* input, int64_t input_size, O* output) (mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); (mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding); + // mPImplOrt->session->Run( + // mPImplOrt->runOptions, + // mInputNamesChar.data(), + // &inputTensor, + // mInputNamesChar.size(), + // mOutputNamesChar.data(), + // &outputTensor, + // mOutputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); @@ -427,10 +444,7 @@ template std::vector OrtModel::inferencesession->EndProfiling(); - // } - LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) 
<< " bytes"; + mPImplOrt.reset(); } // private diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index f188388e76a02..f904ced60fcfa 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // But environment must be valid, so we init the model environment first and use it here afterwards. // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator. // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + // if (lane == 0) { + // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + // } // recreateMemoryAllocator = true; (nnApplications[lane].mModelClass).initSession(); } @@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg1).initEnvironment(); - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); + // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); (nnApplications[lane].mModelReg1).initSession(); } if (nnApplications[lane].mModelsUsed[2]) { @@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads); } + // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg2).initEnvironment(); - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); (nnApplications[lane].mModelReg2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { @@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } - LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes"; - LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes"; } #endif @@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index f188388e76a02..f904ced60fcfa 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           // But environment must be valid, so we init the model environment first and use it here afterwards.
           // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
           // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // if (lane == 0) {
+          //   nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // }
           // recreateMemoryAllocator = true;
           (nnApplications[lane].mModelClass).initSession();
         }
@@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
           // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
           (nnApplications[lane].mModelReg1).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
           (nnApplications[lane].mModelReg1).initSession();
         }
         if (nnApplications[lane].mModelsUsed[2]) {
@@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
            nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
           }
+          // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
           (nnApplications[lane].mModelReg2).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
           (nnApplications[lane].mModelReg2).initSession();
         }
         if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (doGPU) {
         WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
       }
-      LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
-      LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
     }
 #endif
 
@@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
      GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
      GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
 
+      // // bool recreateMemoryAllocator = false;
+      // if (lane == 0) {
+      //   (nnApplications[lane].mModelClass).initEnvironment();
+      //   nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
+      // }
+      // // recreateMemoryAllocator = true;
+      // (nnApplications[lane].mModelClass).initSession();
+      // (nnApplications[lane].mModelReg1).initSession();
+
       int withMC = (doGPU && propagateMCLabels);
 
       if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1187,12 +1197,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       }
     }
     for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
-      // if (GetProcessingSettings().nn.applyNNclusterizer) {
-      //   GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
-      //   nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
-      //   nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
-      //   nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
-      // }
+      if (GetProcessingSettings().nn.applyNNclusterizer) {
+        LOG(info) << "(ORT) Environment releasing...";
+        GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+        nnApplication.mModelClass.release(true);
+        nnApplication.mModelReg1.release(true);
+        nnApplication.mModelReg2.release(true);
+      }
       if (transferRunning[i]) {
         ReleaseEvent(mEvents->stream[i], doGPU);
       }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index ca2deec60601c..90f1d6e27246f 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
   std::atomic<size_t> memory_inuse{0};
   std::atomic<size_t> num_allocations{0};
   std::atomic<size_t> num_reserve_allocations{0};
-  OrtMemoryInfo* memory_info;
-  GPUReconstruction* rec;
+  OrtMemoryInfo* mMemoryInfoInternal;
+  GPUReconstruction* mRecInternal;
 };
 
 MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
   OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
   OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
   OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
-  rec = r;
-  memory_info = info;
+  mRecInternal = r;
+  mMemoryInfoInternal = info;
 }
 
 MockedOrtAllocator::~MockedOrtAllocator()
 {
-  // Ort::GetApi().ReleaseMemoryInfo(memory_info);
+  // Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
   (void)0; // Suppress warning for empty destructor
 }
 
 void* MockedOrtAllocator::Alloc(size_t size)
 {
-  // LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
-  return rec->AllocateVolatileDeviceMemory(size);
+  LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
+  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
 }
 
 void* MockedOrtAllocator::Reserve(size_t size)
 {
-  // LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
-  return rec->AllocateVolatileDeviceMemory(size);
+  LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
+  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
 }
 
 void MockedOrtAllocator::Free(void* p)
 {
   // LOG(info) << "(ORT) Freeing volatile memory " << p;
-  rec->ReturnVolatileDeviceMemory();
 }
 
 const OrtMemoryInfo* MockedOrtAllocator::Info() const
 {
-  return memory_info;
+  return mMemoryInfoInternal;
 }
 
 size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
   }
 }
 
-void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
+void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
 {
   mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
   if (recreate) {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
index e659753f21d7d..4334c3418eb09 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
   void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
 
   // ONNX
-  void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
+  void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
   MockedOrtAllocator* getMockedAllocator();
   const OrtMemoryInfo* getMockedMemoryInfo();
 
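The renamed `directOrtAllocator` keeps the trampoline pattern visible in the hunks above: a struct derived from the C-style `OrtAllocator` whose function pointers forward to member functions, so that ONNX Runtime calls back into O2's own memory management. A stripped-down, host-memory-only sketch of that pattern (hypothetical `HostOrtAllocator`, backed by `std::malloc` instead of `AllocateDirectMemory`; not the O2 implementation):

```cpp
#include <onnxruntime_cxx_api.h>
#include <cstdlib>

// Hypothetical illustration of the OrtAllocator trampoline used by MockedOrtAllocator.
struct HostOrtAllocator : OrtAllocator {
  explicit HostOrtAllocator(OrtMemoryInfo* info) : mInfo(info)
  {
    OrtAllocator::version = ORT_API_VERSION;
    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<HostOrtAllocator*>(this_)->Alloc(size); };
    OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<HostOrtAllocator*>(this_)->Free(p); };
    OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const HostOrtAllocator*>(this_)->Info(); };
    OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<HostOrtAllocator*>(this_)->Reserve(size); };
  }

  void* Alloc(size_t size) { return std::malloc(size); }   // O2 calls AllocateDirectMemory() here instead
  void* Reserve(size_t size) { return std::malloc(size); } // separate hook ORT uses for up-front reservations
  void Free(void* p) { std::free(p); }                     // the O2 version leaves this empty: the stack allocator owns the memory
  const OrtMemoryInfo* Info() const { return mInfo; }

  OrtMemoryInfo* mInfo;
};
```

How the allocator is actually handed to the session or environment (the `recreate` branch of `directOrtAllocator`) is not shown in the hunk above, so it is deliberately left out of this sketch.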
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 47bc5e8da80ca..8ca61602ab4e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -35,9 +35,9 @@ using namespace o2::gpu::tpccf;
 
 // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
   if (clustererNN.mOutputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
@@ -51,29 +51,29 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
+  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
-  int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time()); // Explicit casting to avoid conversion errors
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
+  int32_t row = static_cast<int32_t>(peak.row()), pad = static_cast<int32_t>(peak.pad()), time = static_cast<int32_t>(peak.time()); // Explicit casting to avoid conversion errors
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
-  int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+  int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
 #ifndef GPUCA_GPUCODE
   GPUCA_UNROLL(U(), U());
 #endif
-  for (int r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
+  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
     bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
-    int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-    for (int p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
+    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
+    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
       bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
-      for (int t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
+      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
         if (!is_boundary) {
           CfChargePos tmp_pos(row + r, pad + p, time + t);
           if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
@@ -111,21 +111,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
-  uint transient_index = glo_idx % clustererNN.mNnClusterizerElementSize;
+  uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
+  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize);
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart];
-  int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad());
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
+  int32_t row = static_cast<int32_t>(peak.row()), pad = static_cast<int32_t>(peak.pad());
 
   if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
-    uint top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
+    uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
     for (uint16_t i = 0; i < 8; i++) {
       Delta2 d = cfconsts::InnerNeighbors[i];
       CfChargePos tmp_pos = peak.delta(d);
@@ -142,8 +142,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
       (pad) / GPUTPCGeometry::NPads(row);
     }
   } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) {
-    int time = static_cast<int>(peak.time());
-    int r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
+    int32_t time = static_cast<int32_t>(peak.time());
+    int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
     bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
     if (is_row_boundary) {
       if (dtype == 0) {
@@ -152,15 +152,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
        (clustererNN.mNnClusterizerBoundaryFillValue);
       }
     } else {
-      int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-      int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-      int rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
-      int p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
-      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
+      int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+      int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
+      int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
+      int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
+      int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
+
+      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (t < 0 || t >= TPC_MAX_FRAGMENT_LEN_GPU);
       if (!is_boundary) {
         float central_charge = static_cast<float>(chargeMap[peak].unpack());
-        int t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
         CfChargePos tmp_pos(row + r, pad + p, time + t);
         if (dtype == 0) {
           clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
@@ -179,9 +180,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   if (dtype == 0) {
     processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
   } else if (dtype == 1) {
@@ -190,14 +191,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint glo_idx = get_global_id(0);
-  uint elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
+  uint32_t glo_idx = get_global_id(0);
+  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
   float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
-  uint class_label = 0;
-  for (uint pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
+  uint32_t class_label = 0;
+  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
     if (pIdx == elem_iterator) {
       if (dtype == 0) {
         current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
@@ -212,7 +213,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
   if (class_label > 1) {
     clustererNN.mClusterFlags[2 * glo_idx] = 1;
@@ -221,25 +222,30 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
+  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
+  uint32_t full_glo_idx = glo_idx + batchStart;
+  if (full_glo_idx >= maxClusterNum) {
+    return;
+  }
+  int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
+
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)];
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
-  uint full_glo_idx = glo_idx + batchStart;
-  int model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
 
   // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();
 
-  if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
+  if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes != -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
 
     ClusterAccumulator pc;
@@ -291,7 +297,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
-  uint full_glo_idx = glo_idx + batchStart;
-  int model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
+  uint32_t full_glo_idx = glo_idx + batchStart;
+  uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
 
   if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
@@ -384,7 +390,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
   return (row > 62 ? global_shift : 0);
 }
 
-GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift)
 {
   if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index dc7f537c6c1e8..dac2bf9554849 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -73,11 +73,12 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 
  private:
   static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint);
   static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint);
+  static GPUd() uint32_t sortIntoBuckets(GPUTPCClusterFinder&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t);
   static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint);
-  static GPUd() int padOffset(int, int);
-  static GPUd() int rowOffset(int, int);
-  static GPUd() bool isBoundary(int, int, int);
+  static GPUd() int32_t padOffset(int32_t, int32_t);
+  static GPUd() int32_t rowOffset(int32_t, int32_t);
+  static GPUd() bool isBoundary(int32_t, int32_t, int32_t);
 };
 
 } // namespace o2::gpu
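One pattern worth noting across the kernel hunks above: indices derived from `glo_idx + batchStart` are now clamped with `CAMath::Min(..., nClusters - 1)`, or the kernel returns early once `full_glo_idx >= maxClusterNum`, so threads of the last, partially filled batch never read past `mPfilteredPeakPositions`. A tiny host-side analogue of that guard, with hypothetical names and plain `std::min` instead of `CAMath::Min`:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical illustration of the batched-index guard used in the NN clusterizer kernels.
float readBatchedPeak(const std::vector<float>& peaks, uint32_t glo_idx, uint32_t batchStart)
{
  const uint32_t nClusters = static_cast<uint32_t>(peaks.size());
  const uint32_t full_idx = glo_idx + batchStart;
  if (full_idx >= nClusters) {
    return 0.f; // analogue of the early return in the regression-publishing kernel
  }
  // Clamping keeps the access in bounds even for callers that skip the early-out,
  // mirroring CAMath::Min(glo_idx + batchStart, nClusters - 1) in the kernels.
  const uint32_t safe_idx = std::min<uint32_t>(full_idx, nClusters - 1);
  return peaks[safe_idx];
}
```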