diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index c76bf11c3e25d..a4e5d5e1189f5 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -193,6 +193,7 @@ int32_t GPUReconstruction::Init()
     }
     mSlaves[i]->ClearAllocatedMemory();
   }
+  debugInit();
   return 0;
 }
 
@@ -469,6 +470,7 @@ int32_t GPUReconstruction::Exit()
   if (mInitialized) {
     ExitDevice();
   }
+  debugExit();
   mInitialized = false;
   return 0;
 }
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index d5c0b8e828087..e0c866fd9421b 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <functional>
 #include
 #include
 
@@ -239,6 +240,9 @@ class GPUReconstruction
   virtual void PrintKernelOccupancies() {}
   double GetStatKernelTime() { return mStatKernelTime; }
   double GetStatWallTime() { return mStatWallTime; }
+  void setDebugDumpCallback(std::function<void()>&& callback = std::function<void()>(nullptr));
+  bool triggerDebugDump();
+  std::string getDebugFolder(const std::string& prefix = ""); // empty string = no debug
 
   // Threading
   std::shared_ptr mThreading;
@@ -407,6 +411,13 @@ class GPUReconstruction
   };
   static std::shared_ptr sLibCUDA, sLibHIP, sLibOCL;
 
+  // Debugging
+  struct debugInternal;
+  static std::unique_ptr<debugInternal> mDebugData; // static: one set of installed signal handlers per process
+  bool mDebugEnabled = false;
+  void debugInit();
+  void debugExit();
+
   static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
 };
diff --git a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx
new file mode 100644
index 0000000000000..c1c31eedde1b2
--- /dev/null
+++ b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx
@@ -0,0 +1,188 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUReconstructionDebug.cxx
+/// \author David Rohr
+
+#include "GPUReconstruction.h"
+#include "GPULogging.h"
+#include "GPUSettings.h"
+
+#include <csignal>
+#include <cstring>
+#include <chrono>
+#include <filesystem>
+#include <iomanip>
+#include <mutex>
+#include <sstream>
+#include <unordered_map>
+
+using namespace o2::gpu;
+
+// Process-global debug state: installed signal handlers, the pending dump
+// callback, and the original handlers to restore / forward to.
+struct GPUReconstruction::debugInternal {
+  std::function<void(int32_t, siginfo_t*, void*)> signalCallback; // bound in debugInit, invoked from globalCallback
+  std::function<void()> debugCallback = nullptr;                  // user dump callback, one-shot (cleared after use)
+  std::function<void()> reinstallCallback = nullptr;              // re-arms our handler after a forwarded signal
+  std::unordered_map<int32_t, struct sigaction> oldActions;       // original handlers, keyed by signal number
+  size_t debugCount = 0;                                          // number of dumps performed so far
+  // Trampoline with the plain C signature sigaction requires.
+  static void globalCallback(int32_t signal, siginfo_t* info, void* ucontext)
+  {
+    GPUReconstruction::mDebugData->signalCallback(signal, info, ucontext);
+  }
+};
+
+std::unique_ptr<GPUReconstruction::debugInternal> GPUReconstruction::mDebugData;
+
+// Install signal handlers for debug dumps if debugOnFailure is enabled.
+// Only one GPUReconstruction instance per process may enable this.
+void GPUReconstruction::debugInit()
+{
+  if (GetProcessingSettings().debugOnFailure) {
+    static std::mutex initMutex;
+    {
+      std::lock_guard guard(initMutex);
+      if (mDebugData) {
+        GPUFatal("Error handlers for debug dumps already set, cannot set them again");
+      }
+      mDebugData = std::make_unique<debugInternal>();
+    }
+    mDebugEnabled = true;
+    if ((GetProcessingSettings().debugOnFailure & 1) || (GetProcessingSettings().debugOnFailure & 2)) { // bit 1 = all, bit 2 = signal-triggered dumps
+      struct sigaction sa, oldsa;
+      memset(&sa, 0, sizeof(sa));
+      sa.sa_sigaction = GPUReconstruction::debugInternal::globalCallback;
+      sa.sa_flags = SA_SIGINFO;
+      uint32_t mask = GetProcessingSettings().debugOnFailureSignalMask == (uint32_t)-1 ? ((1 << SIGINT) | (1 << SIGABRT) | (1 << SIGBUS) | (1 << SIGTERM) | (1 << SIGSEGV)) : GetProcessingSettings().debugOnFailureSignalMask;
+      if (mask) {
+        for (uint32_t i = 0; i < sizeof(mask) * 8; i++) {
+          if (mask & (1u << i)) { // 1u: bit 31 would be UB with a signed shift
+            if (sigaction(i, &sa, &oldsa)) {
+              GPUFatal("Error installing signal handler for error dump on signal %u", i);
+            }
+            mDebugData->oldActions.emplace(i, oldsa);
+          }
+        }
+      }
+
+      // NOTE(review): runs in signal context - GPUInfo / mutex / std::function are
+      // not async-signal-safe; acceptable here since we are crashing anyway.
+      mDebugData->signalCallback = [this, &oldActions = mDebugData->oldActions, myAction = std::move(sa)](int32_t signal, siginfo_t* info, void* ucontext) {
+        static std::mutex callbackMutex;
+        std::lock_guard guard(callbackMutex);
+        if (mDebugData->debugCallback) {
+          GPUInfo("Running debug callback for signal %d", signal);
+          mDebugData->debugCallback();
+          mDebugData->debugCount++;
+        }
+        mDebugData->debugCallback = nullptr; // one-shot
+        if (!GetProcessingSettings().debugOnFailureNoForwardSignal) {
+          // Restore the original handler and re-raise, so default crash handling proceeds.
+          sigaction(signal, &oldActions[signal], nullptr);
+          raise(signal);
+          // If the process survives, re-arm our handler next time a callback is set.
+          mDebugData->reinstallCallback = [signal, myAction]() { sigaction(signal, &myAction, nullptr); };
+        }
+      };
+    }
+  }
+}
+
+// Restore the original signal handlers and drop the global debug state,
+// so a subsequent Init() can set up debugging again.
+void GPUReconstruction::debugExit()
+{
+  if (!mDebugEnabled) {
+    return;
+  }
+  if (mDebugData) {
+    for (auto& it : mDebugData->oldActions) {
+      if (sigaction(it.first, &it.second, nullptr)) {
+        GPUFatal("Error restoring signal handler for signal %d", it.first);
+      }
+    }
+    mDebugData.reset(); // FIX: stale static state would make a later debugInit() hit the GPUFatal above
+  }
+  mDebugEnabled = false;
+}
+
+// Register the callback to run when a dump is triggered (by signal or error code).
+// Slaves forward to the master instance.
+void GPUReconstruction::setDebugDumpCallback(std::function<void()>&& callback)
+{
+  if (mDebugData && mDebugData->reinstallCallback) { // FIX: guard against null mDebugData; also re-arm when called on the master directly
+    mDebugData->reinstallCallback();
+    mDebugData->reinstallCallback = nullptr;
+  }
+  if (mMaster) {
+    mMaster->setDebugDumpCallback(std::move(callback));
+  } else if (mDebugEnabled && mDebugData) {
+    mDebugData->debugCallback = std::move(callback);
+  }
+}
+
+// Create a unique, timestamped subfolder below debugOnFailureDirectory for a dump.
+// Returns "" if the directory is invalid or the configured file/size budget is exceeded.
+std::string GPUReconstruction::getDebugFolder(const std::string& prefix)
+{
+  const std::filesystem::path target_dir = GetProcessingSettings().debugOnFailureDirectory;
+
+  std::size_t total_size = 0;
+  std::size_t subfolder_count = 0;
+
+  if (!std::filesystem::exists(target_dir) || !std::filesystem::is_directory(target_dir)) {
+    GPUError("Invalid debugOnFailureDirectory %s", GetProcessingSettings().debugOnFailureDirectory.c_str());
+    return "";
+  }
+
+  // Account for existing dumps (one subfolder per dump) to enforce the budget below.
+  for (const auto& entry : std::filesystem::directory_iterator(target_dir)) {
+    if (entry.is_directory()) {
+      subfolder_count++;
+      for (const auto& subentry : std::filesystem::directory_iterator(entry.path())) {
+        if (subentry.is_regular_file()) {
+          std::error_code ec;
+          auto size = std::filesystem::file_size(subentry.path(), ec);
+          if (!ec) {
+            total_size += size;
+          }
+        }
+      }
+    }
+  }
+
+  if ((GetProcessingSettings().debugOnFailureMaxFiles && subfolder_count >= GetProcessingSettings().debugOnFailureMaxFiles) || (GetProcessingSettings().debugOnFailureMaxSize && (total_size >> 30) >= GetProcessingSettings().debugOnFailureMaxSize)) {
+    GPUError("Cannot store debug dump files, target storage exceeded: %zu dumps, %zu bytes", subfolder_count, total_size);
+    return "";
+  }
+
+  auto currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+  std::ostringstream dateTime;
+  dateTime << std::put_time(std::localtime(&currentTime), "%Y-%m-%d_%H-%M-%S");
+
+  // Append an attempt counter so concurrent dumps in the same second stay unique.
+  int32_t attempt = 0;
+  std::string outname;
+  while (true) {
+    if (attempt++ >= 512) {
+      GPUError("Error creating debug dump folder");
+      return "";
+    }
+    outname = GetProcessingSettings().debugOnFailureDirectory + "/debug_" + prefix + (prefix == "" ? "" : "_") + dateTime.str() + "_" + std::to_string(attempt);
+    std::error_code ec;
+    bool created = std::filesystem::create_directory(outname, ec);
+    if (!ec && created) {
+      break;
+    }
+  }
+
+  GPUInfo("Debug dump to %s", outname.c_str());
+  return outname;
+}
+
+// Run the registered dump callback once (e.g. when a masked GPU error code was seen).
+// Returns true if a callback was present and executed.
+bool GPUReconstruction::triggerDebugDump()
+{
+  if (mMaster) {
+    return mMaster->triggerDebugDump();
+  } else if (mDebugEnabled && mDebugData && mDebugData->debugCallback) {
+    GPUInfo("Running triggered debug callback");
+    mDebugData->debugCallback();
+    mDebugData->debugCount++;
+    mDebugData->debugCallback = nullptr; // one-shot
+    return true;
+  }
+  return false;
+}
diff --git a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx
index 89517c612403b..64184dd724acd 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx
@@ -9,7 +9,7 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUReconstruction.cxx
+/// \file GPUReconstructionLibrary.cxx
 /// \author David Rohr
 
 #ifdef _WIN32
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 52848692e7516..1b108bc74190d 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -95,6 +95,7 @@ set(SRCS_NO_CINT
 set(SRCS_NO_H SectorTracker/GPUTPCTrackerDump.cxx
               Merger/GPUTPCGMMergerDump.cxx
               Base/GPUReconstructionLibrary.cxx
+              Base/GPUReconstructionDebug.cxx
               Global/GPUChainTrackingClusterizer.cxx
               Global/GPUChainTrackingTransformation.cxx
               Global/GPUChainTrackingTRD.cxx
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index b9be1db881816..12f40cda4c398 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -360,6 +360,14 @@ AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from
 AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
 AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
 AddOption(tpcFreeAllocatedMemoryAfterProcessing, bool, false, "", 0, "Clean all memory allocated by TPC when TPC processing done, only data written to external output resources will remain")
+AddOption(debugOnFailure, int32_t, 0, "", 0, "Dump raw data in case an error occured, bit 1 enables all dumps, otherwise bitmask for: 2 = signal, 4 = GPUErrorCode", def(1))
+AddOption(debugOnFailureSignalMask, uint32_t, (uint32_t)-1, "", 0, "Mask of signals that trigger debug / dump")
+AddOption(debugOnFailureErrorMask, uint64_t, (uint64_t)-1, "", 0, "Mask of GPUCA_ERRORS that trigger debug / dump")
+AddOption(debugOnFailureNoForwardSignal, bool, false, "", 0, "Do not forward signal to original signal handler")
+AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run the debug / dump")
+AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder")
+AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
+AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
 AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index c1c3e368ce90c..db84050772312 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -705,10 +705,14 @@ int32_t GPUChainTracking::RunChain()
   }
   mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
 
-  PrepareDebugOutput();
+  PrepareKernelDebugOutput();
 
   SynchronizeStream(0); // Synchronize all init copies that might be ongoing
 
+  if (GetProcessingSettings().debugOnFailure) {
+    mRec->setDebugDumpCallback([this]() { DoDebugRawDump(); });
+  }
+
   if (mIOPtrs.tpcCompressedClusters) {
     if (runRecoStep(RecoStep::TPCDecompression, &GPUChainTracking::RunTPCDecompression)) {
       return 1;
@@ -775,7 +779,7 @@ int32_t GPUChainTracking::RunChain()
   }
 
   int32_t retVal = 0;
-  if (CheckErrorCodes(false, false, mRec->getErrorCodeOutput())) {
+  if (CheckErrorCodes(false, false, mRec->getErrorCodeOutput())) { // TODO: Eventually, we should use GPUReconstruction::CheckErrorCodes
     retVal = 3;
     if (!GetProcessingSettings().ignoreNonFatalGPUErrors) {
       return retVal;
@@ -815,7 +819,7 @@ int32_t GPUChainTracking::RunChainFinalize()
     PrintOutputStat();
   }
 
-  PrintDebugOutput();
+  PrintKernelDebugOutput();
 
   // PrintMemoryRelations();
 
@@ -884,6 +888,7 @@ int32_t GPUChainTracking::FinalizePipelinedProcessing()
 int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
 {
   int32_t retVal = 0;
+  bool hasDebugError = false;
   for (int32_t i = 0; i < 1 + (!cpuOnly && mRec->IsGPU()); i++) {
     if (i) {
       const auto& threadContext = GetThreadContext();
@@ -925,9 +930,26 @@ int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, st
           fillErrors->emplace_back(std::array{pErrors[4 * j], pErrors[4 * j + 1], pErrors[4 * j + 2], pErrors[4 * j + 3]});
         }
       }
+      if ((GetProcessingSettings().debugOnFailure & 1) || (GetProcessingSettings().debugOnFailure & 4)) { // bit 1 = all, bit 4 = error-code-triggered dumps
+        if (GetProcessingSettings().debugOnFailureErrorMask == (uint64_t)-1) {
+          hasDebugError = true;
+        } else {
+          uint32_t nErrors = processors()->errorCodes.getNErrors();
+          const uint32_t* pErrors = processors()->errorCodes.getErrorPtr();
+          for (uint32_t j = 0; j < nErrors; j++) {
+            if (GetProcessingSettings().debugOnFailureErrorMask & (1ull << pErrors[4 * j])) { // 1ull: mask is 64 bit, plain 1 << code is UB for codes >= 31
+              hasDebugError = true;
+              break;
+            }
+          }
+        }
+      }
     }
   }
   ClearErrorCodes(cpuOnly);
+  if (hasDebugError) {
+    mRec->triggerDebugDump();
+  }
   return retVal;
 }
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 2a2996895dbcf..7d4adcd70af7f 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -134,7 +134,7 @@ class GPUChainTracking : public GPUChain
   void ClearIOPointers();
   void AllocateIOMemory();
   using GPUChain::DumpData;
-  void DumpData(const char* filename);
+  void DumpData(const char* filename, const GPUTrackingInOutPointers* ioPtrs = nullptr);
   using GPUChain::ReadData;
   int32_t ReadData(const char* filename);
   void DumpSettings(const char* dir = "") override;
@@ -231,11 +231,12 @@ class GPUChainTracking : public GPUChain
   int32_t DoProfile();
   void PrintMemoryRelations();
   void PrintMemoryStatistics() override;
-  void PrepareDebugOutput();
-  void PrintDebugOutput();
+  void PrepareKernelDebugOutput();
+  void PrintKernelDebugOutput();
   void PrintOutputStat();
   static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
   static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls);
+  void DoDebugRawDump();
 
   bool ValidateSteps();
   bool ValidateSettings();
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index f72943e6bcd5a..e9721ec9d12bf 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -185,7 +185,7 @@ void GPUChainTracking::PrintMemoryRelations()
   GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NOutputTrackClusters());
 }
 
-void GPUChainTracking::PrepareDebugOutput()
+void GPUChainTracking::PrepareKernelDebugOutput()
 {
 #ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
   const auto& threadContext = GetThreadContext();
@@ -198,7 +198,7 @@ void GPUChainTracking::PrepareDebugOutput()
 #endif
 }
 
-void GPUChainTracking::PrintDebugOutput()
+void GPUChainTracking::PrintKernelDebugOutput()
 {
 #ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
   const auto& threadContext = GetThreadContext();
@@ -390,3 +390,23 @@ void GPUChainTracking::DebugSortCompressedClusters(o2::tpc::CompressedClustersFl
   sortMultiple(c.nAttachedClustersReduced, getReducedOffset, getN1, c.rowDiffA, c.sliceLegDiffA, c.padResA, c.timeResA);
   sortMultiple(c.nTracks, getIndex, get1, c.qPtA, c.rowA, c.sliceA, c.timeA, c.padA, c.nTrackClusters); // NOTE: This must be last, since nTrackClusters is used for handling the arrays above!
 }
+
+// Dump the rawest available TPC input (ZS pages > packed digits > native clusters)
+// plus the settings into a fresh debug folder. Used as the debugOnFailure callback.
+void GPUChainTracking::DoDebugRawDump()
+{
+  std::string dirName = mRec->getDebugFolder("tpc_raw");
+  if (dirName == "") {
+    return;
+  }
+  GPUTrackingInOutPointers ioPtrs;
+  if (mIOPtrs.tpcZS) {
+    ioPtrs.tpcZS = mIOPtrs.tpcZS;
+  } else if (mIOPtrs.tpcPackedDigits) {
+    ioPtrs.tpcPackedDigits = mIOPtrs.tpcPackedDigits;
+  } else if (mIOPtrs.clustersNative) {
+    ioPtrs.clustersNative = mIOPtrs.clustersNative;
+  }
+
+  GPUInfo("Doing debug raw dump");
+  mRec->DumpSettings((dirName + "/").c_str());
+  DumpData((dirName + "/event.0.dump").c_str(), &ioPtrs);
+}
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
index 035e257ca7952..5a141cd08eb65 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
@@ -63,33 +63,36 @@ GPUChainTracking::InOutMemory::~InOutMemory() = default;
 GPUChainTracking::InOutMemory::InOutMemory(GPUChainTracking::InOutMemory&&) = default;
 GPUChainTracking::InOutMemory& GPUChainTracking::InOutMemory::operator=(GPUChainTracking::InOutMemory&&) = default; // NOLINT: False positive in clang-tidy
 
-void GPUChainTracking::DumpData(const char* filename)
+void GPUChainTracking::DumpData(const char* filename, const GPUTrackingInOutPointers* ioPtrs)
 {
   FILE* fp = fopen(filename, "w+b");
   if (fp == nullptr) {
     return;
   }
+  if (ioPtrs == nullptr) {
+    ioPtrs = &mIOPtrs;
+  }
   fwrite(DUMP_HEADER, 1, DUMP_HEADER_SIZE, fp);
   fwrite(&GPUReconstruction::geometryType, sizeof(GPUReconstruction::geometryType), 1, fp);
-  DumpData(fp, mIOPtrs.clusterData, mIOPtrs.nClusterData, InOutPointerType::CLUSTER_DATA);
-  DumpData(fp, mIOPtrs.rawClusters, mIOPtrs.nRawClusters, InOutPointerType::RAW_CLUSTERS);
-  if (mIOPtrs.clustersNative) {
-    if (DumpData(fp, &mIOPtrs.clustersNative->clustersLinear, &mIOPtrs.clustersNative->nClustersTotal, InOutPointerType::CLUSTERS_NATIVE)) {
-      fwrite(&mIOPtrs.clustersNative->nClusters[0][0], sizeof(mIOPtrs.clustersNative->nClusters[0][0]), NSECTORS * GPUCA_ROW_COUNT, fp);
-      if (mIOPtrs.clustersNative->clustersMCTruth) {
-        const auto& buffer = mIOPtrs.clustersNative->clustersMCTruth->getBuffer();
+  DumpData(fp, ioPtrs->clusterData, ioPtrs->nClusterData, InOutPointerType::CLUSTER_DATA);
+  DumpData(fp, ioPtrs->rawClusters, ioPtrs->nRawClusters, InOutPointerType::RAW_CLUSTERS);
+  if (ioPtrs->clustersNative) {
+    if (DumpData(fp, &ioPtrs->clustersNative->clustersLinear, &ioPtrs->clustersNative->nClustersTotal, InOutPointerType::CLUSTERS_NATIVE)) {
+      fwrite(&ioPtrs->clustersNative->nClusters[0][0], sizeof(ioPtrs->clustersNative->nClusters[0][0]), NSECTORS * GPUCA_ROW_COUNT, fp);
+      if (ioPtrs->clustersNative->clustersMCTruth) {
+        const auto& buffer = ioPtrs->clustersNative->clustersMCTruth->getBuffer();
         std::pair tmp = {buffer.data(), buffer.size()};
         DumpData(fp, &tmp.first, &tmp.second, InOutPointerType::CLUSTER_NATIVE_MC);
       }
     }
   }
-  if (mIOPtrs.tpcPackedDigits) {
-    if (DumpData(fp, mIOPtrs.tpcPackedDigits->tpcDigits, mIOPtrs.tpcPackedDigits->nTPCDigits, InOutPointerType::TPC_DIGIT) && mIOPtrs.tpcPackedDigits->tpcDigitsMC) {
+  if (ioPtrs->tpcPackedDigits) {
+    if (DumpData(fp, ioPtrs->tpcPackedDigits->tpcDigits, ioPtrs->tpcPackedDigits->nTPCDigits, InOutPointerType::TPC_DIGIT) && ioPtrs->tpcPackedDigits->tpcDigitsMC) {
       const char* ptrs[NSECTORS];
       size_t sizes[NSECTORS];
       for (uint32_t i = 0; i < NSECTORS; i++) {
-        if (mIOPtrs.tpcPackedDigits->tpcDigitsMC->v[i]) {
-          const auto& buffer = mIOPtrs.tpcPackedDigits->tpcDigitsMC->v[i]->getBuffer();
+        if (ioPtrs->tpcPackedDigits->tpcDigitsMC->v[i]) {
+          const auto& buffer = ioPtrs->tpcPackedDigits->tpcDigitsMC->v[i]->getBuffer();
           ptrs[i] = buffer.data();
           sizes[i] = buffer.size();
         } else {
@@ -100,12 +103,12 @@ void GPUChainTracking::DumpData(const char* filename)
       DumpData(fp, ptrs, sizes, InOutPointerType::TPC_DIGIT_MC);
     }
   }
-  if (mIOPtrs.tpcZS) {
+  if (ioPtrs->tpcZS) {
     size_t total = 0;
     for (int32_t i = 0; i < NSECTORS; i++) {
       for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
-        for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[i].count[j]; k++) {
-          total += mIOPtrs.tpcZS->sector[i].nZSPtr[j][k];
+        for (uint32_t k = 0; k < ioPtrs->tpcZS->sector[i].count[j]; k++) {
+          total += ioPtrs->tpcZS->sector[i].nZSPtr[j][k];
         }
       }
     }
@@ -115,10 +118,10 @@ void GPUChainTracking::DumpData(const char* filename)
     total = 0;
     for (int32_t i = 0; i < NSECTORS; i++) {
       for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
-        for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[i].count[j]; k++) {
-          memcpy(&ptr[total * TPCZSHDR::TPC_ZS_PAGE_SIZE], mIOPtrs.tpcZS->sector[i].zsPtr[j][k], mIOPtrs.tpcZS->sector[i].nZSPtr[j][k] * TPCZSHDR::TPC_ZS_PAGE_SIZE);
-          counts.count[i][j] += mIOPtrs.tpcZS->sector[i].nZSPtr[j][k];
-          total += mIOPtrs.tpcZS->sector[i].nZSPtr[j][k];
+        for (uint32_t k = 0; k < ioPtrs->tpcZS->sector[i].count[j]; k++) {
+          memcpy(&ptr[total * TPCZSHDR::TPC_ZS_PAGE_SIZE], ioPtrs->tpcZS->sector[i].zsPtr[j][k], ioPtrs->tpcZS->sector[i].nZSPtr[j][k] * TPCZSHDR::TPC_ZS_PAGE_SIZE);
+          counts.count[i][j] += ioPtrs->tpcZS->sector[i].nZSPtr[j][k];
+          total += ioPtrs->tpcZS->sector[i].nZSPtr[j][k];
         }
       }
     }
@@ -127,33 +130,33 @@ void GPUChainTracking::DumpData(const char* filename)
       fwrite(&counts, sizeof(counts), 1, fp);
     }
   }
-  if (mIOPtrs.tpcCompressedClusters) {
-    if (mIOPtrs.tpcCompressedClusters->ptrForward) {
+  if (ioPtrs->tpcCompressedClusters) {
+    if (ioPtrs->tpcCompressedClusters->ptrForward) {
       throw std::runtime_error("Cannot dump non-flat compressed clusters");
     }
-    char* ptr = (char*)mIOPtrs.tpcCompressedClusters;
-    size_t size = mIOPtrs.tpcCompressedClusters->totalDataSize;
+    char* ptr = (char*)ioPtrs->tpcCompressedClusters;
+    size_t size = ioPtrs->tpcCompressedClusters->totalDataSize;
     DumpData(fp, &ptr, &size, InOutPointerType::TPC_COMPRESSED_CL);
   }
-  if (mIOPtrs.settingsTF) {
+  if (ioPtrs->settingsTF) {
     uint32_t n = 1;
-    DumpData(fp, &mIOPtrs.settingsTF, &n, InOutPointerType::TF_SETTINGS);
+    DumpData(fp, &ioPtrs->settingsTF, &n, InOutPointerType::TF_SETTINGS);
   }
-  DumpData(fp, mIOPtrs.sectorTracks, mIOPtrs.nSectorTracks, InOutPointerType::SECTOR_OUT_TRACK);
-  DumpData(fp, mIOPtrs.sectorClusters, mIOPtrs.nSectorClusters, InOutPointerType::SECTOR_OUT_CLUSTER);
-  DumpData(fp, &mIOPtrs.mcLabelsTPC, &mIOPtrs.nMCLabelsTPC, InOutPointerType::MC_LABEL_TPC);
-  DumpData(fp, &mIOPtrs.mcInfosTPC, &mIOPtrs.nMCInfosTPC, InOutPointerType::MC_INFO_TPC);
-  DumpData(fp, &mIOPtrs.mcInfosTPCCol, &mIOPtrs.nMCInfosTPCCol, InOutPointerType::MC_INFO_TPC);
-  DumpData(fp, &mIOPtrs.mergedTracks, &mIOPtrs.nMergedTracks, InOutPointerType::MERGED_TRACK);
-  DumpData(fp, &mIOPtrs.mergedTrackHits, &mIOPtrs.nMergedTrackHits, InOutPointerType::MERGED_TRACK_HIT);
-  DumpData(fp, &mIOPtrs.trdTracks, &mIOPtrs.nTRDTracks, InOutPointerType::TRD_TRACK);
-  DumpData(fp, &mIOPtrs.trdTracklets, &mIOPtrs.nTRDTracklets, InOutPointerType::TRD_TRACKLET);
-  if (mIOPtrs.trdSpacePoints) {
-    DumpData(fp, &mIOPtrs.trdSpacePoints, &mIOPtrs.nTRDTracklets, InOutPointerType::TRD_SPACEPOINT);
+  DumpData(fp, ioPtrs->sectorTracks, ioPtrs->nSectorTracks, InOutPointerType::SECTOR_OUT_TRACK);
+  DumpData(fp, ioPtrs->sectorClusters, ioPtrs->nSectorClusters, InOutPointerType::SECTOR_OUT_CLUSTER);
+  DumpData(fp, &ioPtrs->mcLabelsTPC, &ioPtrs->nMCLabelsTPC, InOutPointerType::MC_LABEL_TPC);
+  DumpData(fp, &ioPtrs->mcInfosTPC, &ioPtrs->nMCInfosTPC, InOutPointerType::MC_INFO_TPC);
+  DumpData(fp, &ioPtrs->mcInfosTPCCol, &ioPtrs->nMCInfosTPCCol, InOutPointerType::MC_INFO_TPC);
+  DumpData(fp, &ioPtrs->mergedTracks, &ioPtrs->nMergedTracks, InOutPointerType::MERGED_TRACK);
+  DumpData(fp, &ioPtrs->mergedTrackHits, &ioPtrs->nMergedTrackHits, InOutPointerType::MERGED_TRACK_HIT);
+  DumpData(fp, &ioPtrs->trdTracks, &ioPtrs->nTRDTracks, InOutPointerType::TRD_TRACK);
+  DumpData(fp, &ioPtrs->trdTracklets, &ioPtrs->nTRDTracklets, InOutPointerType::TRD_TRACKLET);
+  if (ioPtrs->trdSpacePoints) {
+    DumpData(fp, &ioPtrs->trdSpacePoints, &ioPtrs->nTRDTracklets, InOutPointerType::TRD_SPACEPOINT);
   }
-  DumpData(fp, &mIOPtrs.trdTriggerTimes, &mIOPtrs.nTRDTriggerRecords, InOutPointerType::TRD_TRIGGERRECORDS);
-  DumpData(fp, &mIOPtrs.trdTrackletIdxFirst, &mIOPtrs.nTRDTriggerRecords, InOutPointerType::TRD_TRIGGERRECORDS);
-  DumpData(fp, &mIOPtrs.trdTrigRecMask, &mIOPtrs.nTRDTriggerRecords, InOutPointerType::TRD_TRIGGERRECORDS);
+  DumpData(fp, &ioPtrs->trdTriggerTimes, &ioPtrs->nTRDTriggerRecords, InOutPointerType::TRD_TRIGGERRECORDS);
+  DumpData(fp, &ioPtrs->trdTrackletIdxFirst, &ioPtrs->nTRDTriggerRecords, InOutPointerType::TRD_TRIGGERRECORDS);
+  DumpData(fp, &ioPtrs->trdTrigRecMask, &ioPtrs->nTRDTriggerRecords, InOutPointerType::TRD_TRIGGERRECORDS);
   fclose(fp);
 }
diff --git a/GPU/GPUTracking/Global/GPUErrorCodes.h b/GPU/GPUTracking/Global/GPUErrorCodes.h
index f35f5fc81a382..8fec23be00a09 100644
--- a/GPU/GPUTracking/Global/GPUErrorCodes.h
+++ b/GPU/GPUTracking/Global/GPUErrorCodes.h
@@ -47,5 +47,6 @@ GPUCA_ERROR_CODE(26, ERROR_TPCZS_INVALID_ROW, SectorRow)
 GPUCA_ERROR_CODE(27, ERROR_TPCZS_INVALID_NADC, SectorCRU, SamplesInPage, SamplesWritten) // Invalid number of ADC samples in header, existing samples were decoded
 GPUCA_ERROR_CODE(28, ERROR_TPCZS_INCOMPLETE_HBF, SectorCRU, PacketCount, NextPacketCount) // Part of HBF is missing, decoding incomplete
 GPUCA_ERROR_CODE(29, ERROR_TPCZS_INVALID_OFFSET, SectorEndpoint, Value, Expected) // Raw page is skipped since it contains invalid payload offset
+GPUCA_ERROR_CODE(29, MAX_GPUCA_ERROR_NUMBER) // Sentinel: highest assigned error number - MUST stay equal to the last code above when new codes are added
 
 // #define GPUCA_CHECK_TPCZS_CORRUPTION
diff --git a/GPU/GPUTracking/Global/GPUErrors.cxx b/GPU/GPUTracking/Global/GPUErrors.cxx
index 7f3ed1d8206d9..e9d5a74c6567a 100644
--- a/GPU/GPUTracking/Global/GPUErrors.cxx
+++ b/GPU/GPUTracking/Global/GPUErrors.cxx
@@ -54,12 +54,17 @@ static std::unordered_map errorNames = {
 #undef GPUCA_ERROR_CODE
 };
 
-void GPUErrors::printErrors(bool silent)
+bool GPUErrors::printErrors(bool silent, uint64_t mask)
 {
+  bool retVal = false;
   for (uint32_t i = 0; i < std::min(*mErrors, GPUCA_MAX_ERRORS); i++) {
     uint32_t errorCode = mErrors[4 * i + 1];
     const auto& it = errorNames.find(errorCode);
     const char* errorName = it == errorNames.end() ? "INVALID ERROR CODE" : it->second;
+    static_assert(MAX_GPUCA_ERROR_NUMBER < sizeof(mask) * 8); // every assigned code must fit a 64-bit mask shift
+    if (errorCode < sizeof(mask) * 8 && (mask & (1ull << errorCode))) { // range guard: errorCode comes from GPU memory and may be corrupt
+      retVal = true;
+    }
     if (silent && i) {
       GPUWarning("GPU Error Code (%u:%u) %s : %u / %u / %u", i, errorCode, errorName, mErrors[4 * i + 2], mErrors[4 * i + 3], mErrors[4 * i + 4]);
     } else if (silent) {
@@ -75,6 +80,7 @@ void GPUErrors::printErrors(bool silent)
       GPUError("Additional errors occured (codes not stored)");
     }
   }
+  return retVal;
 }
 
 uint32_t GPUErrors::getNErrors() const
diff --git a/GPU/GPUTracking/Global/GPUErrors.h b/GPU/GPUTracking/Global/GPUErrors.h
index cd86390bc1b01..1cbc4a019601d 100644
--- a/GPU/GPUTracking/Global/GPUErrors.h
+++ b/GPU/GPUTracking/Global/GPUErrors.h
@@ -33,7 +33,7 @@ class GPUErrors
   GPUd() bool hasError() { return *mErrors > 0; }
   void setMemory(GPUglobalref() uint32_t* m) { mErrors = m; }
   void clear();
-  void printErrors(bool silent = false);
+  bool printErrors(bool silent = false, uint64_t mask = 0); // returns true if any stored error code is in mask
   uint32_t getNErrors() const;
   const uint32_t* getErrorPtr() const;
   static uint32_t getMaxErrors();