From ab5cad3107f2be5d85f07252c5750d6561a11095 Mon Sep 17 00:00:00 2001 From: Anton Riedel Date: Fri, 14 Feb 2025 14:02:39 +0100 Subject: [PATCH 1/6] Feat: add skeleton for GPUErrorQA task --- Detectors/TPC/qc/CMakeLists.txt | 4 +- Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h | 69 +++++++++++++++++++++ Detectors/TPC/qc/src/GPUErrorQA.cxx | 55 ++++++++++++++++ Detectors/TPC/qc/src/TPCQCLinkDef.h | 1 + 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h create mode 100644 Detectors/TPC/qc/src/GPUErrorQA.cxx diff --git a/Detectors/TPC/qc/CMakeLists.txt b/Detectors/TPC/qc/CMakeLists.txt index 6bb4c726a90fa..60195ed6d451a 100644 --- a/Detectors/TPC/qc/CMakeLists.txt +++ b/Detectors/TPC/qc/CMakeLists.txt @@ -19,6 +19,7 @@ o2_add_library(TPCQC src/SACs.cxx src/IDCsVsSACs.cxx src/TrackClusters.cxx + src/GPUErrorQA.cxx PUBLIC_LINK_LIBRARIES O2::TPCBase O2::DataFormatsTPC O2::GPUO2Interface @@ -36,7 +37,8 @@ o2_target_root_dictionary(TPCQC include/TPCQC/DCSPTemperature.h include/TPCQC/SACs.h include/TPCQC/IDCsVsSACs.h - include/TPCQC/TrackClusters.h) + include/TPCQC/TrackClusters.h + include/TPCQC/GPUErrorQA.h) o2_add_test(PID COMPONENT_NAME tpc diff --git a/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h b/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h new file mode 100644 index 0000000000000..e9ddcfb66d594 --- /dev/null +++ b/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h @@ -0,0 +1,69 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// +/// @file GPUErrorQA.h +/// @author Anton Riedel, anton.riedel@cern.ch +/// + +#ifndef AliceO2_TPC_QC_GPUERRORQA_H +#define AliceO2_TPC_QC_GPUERRORQA_H + +#include +#include + +// root includes +#include "TH1.h" + +// o2 includes +// #include "DataFormatsTPC/Defs.h" + +namespace o2 +{ +namespace tpc +{ +namespace qc +{ + +/// @brief TPC QC task for errors from GPU reconstruction +/// +/// This class is used to retrieve and visualize GPU errors +/// according to corresponding error code and location. +/// +/// origin: TPC +/// @author Anton Riedel, anton.riedel@cern.ch +class GPUErrorQA +{ + public: + /// \brief Constructor. + GPUErrorQA() = default; + + /// process gpu error reported by the reconstruction workflow + void processErrors(gsl::span> errors); + + /// Initialize all histograms + void initializeHistograms(); + + /// Reset all histograms + void resetHistograms(); + + /// Dump results to a file + void dumpToFile(std::string filename); + + private: + std::unique_ptr mHist; + ClassDefNV(GPUErrorQA, 1) +}; +} // namespace qc +} // namespace tpc +} // namespace o2 + +#endif // AliceO2_TPC_QC_GPUERRORQA_H diff --git a/Detectors/TPC/qc/src/GPUErrorQA.cxx b/Detectors/TPC/qc/src/GPUErrorQA.cxx new file mode 100644 index 0000000000000..876c63d6e89f5 --- /dev/null +++ b/Detectors/TPC/qc/src/GPUErrorQA.cxx @@ -0,0 +1,55 @@ +// Copyright 2019-2025 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +#define _USE_MATH_DEFINES + +#include +#include + +// root includes +#include "TFile.h" +#include + +// o2 includes +#include "TPCQC/GPUErrorQA.h" +#include "GPUErrors.h" + +ClassImp(o2::tpc::qc::GPUErrorQA); + +using namespace o2::tpc::qc; + +//______________________________________________________________________________ +void GPUErrorQA::initializeHistograms() +{ + TH1::AddDirectory(false); + mHist = std::make_unique("ErrorCounter", "ErrorCounter", o2::gpu::GPUErrors::getMaxErrors(), 0, o2::gpu::GPUErrors::getMaxErrors()); +} +//______________________________________________________________________________ +void GPUErrorQA::resetHistograms() +{ + mHist->Reset(); +} +//______________________________________________________________________________ +void GPUErrorQA::processErrors(gsl::span> errors) +{ + for (const auto& error : errors) { + uint32_t errorCode = error[0]; + mHist->Fill(static_cast(errorCode)); + } +} + +//______________________________________________________________________________ +void GPUErrorQA::dumpToFile(const std::string filename) +{ + auto f = std::unique_ptr(TFile::Open(filename.c_str(), "recreate")); + mHist->Write(); + f->Close(); +} diff --git a/Detectors/TPC/qc/src/TPCQCLinkDef.h b/Detectors/TPC/qc/src/TPCQCLinkDef.h index c227ebcad8c09..3921d7dfe5649 100644 --- a/Detectors/TPC/qc/src/TPCQCLinkDef.h +++ b/Detectors/TPC/qc/src/TPCQCLinkDef.h @@ -24,6 +24,7 @@ #pragma link C++ class o2::tpc::qc::SACs + ; #pragma link C++ class o2::tpc::qc::IDCsVsSACs + ; #pragma link C++ class o2::tpc::qc::TrackClusters + ; +#pragma link C++ class o2::tpc::qc::GPUErrorQA + ; #pragma link C++ function o2::tpc::qc::helpers::makeLogBinning + ; #pragma link C++ function o2::tpc::qc::helpers::setStyleHistogram1D + ; #pragma link C++ function o2::tpc::qc::helpers::setStyleHistogram2D + ; From 3873f12211f986d3f138dc749148dbfb08b2c134 Mon Sep 17 00:00:00 2001 From: Anton Riedel Date: Thu, 6 Mar 2025 10:26:24 +0100 Subject: [PATCH 2/6] Feat: add GPUErrorQA class --- Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h | 28 ++++++------- Detectors/TPC/qc/src/GPUErrorQA.cxx | 44 +++++++++++++++------ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h b/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h index e9ddcfb66d594..797e5da0223b7 100644 --- a/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h +++ b/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h @@ -18,22 +18,20 @@ #define AliceO2_TPC_QC_GPUERRORQA_H #include -#include +#include +#include +#include // root includes -#include "TH1.h" // o2 includes // #include "DataFormatsTPC/Defs.h" -namespace o2 -{ -namespace tpc -{ -namespace qc +class TH1; +namespace o2::tpc::qc { -/// @brief TPC QC task for errors from GPU reconstruction +/// @brief TPC QC task for errors from GPU reconstruction /// /// This class is used to retrieve and visualize GPU errors /// according to corresponding error code and location. @@ -47,7 +45,7 @@ class GPUErrorQA GPUErrorQA() = default; /// process gpu error reported by the reconstruction workflow - void processErrors(gsl::span> errors); + void processErrors(std::vector> errors); /// Initialize all histograms void initializeHistograms(); @@ -55,15 +53,17 @@ class GPUErrorQA /// Reset all histograms void resetHistograms(); + /// return histograms + const std::unordered_map>& getMapHist() const { return mMapHist; }; + /// Dump results to a file void dumpToFile(std::string filename); private: - std::unique_ptr mHist; - ClassDefNV(GPUErrorQA, 1) + std::unordered_map> mMapHist; + + ClassDefNV(GPUErrorQA, 1); }; -} // namespace qc -} // namespace tpc -} // namespace o2 +} // namespace o2::tpc::qc #endif // AliceO2_TPC_QC_GPUERRORQA_H diff --git a/Detectors/TPC/qc/src/GPUErrorQA.cxx b/Detectors/TPC/qc/src/GPUErrorQA.cxx index 876c63d6e89f5..f59332b658466 100644 --- a/Detectors/TPC/qc/src/GPUErrorQA.cxx +++ b/Detectors/TPC/qc/src/GPUErrorQA.cxx @@ -11,16 +11,13 @@ #define _USE_MATH_DEFINES -#include -#include - // root includes #include "TFile.h" -#include +#include "TH1I.h" // o2 includes #include "TPCQC/GPUErrorQA.h" -#include "GPUErrors.h" +#include "GPUDefMacros.h" ClassImp(o2::tpc::qc::GPUErrorQA); @@ -30,26 +27,49 @@ using namespace o2::tpc::qc; void GPUErrorQA::initializeHistograms() { TH1::AddDirectory(false); - mHist = std::make_unique("ErrorCounter", "ErrorCounter", o2::gpu::GPUErrors::getMaxErrors(), 0, o2::gpu::GPUErrors::getMaxErrors()); + + // get gpu error names + // copied from GPUErrors.h + static std::unordered_map errorNames = { +#define GPUCA_ERROR_CODE(num, name, ...) {num, GPUCA_M_STR(name)}, +#include "GPUErrorCodes.h" +#undef GPUCA_ERROR_CODE + }; + + // 1D histogram counting all reported errors + mMapHist["ErrorCounter"] = std::make_unique("ErrorCounter", "ErrorCounter", errorNames.size(), -0.5, errorNames.size() - 0.5); + mMapHist["ErrorCounter"]->GetXaxis()->SetTitle("Error Codes"); + mMapHist["ErrorCounter"]->GetYaxis()->SetTitle("Entries"); + // for convienence, label each bin with the error name + for (size_t bin = 1; bin < mMapHist["ErrorCounter"]->GetNbinsX(); bin++) { + auto const& it = errorNames.find(bin); + mMapHist["ErrorCounter"]->GetXaxis()->SetBinLabel(bin, it->second); + } } //______________________________________________________________________________ void GPUErrorQA::resetHistograms() { - mHist->Reset(); + for (const auto& pair : mMapHist) { + pair.second->Reset(); + } } //______________________________________________________________________________ -void GPUErrorQA::processErrors(gsl::span> errors) +void GPUErrorQA::processErrors(std::vector> errors) { for (const auto& error : errors) { uint32_t errorCode = error[0]; - mHist->Fill(static_cast(errorCode)); + mMapHist["ErrorCounter"]->AddBinContent(errorCode); } } //______________________________________________________________________________ void GPUErrorQA::dumpToFile(const std::string filename) { - auto f = std::unique_ptr(TFile::Open(filename.c_str(), "recreate")); - mHist->Write(); - f->Close(); + auto f = std::unique_ptr(TFile::Open(filename.data(), "recreate")); + TObjArray arr; + arr.SetName("GPUErrorQA_Hists"); + for (const auto& [name, hist] : mMapHist) { + arr.Add(hist.get()); + } + arr.Write(arr.GetName(), TObject::kSingleKey); } From 78c3f10201b5b620955bed128035e7f37e50f028 Mon Sep 17 00:00:00 2001 From: Anton Riedel Date: Thu, 12 Jun 2025 15:14:07 +0200 Subject: [PATCH 3/6] Feat: movde error names to GPUErrors.h --- Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h | 4 ++-- Detectors/TPC/qc/src/GPUErrorQA.cxx | 16 ++++------------ GPU/GPUTracking/Global/GPUErrors.cxx | 7 ------- GPU/GPUTracking/Global/GPUErrors.h | 8 ++++++++ 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h b/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h index 797e5da0223b7..ec171a6925a98 100644 --- a/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h +++ b/Detectors/TPC/qc/include/TPCQC/GPUErrorQA.h @@ -54,7 +54,7 @@ class GPUErrorQA void resetHistograms(); /// return histograms - const std::unordered_map>& getMapHist() const { return mMapHist; }; + const std::unordered_map>& getMapHist() const { return mMapHist; } /// Dump results to a file void dumpToFile(std::string filename); @@ -62,7 +62,7 @@ class GPUErrorQA private: std::unordered_map> mMapHist; - ClassDefNV(GPUErrorQA, 1); + ClassDefNV(GPUErrorQA, 2); }; } // namespace o2::tpc::qc diff --git a/Detectors/TPC/qc/src/GPUErrorQA.cxx b/Detectors/TPC/qc/src/GPUErrorQA.cxx index f59332b658466..912b036a6cccf 100644 --- a/Detectors/TPC/qc/src/GPUErrorQA.cxx +++ b/Detectors/TPC/qc/src/GPUErrorQA.cxx @@ -17,7 +17,7 @@ // o2 includes #include "TPCQC/GPUErrorQA.h" -#include "GPUDefMacros.h" +#include "GPUErrors.h" ClassImp(o2::tpc::qc::GPUErrorQA); @@ -28,21 +28,13 @@ void GPUErrorQA::initializeHistograms() { TH1::AddDirectory(false); - // get gpu error names - // copied from GPUErrors.h - static std::unordered_map errorNames = { -#define GPUCA_ERROR_CODE(num, name, ...) {num, GPUCA_M_STR(name)}, -#include "GPUErrorCodes.h" -#undef GPUCA_ERROR_CODE - }; - // 1D histogram counting all reported errors - mMapHist["ErrorCounter"] = std::make_unique("ErrorCounter", "ErrorCounter", errorNames.size(), -0.5, errorNames.size() - 0.5); + mMapHist["ErrorCounter"] = std::make_unique("ErrorCounter", "ErrorCounter", o2::gpu::errorNames.size(), -0.5, o2::gpu::errorNames.size() - 0.5); mMapHist["ErrorCounter"]->GetXaxis()->SetTitle("Error Codes"); mMapHist["ErrorCounter"]->GetYaxis()->SetTitle("Entries"); // for convienence, label each bin with the error name for (size_t bin = 1; bin < mMapHist["ErrorCounter"]->GetNbinsX(); bin++) { - auto const& it = errorNames.find(bin); + auto const& it = o2::gpu::errorNames.find(bin); mMapHist["ErrorCounter"]->GetXaxis()->SetBinLabel(bin, it->second); } } @@ -68,7 +60,7 @@ void GPUErrorQA::dumpToFile(const std::string filename) auto f = std::unique_ptr(TFile::Open(filename.data(), "recreate")); TObjArray arr; arr.SetName("GPUErrorQA_Hists"); - for (const auto& [name, hist] : mMapHist) { + for ([[maybe_unused]] const auto& [name, hist] : mMapHist) { arr.Add(hist.get()); } arr.Write(arr.GetName(), TObject::kSingleKey); diff --git a/GPU/GPUTracking/Global/GPUErrors.cxx b/GPU/GPUTracking/Global/GPUErrors.cxx index e9d5a74c6567a..ed4ca892cc331 100644 --- a/GPU/GPUTracking/Global/GPUErrors.cxx +++ b/GPU/GPUTracking/Global/GPUErrors.cxx @@ -36,7 +36,6 @@ GPUd() void GPUErrors::raiseError(uint32_t code, uint32_t param1, uint32_t param #ifndef GPUCA_GPUCODE #include -#include uint32_t GPUErrors::getMaxErrors() { @@ -48,12 +47,6 @@ void GPUErrors::clear() memset(mErrors, 0, GPUCA_MAX_ERRORS * sizeof(*mErrors)); } -static std::unordered_map errorNames = { -#define GPUCA_ERROR_CODE(num, name, ...) {num, GPUCA_M_STR(name)}, -#include "GPUErrorCodes.h" -#undef GPUCA_ERROR_CODE -}; - bool GPUErrors::printErrors(bool silent, uint64_t mask) { bool retVal = 0; diff --git a/GPU/GPUTracking/Global/GPUErrors.h b/GPU/GPUTracking/Global/GPUErrors.h index 1cbc4a019601d..698c0ef0c026c 100644 --- a/GPU/GPUTracking/Global/GPUErrors.h +++ b/GPU/GPUTracking/Global/GPUErrors.h @@ -16,10 +16,18 @@ #define GPUERRORS_H #include "GPUCommonDef.h" +#include "GPUDefMacros.h" +#include namespace o2::gpu { +static std::unordered_map errorNames = { +#define GPUCA_ERROR_CODE(num, name, ...) {num, GPUCA_M_STR(name)}, +#include "GPUErrorCodes.h" +#undef GPUCA_ERROR_CODE +}; + class GPUErrors { public: From 0772de257504dbe6d25278a9d7d9ad72f0eaf9d3 Mon Sep 17 00:00:00 2001 From: Anton Riedel Date: Tue, 17 Jun 2025 16:49:10 +0200 Subject: [PATCH 4/6] Fix: expose gpu error names with static function --- GPU/GPUTracking/Global/GPUErrors.cxx | 12 ++++++++++++ GPU/GPUTracking/Global/GPUErrors.h | 11 ++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUErrors.cxx b/GPU/GPUTracking/Global/GPUErrors.cxx index ed4ca892cc331..4baa299c6b976 100644 --- a/GPU/GPUTracking/Global/GPUErrors.cxx +++ b/GPU/GPUTracking/Global/GPUErrors.cxx @@ -36,6 +36,7 @@ GPUd() void GPUErrors::raiseError(uint32_t code, uint32_t param1, uint32_t param #ifndef GPUCA_GPUCODE #include +#include uint32_t GPUErrors::getMaxErrors() { @@ -47,9 +48,20 @@ void GPUErrors::clear() memset(mErrors, 0, GPUCA_MAX_ERRORS * sizeof(*mErrors)); } +const std::unordered_map& GPUErrors::getErrorNames() +{ + static std::unordered_map errorNames = { +#define GPUCA_ERROR_CODE(num, name, ...) {num, GPUCA_M_STR(name)}, +#include "GPUErrorCodes.h" +#undef GPUCA_ERROR_CODE + }; + return errorNames; +} + bool GPUErrors::printErrors(bool silent, uint64_t mask) { bool retVal = 0; + const auto& errorNames = getErrorNames(); for (uint32_t i = 0; i < std::min(*mErrors, GPUCA_MAX_ERRORS); i++) { uint32_t errorCode = mErrors[4 * i + 1]; const auto& it = errorNames.find(errorCode); diff --git a/GPU/GPUTracking/Global/GPUErrors.h b/GPU/GPUTracking/Global/GPUErrors.h index 698c0ef0c026c..d14c9ffaeda96 100644 --- a/GPU/GPUTracking/Global/GPUErrors.h +++ b/GPU/GPUTracking/Global/GPUErrors.h @@ -16,18 +16,14 @@ #define GPUERRORS_H #include "GPUCommonDef.h" -#include "GPUDefMacros.h" + +#ifndef GPUCA_GPUCODE #include +#endif namespace o2::gpu { -static std::unordered_map errorNames = { -#define GPUCA_ERROR_CODE(num, name, ...) {num, GPUCA_M_STR(name)}, -#include "GPUErrorCodes.h" -#undef GPUCA_ERROR_CODE -}; - class GPUErrors { public: @@ -42,6 +38,7 @@ class GPUErrors void setMemory(GPUglobalref() uint32_t* m) { mErrors = m; } void clear(); bool printErrors(bool silent = false, uint64_t mask = 0); + static const std::unordered_map& getErrorNames(); uint32_t getNErrors() const; const uint32_t* getErrorPtr() const; static uint32_t getMaxErrors(); From 7fd0535c73ba3540f42517deaae7f73b8c41e069 Mon Sep 17 00:00:00 2001 From: Anton Riedel Date: Tue, 17 Jun 2025 17:12:50 +0200 Subject: [PATCH 5/6] Feat: account for missing error codes --- Detectors/TPC/qc/src/GPUErrorQA.cxx | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/Detectors/TPC/qc/src/GPUErrorQA.cxx b/Detectors/TPC/qc/src/GPUErrorQA.cxx index 912b036a6cccf..d4848aaefecb7 100644 --- a/Detectors/TPC/qc/src/GPUErrorQA.cxx +++ b/Detectors/TPC/qc/src/GPUErrorQA.cxx @@ -28,14 +28,27 @@ void GPUErrorQA::initializeHistograms() { TH1::AddDirectory(false); + auto const& errorNames = o2::gpu::GPUErrors::getErrorNames(); + + int maxErrorCode = 1; + for (const auto& [key, _] : errorNames) { + if (static_cast(key) > maxErrorCode) { + maxErrorCode = key; + } + } + // 1D histogram counting all reported errors - mMapHist["ErrorCounter"] = std::make_unique("ErrorCounter", "ErrorCounter", o2::gpu::errorNames.size(), -0.5, o2::gpu::errorNames.size() - 0.5); + mMapHist["ErrorCounter"] = std::make_unique("ErrorCounter", "ErrorCounter", maxErrorCode, -0.5, maxErrorCode - 0.5); mMapHist["ErrorCounter"]->GetXaxis()->SetTitle("Error Codes"); mMapHist["ErrorCounter"]->GetYaxis()->SetTitle("Entries"); // for convienence, label each bin with the error name - for (size_t bin = 1; bin < mMapHist["ErrorCounter"]->GetNbinsX(); bin++) { - auto const& it = o2::gpu::errorNames.find(bin); - mMapHist["ErrorCounter"]->GetXaxis()->SetBinLabel(bin, it->second); + for (size_t bin = 1; bin <= maxErrorCode; bin++) { + auto const& it = errorNames.find(bin); + if (it != errorNames.end()) { + mMapHist["ErrorCounter"]->GetXaxis()->SetBinLabel(bin, it->second); + } else { + mMapHist["ErrorCounter"]->GetXaxis()->SetBinLabel(bin, "NO_DEF"); + } } } //______________________________________________________________________________ From 1b7252630196b2fad183435872658c591341437d Mon Sep 17 00:00:00 2001 From: Anton Riedel Date: Thu, 3 Jul 2025 16:32:01 +0200 Subject: [PATCH 6/6] Fix: add header guard --- GPU/GPUTracking/Global/GPUErrors.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUErrors.h b/GPU/GPUTracking/Global/GPUErrors.h index d14c9ffaeda96..535364bf08ce1 100644 --- a/GPU/GPUTracking/Global/GPUErrors.h +++ b/GPU/GPUTracking/Global/GPUErrors.h @@ -16,7 +16,6 @@ #define GPUERRORS_H #include "GPUCommonDef.h" - #ifndef GPUCA_GPUCODE #include #endif @@ -38,7 +37,9 @@ class GPUErrors void setMemory(GPUglobalref() uint32_t* m) { mErrors = m; } void clear(); bool printErrors(bool silent = false, uint64_t mask = 0); +#ifndef GPUCA_GPUCODE static const std::unordered_map& getErrorNames(); +#endif uint32_t getNErrors() const; const uint32_t* getErrorPtr() const; static uint32_t getMaxErrors();