From 9308c4c450bea828af5f618bd39a72a34b7cf20a Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 8 May 2025 21:16:48 +0200 Subject: [PATCH 1/4] GPU: Add debugSuffix option for debug files --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + GPU/GPUTracking/Global/GPUChainTracking.cxx | 6 +++--- GPU/GPUTracking/Global/GPUChainTrackingIO.cxx | 2 -- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 638a3ed43d2aa..9e0aa32155f0d 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -296,6 +296,7 @@ AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GP AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))") AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") AddOption(debugMask, uint32_t, 262143, "", 0, "Mask for debug output dumps to file") +AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1)) diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index f8d4165477220..c1c3e368ce90c 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -378,7 +378,7 @@ int32_t GPUChainTracking::Init() } if (GetProcessingSettings().debugLevel >= 6) { - std::string filename = std::string(mRec->IsGPU() ? "GPU" : "CPU") + (mRec->slaveId() != -1 ? (std::string("_slave") + std::to_string(mRec->slaveId())) : std::string(mRec->slavesExist() ? "_master" : "")) + ".out"; + std::string filename = std::string(mRec->IsGPU() ? "GPU" : "CPU") + (mRec->slaveId() != -1 ? (std::string("_slave") + std::to_string(mRec->slaveId())) : std::string(mRec->slavesExist() ? "_master" : "")) + GetProcessingSettings().debugLogSuffix + ".out"; mDebugFile->open(filename.c_str()); } @@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunChainFinalize() int32_t iKey; do { - Sleep(10); + usleep(10000); if (GetProcessingSettings().eventDisplay->EnableSendKey()) { iKey = kbhit() ? getch() : 0; if (iKey == 27) { @@ -847,7 +847,7 @@ int32_t GPUChainTracking::RunChainFinalize() break; } else if (iKey) { while (GetProcessingSettings().eventDisplay->getSendKey() != 0) { - Sleep(1); + usleep(1000); } GetProcessingSettings().eventDisplay->setSendKey(iKey); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx index 5e7672022b3ff..035e257ca7952 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx @@ -47,8 +47,6 @@ #include "TPCFastTransform.h" #include "CorrectionMapsHelper.h" -#include "utils/linux_helpers.h" - using namespace o2::gpu; #include "GPUO2DataTypes.h" From 6e3244af8d8813b8c79b04f3aa138c30dd5cd9bf Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 8 May 2025 20:53:50 +0200 Subject: [PATCH 2/4] GPU: Add sorting of tracks of attached compressed clusters in deterministic mode --- .../DataCompression/GPUTPCCompression.cxx | 26 ++++++------ GPU/GPUTracking/Global/GPUChainTracking.h | 1 + .../Global/GPUChainTrackingCompression.cxx | 4 ++ .../GPUChainTrackingDebugAndProfiling.cxx | 42 +++++++++++++++++++ 4 files changed, 61 insertions(+), 12 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx index a107f749ddd77..82834a694d0ba 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx @@ -140,7 +140,7 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out) for (uint32_t i = 0; i < NSECTORS; i++) { out << "Sector " << i << ": "; for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { - out << O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] << ", "; + out << (O.nSliceRowClusters ? O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] : 0) << ", "; } out << "\n"; } @@ -153,18 +153,20 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out) } out << "\n\nUnattached Clusters\n"; uint32_t offset = 0; - for (uint32_t i = 0; i < NSECTORS; i++) { - for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { - out << "Sector " << i << " Row " << j << ": "; - for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) { - if (k && k % 10 == 0) { - out << "\n "; + if (O.nSliceRowClusters) { + for (uint32_t i = 0; i < NSECTORS; i++) { + for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { + out << "Sector " << i << " Row " << j << ": "; + for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) { + if (k && k % 10 == 0) { + out << "\n "; + } + const uint32_t l = k + offset; + out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] "; } - const uint32_t l = k + offset; - out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] "; + offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; + out << "\n"; } - offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; - out << "\n"; } } out << "\n\nAttached Clusters\n"; @@ -175,7 +177,7 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out) if (k && k % 10 == 0) { out << "\n "; } - const uint32_t l1 = k + offset, l2 = k + offset - i; + const uint32_t l1 = offset + k, l2 = offset - i + k - 1; out << "["; if (k) { out << (int32_t)O.rowDiffA[l2] << ", " << (int32_t)O.sliceLegDiffA[l2] << ", " << (uint32_t)O.padResA[l2] << ", " << (uint32_t)O.timeResA[l2] << ", "; diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 13773a97d4e3d..2a2996895dbcf 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -235,6 +235,7 @@ class GPUChainTracking : public GPUChain void PrintDebugOutput(); void PrintOutputStat(); static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters); + static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls); bool ValidateSteps(); bool ValidateSettings(); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 534c02a4c0a84..3bcd2390eae52 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -203,6 +203,10 @@ int32_t GPUChainTracking::RunTPCCompression() ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec); } mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR")); + if (GetProcessingSettings().deterministicGPUReconstruction) { + SynchronizeGPU(); + DebugSortCompressedClusters(Compressor.mOutputFlat); + } DoDebugAndDump(RecoStep::TPCCompression, GPUChainTrackingDebugFlags::TPCCompressedClusters, Compressor, &GPUTPCCompression::DumpCompressedClusters, *mDebugFile); return 0; } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx index 903505068ad2c..00cf127162b94 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE #include "bitmapfile.h" @@ -348,3 +349,44 @@ void GPUChainTracking::DumpClusters(std::ostream& out, const o2::tpc::ClusterNat } } } + +void GPUChainTracking::DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls) +{ + o2::tpc::CompressedClusters c = *cls; + std::vector sorted(c.nTracks), offsets(c.nTracks); + std::iota(sorted.begin(), sorted.end(), 0); + auto sorter = [&c](const auto a, const auto b) { + return std::tie(c.sliceA[a], c.rowA[a], c.timeA[a], c.padA[a], c.qPtA[a]) < + std::tie(c.sliceA[b], c.rowA[b], c.timeA[b], c.padA[b], c.qPtA[b]); + }; + std::sort(sorted.begin(), sorted.end(), sorter); + uint32_t offset = 0; + for (uint32_t i = 0; i < c.nTracks; i++) { + offsets[i] = offset; + offset += c.nTrackClusters[i]; + } + + auto sortArray = [&c, &sorted, &offsets](auto* src, size_t totalSize, auto getOffset, auto getSize) { + auto buf = std::make_unique[]>(totalSize); + memcpy(buf.get(), src, totalSize * sizeof(*src)); + uint32_t targetOffset = 0; + for (uint32_t i = 0; i < c.nTracks; i++) { + const uint32_t j = sorted[i]; + memcpy(src + targetOffset, buf.get() + getOffset(offsets[j], j), getSize(j) * sizeof(*src)); + targetOffset += getSize(j); + } + }; + auto sortMultiple = [&sortArray](size_t totalSize, auto getOffset, auto getSize, auto&&... arrays) { + (..., sortArray(std::forward(arrays), totalSize, getOffset, getSize)); + }; + auto getFullOffset = [](uint32_t off, uint32_t ind) { return off; }; + auto getReducedOffset = [](uint32_t off, uint32_t ind) { return off - ind; }; + auto getIndex = [](uint32_t off, uint32_t ind) { return ind; }; + auto getN = [&c](uint32_t j) { return c.nTrackClusters[j]; }; + auto getN1 = [&c](uint32_t j) { return c.nTrackClusters[j] - 1; }; + auto get1 = [](uint32_t j) { return 1; }; + + sortMultiple(c.nAttachedClusters, getFullOffset, getN, c.qTotA, c.qMaxA, c.flagsA, c.sigmaPadA, c.sigmaTimeA); + sortMultiple(c.nAttachedClustersReduced, getReducedOffset, getN1, c.rowDiffA, c.sliceLegDiffA, c.padResA, c.timeResA); + sortMultiple(c.nTracks, getIndex, get1, c.qPtA, c.rowA, c.sliceA, c.timeA, c.padA, c.nTrackClusters); // NOTE: This must be last, since nTrackClusters is used for handling the arrays above! +} From 8480305977bff25887ea3d8cd43e5fc4d178898d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 8 May 2025 23:26:22 +0200 Subject: [PATCH 3/4] GPU: Make GPUCommonAlgorithm::sortInBlock deterministic with GPUCA_DETERMINISTIC_MODE --- GPU/Common/GPUCommonAlgorithm.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h index 417c9e0d1f8c1..d0643391246a8 100644 --- a/GPU/Common/GPUCommonAlgorithm.h +++ b/GPU/Common/GPUCommonAlgorithm.h @@ -283,21 +283,29 @@ GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp) #ifndef GPUCA_GPUCODE GPUCommonAlgorithm::sort(begin, end, comp); #else - int32_t n = end - begin; - for (int32_t i = 0; i < n; i++) { - for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) { - int32_t offset = i % 2; - int32_t curPos = 2 * tIdx + offset; - int32_t nextPos = curPos + 1; - - if (nextPos < n) { - if (!comp(begin[curPos], begin[nextPos])) { - IterSwap(&begin[curPos], &begin[nextPos]); + GPUCA_DETERMINISTIC_CODE( // clang-format off + GPUbarrier(); + if (get_local_id(0) == 0) { + GPUCommonAlgorithm::sort(begin, end, comp); + } + GPUbarrier(); + , // !GPUCA_DETERMINISTIC_CODE + int32_t n = end - begin; + for (int32_t i = 0; i < n; i++) { + for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) { + int32_t offset = i % 2; + int32_t curPos = 2 * tIdx + offset; + int32_t nextPos = curPos + 1; + + if (nextPos < n) { + if (!comp(begin[curPos], begin[nextPos])) { + IterSwap(&begin[curPos], &begin[nextPos]); + } } } + GPUbarrier(); } - GPUbarrier(); - } + ) // clang-format on #endif } From a5475261b766e3a1e512a7b0176136b014c6464a Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 8 May 2025 23:51:00 +0200 Subject: [PATCH 4/4] GPU: Use total sorting in deterministic mode for unattached clusters --- .../GPUTPCCompressionKernels.cxx | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx index 5dbbf63ca8264..3b88c8764d0fd 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx @@ -148,19 +148,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread -GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<0>::operator()(uint32_t a, uint32_t b) const +GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare::operator()(uint32_t a, uint32_t b) const { return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked(); } template <> -GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<1>::operator()(uint32_t a, uint32_t b) const +GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare::operator()(uint32_t a, uint32_t b) const { return mClsPtr[a].padPacked < mClsPtr[b].padPacked; } template <> -GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::operator()(uint32_t a, uint32_t b) const +GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare::operator()(uint32_t a, uint32_t b) const { if (mClsPtr[a].getTimePacked() >> 3 == mClsPtr[b].getTimePacked() >> 3) { return mClsPtr[a].padPacked < mClsPtr[b].padPacked; @@ -169,7 +169,7 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::opera } template <> -GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::operator()(uint32_t a, uint32_t b) const +GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare::operator()(uint32_t a, uint32_t b) const { if (mClsPtr[a].padPacked >> 3 == mClsPtr[b].padPacked >> 3) { return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked(); @@ -177,6 +177,18 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::opera return mClsPtr[a].padPacked < mClsPtr[b].padPacked; } +template <> // Deterministic comparison +GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::operator()(uint32_t a, uint32_t b) const +{ + if (mClsPtr[a].getTimePacked() != mClsPtr[b].getTimePacked()) { + return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked(); + } + if (mClsPtr[a].padPacked != mClsPtr[b].padPacked) { + return mClsPtr[a].padPacked < mClsPtr[b].padPacked; + } + return mClsPtr[a].qTot < mClsPtr[b].qTot; +} + template <> GPUdii() void GPUTPCCompressionKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors) { @@ -261,15 +273,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread(clusters->clusters[iSector][iRow])); - } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) { - CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); - } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) { - CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); - } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) { - CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); - } + , // !GPUCA_DETERMINISTIC_CODE + if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) { + CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); + } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) { + CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); + } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) { + CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); + } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) { + CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare(clusters->clusters[iSector][iRow])); + } + ) // clang-format on GPUbarrier(); }