From 27002274c1fe6a15156e6a58108772012bd0d152 Mon Sep 17 00:00:00 2001 From: Felix Weiglhofer Date: Fri, 17 Oct 2025 15:31:27 +0200 Subject: [PATCH] GPU: Replace assertions with error counters in ZS decoding. --- GPU/GPUTracking/Global/GPUErrorCodes.h | 6 +- .../TPCClusterFinder/GPUTPCCFDecodeZS.cxx | 156 ++++++++++++------ .../TPCClusterFinder/GPUTPCCFDecodeZS.h | 9 +- GPU/GPUTracking/kernels.cmake | 2 +- 4 files changed, 116 insertions(+), 57 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUErrorCodes.h b/GPU/GPUTracking/Global/GPUErrorCodes.h index 8fec23be00a09..a4921f478b107 100644 --- a/GPU/GPUTracking/Global/GPUErrorCodes.h +++ b/GPU/GPUTracking/Global/GPUErrorCodes.h @@ -47,6 +47,10 @@ GPUCA_ERROR_CODE(26, ERROR_TPCZS_INVALID_ROW, SectorRow) GPUCA_ERROR_CODE(27, ERROR_TPCZS_INVALID_NADC, SectorCRU, SamplesInPage, SamplesWritten) // Invalid number of ADC samples in header, existing samples were decoded GPUCA_ERROR_CODE(28, ERROR_TPCZS_INCOMPLETE_HBF, SectorCRU, PacketCount, NextPacketCount) // Part of HBF is missing, decoding incomplete GPUCA_ERROR_CODE(29, ERROR_TPCZS_INVALID_OFFSET, SectorEndpoint, Value, Expected) // Raw page is skipped since it contains invalid payload offset -GPUCA_ERROR_CODE(29, MAX_GPUCA_ERROR_NUMBER) +GPUCA_ERROR_CODE(30, ERROR_TPCZS_INVALID_MAGIC_WORD, Value) // ZS header contains wrong magic word +GPUCA_ERROR_CODE(31, ERROR_TPCZS_PAGE_OVERFLOW, Position, PageEnd) // Ran out of page to decode +GPUCA_ERROR_CODE(32, ERROR_TPCZS_VERSION_MISMATCH, Value, Expected) // ZS decoder received page with wrong version +GPUCA_ERROR_CODE(33, ERROR_TPCZS_UNKNOWN, ErrorCode) // Unkown or invalid error code raised in decoder +GPUCA_ERROR_CODE(33, MAX_GPUCA_ERROR_NUMBER) // #define GPUCA_CHECK_TPCZS_CORRUPTION diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx index f7bb64106fe4f..a548217e26b64 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -604,61 +604,107 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, pro const auto* decHeader = Peek(page, raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); ConsumeHeader(page); - assert(decHeader->version >= ZSVersionDenseLinkBased); - assert(decHeader->magicWord == tpc::zerosupp_link_based::CommonHeader::MagicWordLinkZSMetaHeader); - uint16_t nSamplesWritten = 0; const uint16_t nSamplesInPage = decHeader->nADCsamples; const auto* payloadEnd = Peek(pageStart, raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2) - ((decHeader->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) ? TPCZSHDRV2::TRIGGER_WORD_SIZE : 0)); const auto* nextPage = Peek(pageStart, TPCZSHDR::TPC_ZS_PAGE_SIZE); + const bool extendsToNextPage = decHeader->flags & TPCZSHDRV2::ZSFlags::payloadExtendsToNextPage; + ConsumeBytes(page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader)); - for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) { + int err = GPUErrors::ERROR_NONE; - [[maybe_unused]] ptrdiff_t sizeLeftInPage = payloadEnd - page; - assert(sizeLeftInPage > 0); + if (decHeader->version < ZSVersionDenseLinkBased) { + err = GPUErrors::ERROR_TPCZS_VERSION_MISMATCH; + } - uint16_t nSamplesWrittenTB = 0; + if (decHeader->magicWord != zerosupp_link_based::CommonHeader::MagicWordLinkZSMetaHeader) { + err = GPUErrors::ERROR_TPCZS_INVALID_MAGIC_WORD; + } + + for (uint16_t i = 0; i < decHeader->nTimebinHeaders && !err; i++) { + + ptrdiff_t sizeLeftInPage = payloadEnd - page; + if (sizeLeftInPage <= 0) { + err = GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; + break; + } + + int16_t nSamplesWrittenTB = 0; + uint16_t nSamplesLeftInPage = nSamplesInPage - nSamplesWritten; + + if (i == decHeader->nTimebinHeaders - 1 && extendsToNextPage) { + if (raw::RDHUtils::getMemorySize(*rawDataHeader) != TPCZSHDR::TPC_ZS_PAGE_SIZE) { + err = GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; + break; + } - if (i == decHeader->nTimebinHeaders - 1 && decHeader->flags & o2::tpc::TPCZSHDRV2::ZSFlags::payloadExtendsToNextPage) { - assert(o2::raw::RDHUtils::getMemorySize(*rawDataHeader) == TPCZSHDR::TPC_ZS_PAGE_SIZE); if ((uint16_t)(raw::RDHUtils::getPageCounter(rawDataHeader) + 1) == raw::RDHUtils::getPageCounter(nextPage)) { - nSamplesWrittenTB = DecodeTB(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, payloadEnd, nextPage); + nSamplesWrittenTB = DecodeTB(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage); } else { - nSamplesWrittenTB = FillWithInvalid(clusterer, iThread, nThreads, pageDigitOffset, nSamplesInPage - nSamplesWritten); -#ifdef GPUCA_CHECK_TPCZS_CORRUPTION - if (iThread == 0) { - clusterer.raiseError(GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF, clusterer.mISector * 1000 + decHeader->cruID, raw::RDHUtils::getPageCounter(rawDataHeader), raw::RDHUtils::getPageCounter(nextPage)); - } -#endif + err = GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF; + break; } } else { - nSamplesWrittenTB = DecodeTB(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, payloadEnd, nextPage); + nSamplesWrittenTB = DecodeTB(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage); + } + + // Abort decoding the page if an error was detected. + if (nSamplesWrittenTB < 0) { + err = -nSamplesWrittenTB; + break; } - assert(nSamplesWritten <= nSamplesInPage); nSamplesWritten += nSamplesWrittenTB; pageDigitOffset += nSamplesWrittenTB; } // for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) -#ifdef GPUCA_CHECK_TPCZS_CORRUPTION - if (iThread == 0 && nSamplesWritten != nSamplesInPage) { - clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, clusterer.mISector * 1000 + decHeader->cruID, nSamplesInPage, nSamplesWritten); - /*#ifndef GPUCA_GPUCODE - FILE* foo = fopen("dump.bin", "w+b"); - fwrite(pageSrc, 1, o2::raw::RDHUtils::getMemorySize(*rdHdr), foo); - fclose(foo); - #endif*/ + if (nSamplesWritten != nSamplesInPage) { + if (nSamplesWritten < nSamplesInPage) { + pageDigitOffset += FillWithInvalid(clusterer, iThread, nThreads, pageDigitOffset, nSamplesInPage - nSamplesWritten); + } + err = !err ? GPUErrors::ERROR_TPCZS_INVALID_NADC : err; // Ensure we don't overwrite any previous error } + + if (iThread == 0 && err) { + [[maybe_unused]] bool dumpPage = false; + + if (err == GPUErrors::ERROR_TPCZS_VERSION_MISMATCH) { + clusterer.raiseError(err, decHeader->version, ZSVersionDenseLinkBased); + } else if (err == GPUErrors::ERROR_TPCZS_INVALID_MAGIC_WORD) { + clusterer.raiseError(err, decHeader->magicWord); + } else if (err == GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF) { + clusterer.raiseError(err, clusterer.mISector * 1000 + decHeader->cruID, raw::RDHUtils::getPageCounter(rawDataHeader), raw::RDHUtils::getPageCounter(nextPage)); + } else if (err == GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW) { + clusterer.raiseError(err, extendsToNextPage); + dumpPage = true; + } else if (err == GPUErrors::ERROR_TPCZS_INVALID_NADC) { + clusterer.raiseError(err, nSamplesInPage, nSamplesWritten, extendsToNextPage); + dumpPage = true; + } else { + clusterer.raiseError(GPUErrors::ERROR_TPCZS_UNKNOWN, err); + } + +#ifdef GPUCA_CHECK_TPCZS_CORRUPTION +#ifndef GPUCA_GPUCODE + if (dumpPage) { + // allocate more space on the stack for fname, so it can be overwritten by hand in a debugger. + const char fname[64] = "dump00.bin"; + FILE* foo = fopen(fname, "w+b"); + fwrite(pageStart, 1, TPCZSHDR::TPC_ZS_PAGE_SIZE, foo); + fclose(foo); + } +#endif #endif + } return pageDigitOffset; } template -GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTB( +GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB( processorType& clusterer, [[maybe_unused]] GPUSharedMemory& smem, int32_t iThread, @@ -667,23 +713,24 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTB( const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, - [[maybe_unused]] const uint8_t* payloadEnd, - [[maybe_unused]] const uint8_t* nextPage) + uint16_t nSamplesLeftInPage, + const uint8_t* payloadEnd, + const uint8_t* nextPage) { if constexpr (DecodeInParallel) { - return DecodeTBMultiThread(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, cru, payloadEnd, nextPage); + return DecodeTBMultiThread(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, cru, nSamplesLeftInPage, payloadEnd, nextPage); } else { - uint16_t nSamplesWritten = 0; + int16_t nSamplesWritten = 0; if (iThread == 0) { - nSamplesWritten = DecodeTBSingleThread(clusterer, page, pageDigitOffset, rawDataHeader, firstHBF, cru, payloadEnd, nextPage); + nSamplesWritten = DecodeTBSingleThread(clusterer, page, pageDigitOffset, rawDataHeader, firstHBF, cru, nSamplesLeftInPage, payloadEnd, nextPage); } return warp_broadcast(nSamplesWritten, 0); } } template -GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( +GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( processorType& clusterer, GPUSharedMemory& smem, const int32_t iThread, @@ -692,8 +739,9 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, - [[maybe_unused]] const uint8_t* payloadEnd, - [[maybe_unused]] const uint8_t* nextPage) + uint16_t nSamplesLeftInPage, + const uint8_t* payloadEnd, + const uint8_t* nextPage) { #define MAYBE_PAGE_OVERFLOW(pagePtr) \ if constexpr (PayloadExtendsToNextPage) { \ @@ -703,7 +751,9 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff); \ } \ } else { \ - assert(pagePtr <= payloadEnd); \ + if (pagePtr > payloadEnd) { \ + return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; \ + } \ } #define PEEK_OVERFLOW(pagePtr, offset) \ @@ -728,7 +778,7 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( uint16_t linkBC = (tbbHdr & 0xFFF0) >> 4; int32_t timeBin = (linkBC + (uint64_t)(raw::RDHUtils::getHeartBeatOrbit(*rawDataHeader) - firstHBF) * constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN; - uint16_t nSamplesInTB = 0; + int16_t nSamplesInTB = 0; // Read timebin link headers for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) { @@ -747,7 +797,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( } int32_t nBytesBitmask = CAMath::Popcount(bitmaskL2); - assert(nBytesBitmask <= 10); for (int32_t chan = iThread; chan < CAMath::nextMultipleOf(80); chan += NTHREADS) { int32_t chanL2Idx = chan / 8; @@ -756,7 +805,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( int32_t chanByteOffset = nBytesBitmask - 1 - CAMath::Popcount(bitmaskL2 >> (chanL2Idx + 1)); uint8_t myChannelHasData = (chan < 80 && l2 ? TEST_BIT(PEEK_OVERFLOW(page, chanByteOffset), chan % 8) : 0); - assert(myChannelHasData == 0 || myChannelHasData == 1); int32_t nSamplesStep; int32_t threadSampleOffset = CfUtils::warpPredicateScan(myChannelHasData, &nSamplesStep); @@ -779,13 +827,17 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( GPUbarrierWarp(); // Ensure all writes to shared memory are finished, before reading it - const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8); - MAYBE_PAGE_OVERFLOW(page); // TODO: We don't need this check? + if (nSamplesInTB > nSamplesLeftInPage) { + return -GPUErrors::ERROR_TPCZS_INVALID_NADC; + } if (not fragment.contains(timeBin)) { return FillWithInvalid(clusterer, iThread, NTHREADS, pageDigitOffset, nSamplesInTB); } + const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8); + MAYBE_PAGE_OVERFLOW(page); + // Unpack ADC int32_t iLink = 0; for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS) { @@ -821,9 +873,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( GPUbarrierWarp(); // Ensure all reads to shared memory are finished, before decoding next header into shmem - assert(PayloadExtendsToNextPage || adcData <= page); - assert(PayloadExtendsToNextPage || page <= payloadEnd); - return nSamplesInTB; #undef TEST_BIT @@ -832,13 +881,14 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( } template -GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread( +GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread( processorType& clusterer, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, + uint16_t nSamplesLeftInPage, [[maybe_unused]] const uint8_t* payloadEnd, [[maybe_unused]] const uint8_t* nextPage) { @@ -850,7 +900,9 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread( ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff); \ } \ } else { \ - assert(pagePtr <= payloadEnd); \ + if (pagePtr > payloadEnd) { \ + return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; \ + } \ } using zerosupp_link_based::ChannelPerTBHeader; @@ -898,14 +950,18 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread( } // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) - const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8); - MAYBE_PAGE_OVERFLOW(page); + if (nSamplesInTB > nSamplesLeftInPage) { + return -GPUErrors::ERROR_TPCZS_INVALID_NADC; + } if (not fragment.contains(timeBin)) { FillWithInvalid(clusterer, 0, 1, pageDigitOffset, nSamplesInTB); return nSamplesInTB; } + const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8); + MAYBE_PAGE_OVERFLOW(page); + // Unpack ADC uint32_t byte = 0, bits = 0; uint16_t rawFECChannel = 0; @@ -937,10 +993,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread( } // while (bits >= DECODE_BITS) } // while (nSamplesWritten < nAdc) - assert(PayloadExtendsToNextPage || adcData <= page); - assert(PayloadExtendsToNextPage || page <= payloadEnd); - assert(nSamplesWritten == nSamplesInTB); - return nSamplesWritten; #undef MAYBE_PAGE_OVERFLOW diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h index e476674e030f9..4697462a8c504 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h @@ -167,14 +167,17 @@ class GPUTPCCFDecodeZSDenseLink : public GPUTPCCFDecodeZSLinkBase GPUd() static bool ChannelIsActive(const uint8_t* chan, uint16_t chanIndex); + // Decode a single timebin within an 8kb page. + // Returns the number of samples decoded from the page + // or negative value to indicate an error (no samples are written in this case) template - GPUd() static uint16_t DecodeTB(processorType& clusterer, GPUSharedMemory& smem, int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, const uint8_t* payloadEnd, const uint8_t* nextPage); + GPUd() static int16_t DecodeTB(processorType& clusterer, GPUSharedMemory& smem, int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); template - GPUd() static uint16_t DecodeTBSingleThread(processorType& clusterer, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, const uint8_t* payloadEnd, const uint8_t* nextPage); + GPUd() static int16_t DecodeTBSingleThread(processorType& clusterer, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); template - GPUd() static uint16_t DecodeTBMultiThread(processorType& clusterer, GPUSharedMemory& smem, const int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, const uint8_t* payloadEnd, const uint8_t* nextPage); + GPUd() static int16_t DecodeTBMultiThread(processorType& clusterer, GPUSharedMemory& smem, const int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index c8ddcd2e9d81d..84726ea9fb8d0 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -120,7 +120,7 @@ o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanDown" "= TPC o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage CfChargePos* in CfChargePos* out) o2_gpu_add_kernel("GPUTPCCFDecodeZS" "= TPCCLUSTERFINDER" LB int32_t firstHBF) o2_gpu_add_kernel("GPUTPCCFDecodeZSLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) -o2_gpu_add_kernel("GPUTPCCFDecodeZSDenseLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) +o2_gpu_add_kernel("GPUTPCCFDecodeZSDenseLink" "GPUTPCCFDecodeZS ERRORS" LB int32_t firstHBF) o2_gpu_add_kernel("GPUTPCCFGather" "=" LB o2::tpc::ClusterNative* dest) o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALREFIT " LB) o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB)