diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx index 7ba32bd43275b..e20f5d8b0f074 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -23,6 +23,7 @@ #include "TPCPadGainCalib.h" #include "TPCZSLinkMapping.h" #include "GPUTPCGeometry.h" +#include "DetectorsRaw/RDHUtils.h" using namespace o2::gpu; using namespace o2::gpu::tpccf; @@ -251,8 +252,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& if (discardTimeBin) { FillWithInvalid(ctx.clusterer, ctx.iThread, ctx.nThreads, ctx.pageDigitOffset, nAdc); } else { -#ifdef GPUCA_GPUCODE - DecodeTBMultiThread( + DecodeTB( smem, ctx, adcData, @@ -261,16 +261,6 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& timeBin, decHdr->cruID, tbHdr->fecInPartition); -#else // CPU - DecodeTBSingleThread( - ctx, - adcData, - nAdc, - channelMask, - timeBin, - decHdr->cruID, - tbHdr->fecInPartition); -#endif } ctx.pageDigitOffset += nAdc; @@ -290,62 +280,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& return ctx.pageDigitOffset; } -GPUd() void GPUTPCCFDecodeZSLink::DecodeTBSingleThread( - DecodeCtx& ctx, - const uint8_t* adcData, - uint32_t nAdc, - const uint32_t* channelMask, - int32_t timeBin, - int32_t cru, - int32_t fecInPartition) -{ - const CfFragment& fragment = ctx.clusterer.mPmemory->fragment; - - if constexpr (TPCZSHDRV2::TIGHTLY_PACKED_V3) { - - uint32_t byte = 0, bits = 0, nSamplesWritten = 0, rawFECChannel = 0; - - // unpack adc values, assume tightly packed data - while (nSamplesWritten < nAdc) { - byte |= adcData[0] << bits; - adcData++; - bits += CHAR_BIT; - while (bits >= DECODE_BITS) { - - // Find next channel with data - for (; !ChannelIsActive(channelMask, rawFECChannel); rawFECChannel++) { - } - - // Unpack data for cluster finder - o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannel, fecInPartition); - - WriteCharge(ctx.clusterer, byte, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + nSamplesWritten); - - byte = byte >> DECODE_BITS; - bits -= DECODE_BITS; - nSamplesWritten++; - rawFECChannel++; // Ensure we don't decode same channel twice - } // while (bits >= DECODE_BITS) - } // while (nSamplesWritten < nAdc) - - } else { // ! TPCZSHDRV2::TIGHTLY_PACKED_V3 - uint32_t rawFECChannel = 0; - const uint64_t* adcData64 = (const uint64_t*)adcData; - for (uint32_t j = 0; j < nAdc; j++) { - for (; !ChannelIsActive(channelMask, rawFECChannel); rawFECChannel++) { - } - - uint32_t adc = (adcData64[j / TPCZSHDRV2::SAMPLESPER64BIT] >> ((j % TPCZSHDRV2::SAMPLESPER64BIT) * DECODE_BITS)) & DECODE_MASK; - - o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannel, fecInPartition); - float charge = ADCToFloat(adc, DECODE_MASK, DECODE_BITS_FACTOR); - WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + j); - rawFECChannel++; - } - } -} - -GPUd() void GPUTPCCFDecodeZSLink::DecodeTBMultiThread( +GPUd() void GPUTPCCFDecodeZSLink::DecodeTB( GPUSharedMemory& smem, DecodeCtx& ctx, const uint8_t* adcData, @@ -368,26 +303,6 @@ GPUd() void GPUTPCCFDecodeZSLink::DecodeTBMultiThread( uint8_t myOffset = warp_scan_inclusive_add(myChannelActive) - 1 + blockOffset; blockOffset = warp_broadcast(myOffset, NTHREADS - 1) + 1; - // Decode entire timebin at once if we have enough threads - // This should further improve performance, but code below is buggy... - // if (nAdc <= NThreads) { - // for (int32_t j = 1; blockOffset < nAdc; j++) { - // rawFECChannel = myChannelActive ? rawFECChannel : (iThread + j*NThreads - myOffset); - - // bool iAmIdle = not myChannelActive; - - // myChannelActive = - // rawFECChannel < zerosupp_link_based::CommonHeaderlPerTBHeader - // ? BitIsSet(channelMask, rawFECChannel) - // : false; - - // uint8_t newOffset = warp_scan_inclusive_add(static_cast(myChannelActive && iAmIdle)) - 1 + blockOffset; - // blockOffset = warp_broadcast(newOffset, NThreads - 1) + 1; - - // myOffset = iAmIdle ? newOffset : myOffset; - // } - // } - if (not myChannelActive) { continue; } @@ -397,28 +312,16 @@ GPUd() void GPUTPCCFDecodeZSLink::DecodeTBMultiThread( if constexpr (TPCZSHDRV2::TIGHTLY_PACKED_V3) { - // Try to access adcData with 4 byte reads instead of 1 byte. - // You'd think this would improve performace, but it's actually slower... - // const uint32_t* adcDataU32 = reinterpret_cast(adcData); - uint32_t adcBitOffset = myOffset * DECODE_BITS; uint32_t adcByteOffset = adcBitOffset / CHAR_BIT; uint32_t adcOffsetInByte = adcBitOffset - adcByteOffset * CHAR_BIT; - // uint32_t adcByteOffset = adcBitOffset / 32; - // uint32_t adcOffsetInByte = adcBitOffset - adcByteOffset * 32; uint32_t byte = 0, bits = 0; - // uint32_t byte = adcDataU32[adcByteOffset] >> adcOffsetInByte; - // uint32_t bits = 32 - adcOffsetInByte; - // adcByteOffset++; - while (bits < DECODE_BITS) { byte |= ((uint32_t)adcData[adcByteOffset]) << bits; - // byte |= adcDataU32[adcByteOffset] << bits; adcByteOffset++; bits += CHAR_BIT; - // bits += 32; } adc = byte >> adcOffsetInByte; @@ -601,12 +504,6 @@ GPUd() void GPUTPCCFDecodeZSDenseLink::Thread<0>(int32_t nBlocks, int32_t nThrea GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& ctx) { -#ifdef GPUCA_GPUCODE - constexpr bool DecodeInParallel = true; -#else - constexpr bool DecodeInParallel = false; -#endif - const uint8_t* const pageStart = ctx.page; const auto* rawDataHeader = Peek(ctx.page); @@ -651,13 +548,13 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, Dec } if ((uint16_t)(raw::RDHUtils::getPageCounter(rawDataHeader) + 1) == raw::RDHUtils::getPageCounter(nextPage)) { - nSamplesWrittenTB = DecodeTB(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage); + nSamplesWrittenTB = DecodeTB(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage); } else { err = GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF; break; } } else { - nSamplesWrittenTB = DecodeTB(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage); + nSamplesWrittenTB = DecodeTB(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage); } // Abort decoding the page if an error was detected. @@ -712,30 +609,8 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, Dec return ctx.pageDigitOffset; } -template -GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB( - [[maybe_unused]] GPUSharedMemory& smem, - DecodeCtx& ctx, - const header::RAWDataHeader* rawDataHeader, - int32_t cru, - uint16_t nSamplesLeftInPage, - const uint8_t* payloadEnd, - const uint8_t* nextPage) -{ - - if constexpr (DecodeInParallel) { - return DecodeTBMultiThread(smem, ctx, rawDataHeader, cru, nSamplesLeftInPage, payloadEnd, nextPage); - } else { - int16_t nSamplesWritten = 0; - if (ctx.iThread == 0) { - nSamplesWritten = DecodeTBSingleThread(ctx, rawDataHeader, cru, nSamplesLeftInPage, payloadEnd, nextPage); - } - return warp_broadcast(nSamplesWritten, 0); - } -} - template -GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( +GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB( GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, @@ -883,123 +758,6 @@ GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread( #undef MAYBE_PAGE_OVERFLOW } -template -GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread( - DecodeCtx& ctx, - const header::RAWDataHeader* rawDataHeader, - int32_t cru, - uint16_t nSamplesLeftInPage, - const uint8_t* payloadEnd, - const uint8_t* nextPage) -{ -#define MAYBE_PAGE_OVERFLOW(pagePtr) \ - if constexpr (PayloadExtendsToNextPage) { \ - if (pagePtr >= payloadEnd && pagePtr < nextPage) { \ - ptrdiff_t diff = pagePtr - payloadEnd; \ - pagePtr = nextPage; \ - ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff); \ - } \ - } else { \ - if (pagePtr > payloadEnd) { \ - return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; \ - } \ - } - - using zerosupp_link_based::ChannelPerTBHeader; - - const CfFragment& fragment = ctx.clusterer.mPmemory->fragment; - - uint8_t linkIds[MaxNLinksPerTimebin]; - uint8_t channelMasks[MaxNLinksPerTimebin * 10] = {0}; - uint16_t nSamplesWritten = 0; - - // Read timebin block header - uint16_t tbbHdr = ConsumeByte(ctx.page); - MAYBE_PAGE_OVERFLOW(ctx.page); - tbbHdr |= static_cast(ConsumeByte(ctx.page)) << CHAR_BIT; - MAYBE_PAGE_OVERFLOW(ctx.page); - - uint8_t nLinksInTimebin = tbbHdr & 0x000F; - uint16_t linkBC = (tbbHdr & 0xFFF0) >> 4; - int32_t timeBin = (linkBC + (uint64_t)(raw::RDHUtils::getHeartBeatOrbit(*rawDataHeader) - ctx.firstHBF) * constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN; - - uint16_t nSamplesInTB = 0; - - // Read timebin link headers - for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) { - uint8_t timebinLinkHeaderStart = ConsumeByte(ctx.page); - MAYBE_PAGE_OVERFLOW(ctx.page); - - linkIds[iLink] = timebinLinkHeaderStart & 0b00011111; - - bool bitmaskIsFlat = timebinLinkHeaderStart & 0b00100000; - - uint16_t bitmaskL2 = 0x0FFF; - if (not bitmaskIsFlat) { - bitmaskL2 = static_cast(timebinLinkHeaderStart & 0b11000000) << 2 | static_cast(ConsumeByte(ctx.page)); - MAYBE_PAGE_OVERFLOW(ctx.page); - } - - for (int32_t i = 0; i < 10; i++) { - if (bitmaskL2 & 1 << i) { - nSamplesInTB += CAMath::Popcount(*Peek(ctx.page)); - channelMasks[10 * iLink + i] = ConsumeByte(ctx.page); - MAYBE_PAGE_OVERFLOW(ctx.page); - } - } - - } // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) - - if (nSamplesInTB > nSamplesLeftInPage) { - return -GPUErrors::ERROR_TPCZS_INVALID_NADC; - } - - const uint8_t* adcData = ConsumeBytes(ctx.page, (nSamplesInTB * DECODE_BITS + 7) / 8); - MAYBE_PAGE_OVERFLOW(ctx.page); - - bool discardTimeBin = not fragment.contains(timeBin); - discardTimeBin |= (ctx.tpcTimeBinCut > 0 && timeBin > ctx.tpcTimeBinCut); - - if (discardTimeBin) { - return FillWithInvalid(ctx.clusterer, 0, 1, ctx.pageDigitOffset, nSamplesInTB); - } - - // Unpack ADC - uint32_t byte = 0, bits = 0; - uint16_t rawFECChannel = 0; - - // unpack adc values, assume tightly packed data - while (nSamplesWritten < nSamplesInTB) { - byte |= static_cast(ConsumeByte(adcData)) << bits; - MAYBE_PAGE_OVERFLOW(adcData); - bits += CHAR_BIT; - while (bits >= DECODE_BITS) { - - // Find next channel with data - for (; !ChannelIsActive(channelMasks, rawFECChannel); rawFECChannel++) { - } - - int32_t iLink = rawFECChannel / ChannelPerTBHeader; - int32_t rawFECChannelLink = rawFECChannel % ChannelPerTBHeader; - - // Unpack data for cluster finder - o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannelLink, linkIds[iLink]); - - float charge = ADCToFloat(byte, DECODE_MASK, DECODE_BITS_FACTOR); - WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + nSamplesWritten); - - byte >>= DECODE_BITS; - bits -= DECODE_BITS; - nSamplesWritten++; - rawFECChannel++; // Ensure we don't decode same channel twice - } // while (bits >= DECODE_BITS) - } // while (nSamplesWritten < nAdc) - - return nSamplesWritten; - -#undef MAYBE_PAGE_OVERFLOW -} - GPUd() bool GPUTPCCFDecodeZSDenseLink::ChannelIsActive(const uint8_t* chan, uint16_t chanIndex) { constexpr uint8_t N_BITS_PER_ENTRY = sizeof(*chan) * CHAR_BIT; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h index 750df643f2d10..c633a5ebc2774 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h @@ -21,7 +21,7 @@ #include "TPCBase/PadPos.h" #include "DataFormatsTPC/ZeroSuppression.h" #include "DataFormatsTPC/ZeroSuppressionLinkBased.h" -#include "DetectorsRaw/RDHUtils.h" +#include "Headers/RAWDataHeader.h" namespace o2::gpu { @@ -148,8 +148,7 @@ class GPUTPCCFDecodeZSLink : public GPUTPCCFDecodeZSLinkBase GPUd() static void GetChannelBitmask(const tpc::zerosupp_link_based::CommonHeader& tbHdr, uint32_t* chan); GPUd() static bool ChannelIsActive(const uint32_t* chan, uint8_t chanIndex); - GPUd() static void DecodeTBSingleThread(DecodeCtx& ctx, const uint8_t* adcData, uint32_t nAdc, const uint32_t* channelMask, int32_t timeBin, int32_t cru, int32_t fecInPartition); - GPUd() static void DecodeTBMultiThread(GPUSharedMemory& smem, DecodeCtx& ctx, const uint8_t* adcData, uint32_t nAdc, const uint32_t* channelMask, int32_t timeBin, int32_t cru, int32_t fecInPartition); + GPUd() static void DecodeTB(GPUSharedMemory& smem, DecodeCtx& ctx, const uint8_t* adcData, uint32_t nAdc, const uint32_t* channelMask, int32_t timeBin, int32_t cru, int32_t fecInPartition); }; class GPUTPCCFDecodeZSDenseLink : public GPUTPCCFDecodeZSLinkBase @@ -179,14 +178,8 @@ class GPUTPCCFDecodeZSDenseLink : public GPUTPCCFDecodeZSLinkBase // Decode a single timebin within an 8kb page. // Returns the number of samples decoded from the page // or negative value to indicate an error (no samples are written in this case) - template - GPUd() static int16_t DecodeTB(GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); - template - GPUd() static int16_t DecodeTBSingleThread(DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); - - template - GPUd() static int16_t DecodeTBMultiThread(GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); + GPUd() static int16_t DecodeTB(GPUSharedMemory& smem, DecodeCtx& ctx, const header::RAWDataHeader* rawDataHeader, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage); }; } // namespace o2::gpu