diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 626137d821e10..e8eb436d6ef5c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -50,88 +50,72 @@ using namespace o2::tpc::constants; using namespace o2::dataformats; #ifdef GPUCA_TPC_GEOMETRY_O2 -std::pair GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment &fragment) +std::pair GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment) { bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder::ZSOffset *o = processors()->tpcClusterer[iSector].mPzsOffsets; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder::ZSOffset* o = processors()->tpcClusterer[iSector].mPzsOffsets; uint32_t digits = 0; uint32_t pages = 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j]; - if (doGPU) - { + if (doGPU) { uint16_t posInEndpoint = 0; uint16_t pagesEndpoint = 0; - for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) - { + for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) { const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0; const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - for (uint32_t l = pageFirst; l < pageLast; l++) - { + for (uint32_t l = pageFirst; l < pageLast; l++) { uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++]; - if (pageDigits) - { + if (pageDigits) { *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint}; digits += pageDigits; } pagesEndpoint++; } } - if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) - { - if (GetProcessingSettings().ignoreNonFatalGPUErrors) - { + if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) { GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()); return {0, 0}; - } - else - { + } else { GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()); } } - } - else - { + } else { clusterer.mPzsOffsets[j] = GPUTPCClusterFinder::ZSOffset{digits, j, 0}; digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j]; pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j]; } } - if (doGPU) - { + if (doGPU) { pages = o - processors()->tpcClusterer[iSector].mPzsOffsets; } - if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { + if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { 
TPCClusterizerEnsureZSOffsets(iSector, fragment); } return {digits, pages}; } -void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment &fragment) +void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment) { - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; uint32_t nAdcs = 0; - for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) - { - const auto &data = mCFContext->fragmentData[fragment.index]; + for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) { + const auto& data = mCFContext->fragmentData[fragment.index]; uint32_t pagesEndpoint = 0; const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint]; const uint32_t nPagesExpected = data.nPages[iSector][endpoint]; uint32_t nAdcDecoded = 0; - const auto &zs = mIOPtrs.tpcZS->sector[iSector]; - for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) - { + const auto& zs = mIOPtrs.tpcZS->sector[iSector]; + for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) { const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0; const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i]; - for (uint32_t j = pageFirst; j < pageLast; j++) - { - const uint8_t *page = static_cast(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const header::RAWDataHeader *rawDataHeader = reinterpret_cast(page); - const TPCZSHDRV2 *decHdr = reinterpret_cast(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); + for (uint32_t j = pageFirst; j < pageLast; j++) { + const uint8_t* page = static_cast(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const header::RAWDataHeader* rawDataHeader = reinterpret_cast(page); + const TPCZSHDRV2* decHdr = reinterpret_cast(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); const uint16_t nSamplesInPage = decHdr->nADCsamples; nAdcDecoded += nSamplesInPage; @@ -139,18 +123,15 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF } } - if (pagesEndpoint != nPagesExpected) - { + if (pagesEndpoint != nPagesExpected) { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %lu", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected); } - if (nAdcDecoded != nAdcsExpected) - { + if (nAdcDecoded != nAdcsExpected) { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded); } - if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) - { + if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset); } @@ -160,13 +141,12 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF namespace { - struct TPCCFDecodeScanTmp - { - int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; - }; +struct TPCCFDecodeScanTmp { + int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, 
pageCounter; +}; } // namespace -std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment &fragment) +std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment) { mRec->getGeneralStepTimer(GeneralStep::Prepare).Start(); uint32_t nDigits = 0; @@ -174,20 +154,16 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS]; memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples)); bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) - { - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) - { - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) - { - Vc::Common::prefetchMid(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchMid(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) { + Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } } } @@ -196,8 +172,7 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint std::vector> fragments; fragments.reserve(mCFContext->nFragments); fragments.emplace_back(std::pair{fragment, {0, 0, 0, 0, 0, -1}}); - for (uint32_t i = 1; i < mCFContext->nFragments; i++) - { + for (uint32_t i = 1; i < mCFContext->nFragments; i++) { fragments.emplace_back(std::pair{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}}); } std::vector fragmentExtends(mCFContext->nFragments, false); @@ -205,82 +180,64 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t firstPossibleFragment = 0; uint32_t pageCounter = 0; uint32_t emptyPages = 0; - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) - { - if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) - { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { + if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != 
(int32_t)iSector) { break; } nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) - { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) - { - Vc::Common::prefetchForOneRead(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchForOneRead(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) { + Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } #endif - const uint8_t *const page = ((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const o2::header::RAWDataHeader *rdh = (const o2::header::RAWDataHeader *)page; - if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) - { + const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page; + if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) { emptyPages++; continue; } pageCounter++; - const TPCZSHDR *const hdr = (const TPCZSHDR *)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader))); - if (mCFContext->zsVersion == -1) - { + const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? 
(page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader))); + if (mCFContext->zsVersion == -1) { mCFContext->zsVersion = hdr->version; - if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) - { // TODO: Move tpcTriggerHandling to recoSteps bitmask + if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask static bool errorShown = false; - if (errorShown == false) - { + if (errorShown == false) { GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion); } errorShown = true; } - } - else if (mCFContext->zsVersion != (int32_t)hdr->version) - { + } else if (mCFContext->zsVersion != (int32_t)hdr->version) { GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh)); constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1; char dumpBuffer[bufferSize]; - for (size_t i = 0; i < sizeof(*rdh); i++) - { + for (size_t i = 0; i < sizeof(*rdh); i++) { // "%02X " guaranteed to be 3 chars + ending 0. - snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t *)rdh)[i]); + snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]); } GPUAlarm("RDH of page: %s", dumpBuffer); - for (size_t i = 0; i < sizeof(*hdr); i++) - { + for (size_t i = 0; i < sizeof(*hdr); i++) { // "%02X " guaranteed to be 3 chars + ending 0. 
- snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t *)hdr)[i]); + snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]); } GPUAlarm("Metainfo of page: %s", dumpBuffer); - if (GetProcessingSettings().ignoreNonFatalGPUErrors) - { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) { mCFContext->abandonTimeframe = true; return {0, 0}; - } - else - { + } else { GPUFatal("Cannot process with invalid TPC ZS data, exiting"); } } - if (GetProcessingSettings().param.tpcTriggerHandling) - { - const TPCZSHDRV2 *const hdr2 = (const TPCZSHDRV2 *)hdr; - if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) - { - const char *triggerWord = (const char *)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE; + if (GetProcessingSettings().param.tpcTriggerHandling) { + const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr; + if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) { + const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE; o2::tpc::TriggerInfoDLBZS tmp; - memcpy((void *)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE); + memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE); tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh); - if (tmp.triggerWord.isValid(0)) - { + if (tmp.triggerWord.isValid(0)) { mTriggerBuffer->triggers.emplace(tmp); } } @@ -289,37 +246,28 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint endpointAdcSamples[j] += hdr->nADCsamples; uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN; uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - const TPCZSHDRV2 *const hdr2 = (const TPCZSHDRV2 *)hdr; - if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) - { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr; + if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) { maxTimeBin += 256; } } - if (maxTimeBin > mCFContext->tpcMaxTimeBin) - { + if (maxTimeBin > mCFContext->tpcMaxTimeBin) { mCFContext->tpcMaxTimeBin = maxTimeBin; } bool extendsInNextPage = false; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) - { - const o2::header::RAWDataHeader *nextrdh = (const o2::header::RAWDataHeader *)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE); + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) { + const o2::header::RAWDataHeader* nextrdh = (const o2::header::RAWDataHeader*)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE); extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader); } } - while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) - { + while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) { firstPossibleFragment--; } - auto handleExtends = [&](uint32_t ff) - { - if (fragmentExtends[ff]) - { - if (doGPU) - { + auto handleExtends = [&](uint32_t ff) { + if (fragmentExtends[ff]) { + if (doGPU) { // Only add extended page on GPU. 
On CPU the pages are in consecutive memory anyway. // Not adding the page prevents an issue where a page is decoded twice on CPU, when only the extend should be decoded. fragments[ff].second.zsPageLast++; @@ -329,57 +277,39 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint fragmentExtends[ff] = false; } }; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) - { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) { handleExtends(ff); } } - for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) - { - if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) - { - if (!fragments[f].second.hasData) - { + for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) { + if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) { + if (!fragments[f].second.hasData) { fragments[f].second.hasData = 1; fragments[f].second.zsPtrFirst = k; fragments[f].second.zsPageFirst = l; - } - else - { - if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) - { + } else { + if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) { mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1; - for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) - { - for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) - { - if (doGPU) - { + for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) { + for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) { + if (doGPU) { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0); - } - else - { + } else { // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters - const uint8_t *const pageTmp = ((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const o2::header::RAWDataHeader *rdhTmp = (const o2::header::RAWDataHeader *)pageTmp; - if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) - { - const TPCZSHDR *const hdrTmp = (const TPCZSHDR *)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader))); + const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp; + if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) { + const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? 
(pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader))); mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples; } } } } - } - else if (emptyPages) - { + } else if (emptyPages) { mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages; - if (doGPU) - { - for (uint32_t m = 0; m < emptyPages; m++) - { + if (doGPU) { + for (uint32_t m = 0; m < emptyPages; m++) { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0); } } @@ -390,28 +320,20 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint fragments[f].second.pageCounter = pageCounter; mCFContext->fragmentData[f].nPages[iSector][j]++; mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples; - if (doGPU) - { + if (doGPU) { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples); } fragmentExtends[f] = extendsInNextPage; - } - else - { + } else { handleExtends(f); - if (timeBin < (uint32_t)fragments[f].first.last()) - { - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) - { + if (timeBin < (uint32_t)fragments[f].first.last()) { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) { handleExtends(ff); } } break; - } - else - { + } else { firstPossibleFragment = f + 1; } } @@ -419,8 +341,7 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint emptyPages = 0; } } - for (uint32_t f = 0; f < mCFContext->nFragments; f++) - { + for (uint32_t f = 0; f < mCFContext->nFragments; f++) { mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast; mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst; mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast; @@ -431,20 +352,16 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint mCFContext->nPagesSector[iSector] = nPages; mCFContext->nDigitsEndpointMax[iSector] = 0; - for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) - { - if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) - { + for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) { + if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) { mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i]; } } uint32_t nDigitsFragmentMax = 0; - for (uint32_t i = 0; i < mCFContext->nFragments; i++) - { + for (uint32_t i = 0; i < mCFContext->nFragments; i++) { uint32_t pagesInFragment = 0; uint32_t digitsInFragment = 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j]; digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j]; } @@ -455,36 +372,29 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint return {nDigits, nDigitsFragmentMax}; } -void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clusterer, GPUTPCClusterFinder &clustererShadow, int32_t stage, bool doGPU, int32_t lane) +void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane) { - auto &in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions; - auto &out = stage ? 
clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions; - if (doGPU) - { + auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions; + auto& out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions; + if (doGPU) { const uint32_t iSector = clusterer.mISector; - auto &count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; + auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; std::vector counts; uint32_t nSteps = clusterer.getNSteps(count); - if (nSteps > clusterer.mNBufs) - { + if (nSteps > clusterer.mNBufs) { GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs); exit(1); } size_t tmpCount = count; - if (nSteps > 1) - { - for (uint32_t i = 1; i < nSteps; i++) - { + if (nSteps > 1) { + for (uint32_t i = 1; i < nSteps; i++) { counts.push_back(tmpCount); - if (i == 1) - { + if (i == 1) { runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, stage); - } - else - { + } else { runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, tmpCount); } tmpCount = (tmpCount + clusterer.mScanWorkGroupSize - 1) / clusterer.mScanWorkGroupSize; @@ -492,24 +402,19 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clust runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, nSteps, tmpCount); - for (uint32_t i = nSteps - 1; i > 1; i--) - { + for (uint32_t i = nSteps - 1; i > 1; i--) { tmpCount = counts[i - 1]; runKernel({GetGrid(tmpCount - clusterer.mScanWorkGroupSize, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, clusterer.mScanWorkGroupSize, tmpCount); } } runKernel({GetGrid(count, clusterer.mScanWorkGroupSize, lane), {iSector}}, 1, stage, in, out); - } - else - { - auto &nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; - auto &nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; + } else { + auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; + auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; size_t count = 0; - for (size_t i = 0; i < nIn; i++) - { - if (clusterer.mPisPeak[i]) - { + for (size_t i = 0; i < nIn; i++) { + if (clusterer.mPisPeak[i]) { out[count++] = in[i]; } } @@ -517,33 +422,28 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clust } } -std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment &fragment, int32_t lane) +std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane) { bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (mCFContext->abandonTimeframe) - { + if (mCFContext->abandonTimeframe) { return {0, 0}; } - const auto &retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); - if (doGPU) - { - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder &clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + if (doGPU) { + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSector] : clusterer; uint32_t nPagesSector = 0; - for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { uint32_t nPages = 0; mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j]; mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE; - for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) - { + for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) { const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0; const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - if (max > min) - { - char *src = (char *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE; - char *ptrLast = (char *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE; - size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader *)ptrLast); + if (max > min) { + char* src = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE; + char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE; + size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast); GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true); } nPages += max - min; @@ -561,10 +461,8 @@ std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) { bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - if (restorePointers) - { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + if (restorePointers) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost; processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice; processorsShadow()->tpcClusterer[iSector].mPzs = mCFContext->ptrSave[iSector].zsDevice; @@ -572,10 +470,9 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave; return 0; } - const auto &threadContext = GetThreadContext(); + const auto& threadContext = GetThreadContext(); mRec->MemoryScalers()->nTPCdigits = 0; - if (mCFContext == nullptr) - { + if (mCFContext == nullptr) { mCFContext.reset(new GPUTPCCFChainContext); } const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen; @@ -583,114 +480,86 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) mCFContext->tpcMaxTimeBin = maxAllowedTimebin; const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen}; mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax); - if (GetProcessingSettings().param.tpcTriggerHandling) - { + if (GetProcessingSettings().param.tpcTriggerHandling) { mTriggerBuffer->triggers.clear(); } - if (mIOPtrs.tpcZS) - { + if (mIOPtrs.tpcZS) { uint32_t 
nDigitsFragmentMax[NSECTORS]; mCFContext->zsVersion = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { - if (mIOPtrs.tpcZS->sector[iSector].count[0]) - { - const void *rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]; - if (rdh && o2::raw::RDHUtils::getVersion() > o2::raw::RDHUtils::getVersion(rdh)) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + if (mIOPtrs.tpcZS->sector[iSector].count[0]) { + const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]; + if (rdh && o2::raw::RDHUtils::getVersion() > o2::raw::RDHUtils::getVersion(rdh)) { GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion()); return 1; } } #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) - { - for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) - { - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) - { - Vc::Common::prefetchFar(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchFar(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) { + for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { + Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } } } } #endif - const auto &x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); + const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); nDigitsFragmentMax[iSector] = x.first; processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first; mRec->MemoryScalers()->nTPCdigits += x.first; } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { uint32_t nDigitsBase = nDigitsFragmentMax[iSector]; uint32_t threshold = 40000000; uint32_t nDigitsScaled = nDigitsBase > threshold ? 
nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase); processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]); - if (doGPU) - { + if (doGPU) { processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]); } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) - { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec); mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec); - } - else - { + } else { AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId); AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId); } } - } - else - { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + } else { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]; mRec->MemoryScalers()->nTPCdigits += nDigits; processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0); } } - if (mIOPtrs.tpcZS) - { + if (mIOPtrs.tpcZS) { GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits); - } - else - { + } else { GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits); } - if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) - { + if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) { GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin); - if (GetProcessingSettings().ignoreNonFatalGPUErrors) - { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) { mCFContext->abandonTimeframe = true; mCFContext->tpcMaxTimeBin = maxAllowedTimebin; - } - else - { + } else { return 1; } } mCFContext->fragmentFirst = CfFragment{std::max(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen}; - for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) - { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { + for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector); } } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) - { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets; mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets; mCFContext->ptrSave[iSector].zsDevice = processorsShadow()->tpcClusterer[iSector].mPzs; @@ -702,66 +571,55 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool 
restorePointers) int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) { - if (param().rec.fwdTPCDigitsAsClusters) - { + if (param().rec.fwdTPCDigitsAsClusters) { return ForwardTPCDigits(); } #ifdef GPUCA_TPC_GEOMETRY_O2 int32_t tpcTimeBinCut = mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin; mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST")); - const auto &threadContext = GetThreadContext(); + const auto& threadContext = GetThreadContext(); const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) - { + if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { return 1; } - if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) - { + if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { mRec->SetNActiveThreads(mRec->MemoryScalers()->nTPCdigits / 6000); } mRec->MemoryScalers()->nTPCHits = mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits); float tpcHitLowOccupancyScalingFactor = 1.f; - if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) - { + if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) { uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits; uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF; - if (mIOPtrs.settingsTF->nHBFPerTF < 64) - { + if (mIOPtrs.settingsTF->nHBFPerTF < 64) { threshold *= 2; } mRec->MemoryScalers()->nTPCHits = std::max(nHitsBase, std::min(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating exceiive clusters - if (nHitsBase < threshold) - { + if (nHitsBase < threshold) { float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 
2.25f : 1.75f); mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase); tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase); } } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes } mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) - { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } #ifdef GPUCA_HAS_ONNX uint32_t maxClusters = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { - GPUTPCNNClusterizer &clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer &nn_settings = GetProcessingSettings().nn; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -773,12 +631,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerTotalClusters = maxClusters; clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (nn_settings.nnClusterizerVerbosity < 0) - { + if (nn_settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } - else - { + } else { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; @@ -787,55 +642,46 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #endif - if (doGPU && mIOPtrs.tpcZS) - { + if (doGPU && mIOPtrs.tpcZS) { processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char *)&processors()->ioPtrs - (char *)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); } - if (doGPU) - { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char *)processors()->tpcClusterer - (char *)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + if (doGPU) { + 
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } size_t nClsTotal = 0; - ClusterNativeAccess *tmpNativeAccess = mClusterNativeAccess.get(); - ClusterNative *tmpNativeClusters = nullptr; + ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); + ClusterNative* tmpNativeClusters = nullptr; std::unique_ptr tmpNativeClusterBuffer; // setup MC Labels bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; - auto *digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; + auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; - if (buildNativeGPU) - { + if (buildNativeGPU) { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); } - if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) - { - if (mWaitForFinalInputs) - { + if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) { + if (mWaitForFinalInputs) { GPUFatal("Cannot use waitForFinalInput callback without delayed output"); } - if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) - { + if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); tmpNativeClusters = mInputsHost->mPclusterNativeOutput; - } - else - { + } else { tmpNativeClusterBuffer = std::make_unique(mInputsHost->mNClusterNative); tmpNativeClusters = tmpNativeClusterBuffer.get(); } } GPUTPCLinearLabels mcLinearLabels; - if (propagateMCLabels) - { + if (propagateMCLabels) { // No need to overallocate here, nTPCHits is anyway an upper bound used for the GPU cluster buffer, and we can always enlarge the buffer anyway mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2); mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits); @@ -844,10 +690,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int8_t transferRunning[NSECTORS] = {0}; uint32_t outputQueueStart = mOutputQueue.size(); - auto notifyForeignChainFinished = [this]() - { - if (mPipelineNotifyCtx) - { + auto notifyForeignChainFinished = [this]() { + if (mPipelineNotifyCtx) { SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory { std::lock_guard lock(mPipelineNotifyCtx->mutex); @@ -858,56 +702,44 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) }; bool synchronizeCalibUpdate = false; - for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) - { + for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) { std::vector laneHasData(GetProcessingSettings().nTPCClustererLanes, false); 
static_assert(NSECTORS <= GPUCA_MAX_STREAMS, "Stream events must be able to hold all sectors"); const int32_t maxLane = std::min(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase); - for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) - { - if (GetProcessingSettings().debugLevel >= 3) - { + for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1); } - mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) - { - if (doGPU && fragment.index != 0) - { + mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { + if (doGPU && fragment.index != 0) { SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished } uint32_t iSector = iSectorBase + lane; - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder &clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; clusterer.mPmemory->fragment = fragment; - if (mIOPtrs.tpcPackedDigits) - { + if (mIOPtrs.tpcPackedDigits) { bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; - auto *inDigits = mIOPtrs.tpcPackedDigits; + auto* inDigits = mIOPtrs.tpcPackedDigits; size_t numDigits = inDigits->nTPCDigits[iSector]; - if (setDigitsOnGPU) - { + if (setDigitsOnGPU) { GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); } - if (setDigitsOnHost) - { - clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) + if (setDigitsOnHost) { + clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) } clusterer.mPmemory->counters.nDigits = numDigits; } - if (mIOPtrs.tpcZS) - { - if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { + if (mIOPtrs.tpcZS) { + if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first; clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second; - } - else - { + } else { clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0; } } @@ -917,82 +749,68 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) using PeakMapType = decltype(*clustererShadow.mPpeakMap); runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); - if (fragment.index == 0) - { + if (fragment.index == 0) { runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); } DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - if (doGPU) - { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { + if (doGPU) { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); } SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory } - if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) - { + if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) { clusterer.mPmemory->counters.nPositions = 0; return; } - if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) - { + if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) { clusterer.mPmemory->counters.nPositions = 0; return; } - if (propagateMCLabels && fragment.index == 0) - { + if (propagateMCLabels && fragment.index == 0) { clusterer.PrepareMC(); clusterer.mPinputLabels = digitsMC->v[iSector]; - if (clusterer.mPinputLabels == nullptr) - { + if (clusterer.mPinputLabels == nullptr) { GPUFatal("MC label container missing, sector %d", iSector); } - if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) - { + if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) { GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize()); } } - if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) - { - if (not mIOPtrs.tpcZS) - { + if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) { + if (not mIOPtrs.tpcZS) { runKernel({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr); TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - else if (propagateMCLabels) - { + } else if (propagateMCLabels) { runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr); TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); } } - if (mIOPtrs.tpcZS) - { - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); + if (mIOPtrs.tpcZS) { + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? 
mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS; (void)tpcTimeBinCut; // TODO: To be used in decoding kernels - switch (mCFContext->zsVersion) - { - default: - GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); - break; - case ZSVersionRowBased10BitADC: - case ZSVersionRowBased12BitADC: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionLinkBasedWithMeta: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionDenseLinkBased: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; + switch (mCFContext->zsVersion) { + default: + GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); + break; + case ZSVersionRowBased10BitADC: + case ZSVersionRowBased12BitADC: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionLinkBasedWithMeta: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionDenseLinkBased: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; } TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); } // clang-format off
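The hunks above are formatting-only: in every removed/added pair the reference and pointer qualifiers move from the name to the type (`const CfFragment &fragment` becomes `const CfFragment& fragment`, `const uint8_t *page` becomes `const uint8_t* page`), opening braces of control statements, lambdas and struct definitions are attached to the preceding line instead of standing on their own line, and declarations inside the anonymous namespace lose their extra indentation; in the hunks shown here no statement is added, removed or reordered. The minimal, self-contained C++ sketch below only spells out the convention the patch converges on; the function and parameter names (countNonEmptyPages, pageSizes, nPages) are hypothetical and are not taken from GPUChainTrackingClusterizer.cxx.

#include <cstdint>
#include <utility>

// Post-patch style: '&'/'*' bind to the type, control-flow braces attach to the
// statement, and the function-definition brace stays on its own line.
std::pair<uint32_t, uint32_t> countNonEmptyPages(const uint8_t* pageSizes, uint32_t nPages)
{
  uint32_t nonEmpty = 0;
  for (uint32_t i = 0; i < nPages; i++) {
    if (pageSizes[i] != 0) { // attached brace, previously "if (...)" with "{" on the following line
      nonEmpty++;
    }
  }
  return {nonEmpty, nPages};
}

Function-definition braces are left untouched by the patch (they appear only as unchanged context next to the reformatted signatures), so the sketch keeps them on their own line as well.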