From 728b78a1aef2b7e518045b00177b049fae1d0c3d Mon Sep 17 00:00:00 2001 From: Felix Weiglhofer Date: Fri, 24 Oct 2025 14:41:57 +0200 Subject: [PATCH] GPU: Add a fallback implementation for Vc. --- .../Global/GPUChainTrackingClusterizer.cxx | 16 +- .../GPUTPCCFCheckPadBaseline.cxx | 30 +-- GPU/GPUTracking/utils/VcShim.h | 192 ++++++++++++++++++ 3 files changed, 199 insertions(+), 39 deletions(-) create mode 100644 GPU/GPUTracking/utils/VcShim.h diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b0d466f13e5ef..fdd70e150adcb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -53,13 +53,10 @@ #include "CommonDataFormat/InteractionRecord.h" #endif +#include "utils/VcShim.h" #include "utils/strtag.h" #include -#ifndef GPUCA_NO_VC -#include -#endif - using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -173,7 +170,7 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? 
o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { -#ifndef GPUCA_NO_VC + if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) { for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) { for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) { @@ -182,7 +179,6 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint } } } -#endif std::vector> fragments; fragments.reserve(mCFContext->nFragments); @@ -201,12 +197,12 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint } nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { -#ifndef GPUCA_NO_VC + if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) { Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } -#endif + const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page; if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) { @@ -510,7 +506,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) return 1; } } -#ifndef GPUCA_NO_VC + if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) { for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { @@ -521,7 +517,7 @@ int32_t 
GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) } } } -#endif + const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); nDigitsFragmentMax[iSector] = x.first; processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx index ec084c308312e..3248e517ff465 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx @@ -19,11 +19,7 @@ #include "clusterFinderDefs.h" #ifndef GPUCA_GPUCODE -#ifndef GPUCA_NO_VC -#include -#else -#include -#endif +#include "utils/VcShim.h" #endif using namespace o2::gpu; @@ -80,7 +76,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread constexpr size_t ElemsInTileRow = (size_t)TilingLayout>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline; -#ifndef GPUCA_NO_VC using UShort8 = Vc::fixed_size_simd; using Charge8 = Vc::fixed_size_simd; @@ -88,12 +83,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread UShort8 consecCharges{Vc::Zero}; UShort8 maxConsecCharges{Vc::Zero}; Charge8 maxCharge{Vc::Zero}; -#else - std::array totalCharges{0}; - std::array consecCharges{0}; - std::array maxConsecCharges{0}; - std::array maxCharge{0}; -#endif tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); @@ -102,7 +91,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) { for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) { -#ifndef GPUCA_NO_VC const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned}; const UShort8::mask_type isCharge = packedCharges != 0; @@ -123,22 +111,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t 
nThread } else { consecCharges = 0; } -#else // Vc not available - for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) { - const uint16_t packedCharge = packedChargeStart[PadsPerCacheline * localtime + localpad]; - const bool isCharge = packedCharge != 0; - if (isCharge) { - totalCharges[localpad]++; - consecCharges[localpad]++; - maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]); - - const Charge unpackedCharge = Charge(packedCharge) / Charge(1 << PackedCharge::DecimalBits); - maxCharge[localpad] = CAMath::Max(maxCharge[localpad], unpackedCharge); - } else { - consecCharges[localpad] = 0; - } - } -#endif } packedChargeStart += ElemsInTileRow; diff --git a/GPU/GPUTracking/utils/VcShim.h b/GPU/GPUTracking/utils/VcShim.h new file mode 100644 index 0000000000000..21a9a6a5c95c2 --- /dev/null +++ b/GPU/GPUTracking/utils/VcShim.h @@ -0,0 +1,192 @@ +// Copyright 2020-2025 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
/// \file VcShim.h
/// \brief Provides a basic fallback implementation for Vc
///
/// When GPUCA_NO_VC is not defined this header only includes the real Vc
/// library. Otherwise it defines, inside namespace Vc, the small set of names
/// declared below, implemented with plain scalar loops over std::array and
/// std::bitset.
///
/// \author Felix Weiglhofer

#ifndef GPU_UTILS_VCSHIM_H
#define GPU_UTILS_VCSHIM_H

#ifndef GPUCA_NO_VC

#include <Vc/Vc>

#else

#include <algorithm>
#include <array>
#include <bitset>
#include <cstddef>

namespace Vc
{

/// Tag selecting the zero-initialising constructor of fixed_size_simd (`Vc::Zero`).
constexpr struct VectorSpecialInitializerZero {
} Zero{};

/// Tag accepted by the load constructor of fixed_size_simd (`Vc::Aligned`).
/// The fallback copies element by element and does not use the tag's meaning.
constexpr struct AlignedTag {
} Aligned{};

/// Returns a mutable reference to the storage (`mData`) of \p v.
template <typename T>
typename T::vector_type& internal_data(T& v)
{
  return v.mData;
}

/// Returns a read-only reference to the storage (`mData`) of \p v.
template <typename T>
const typename T::vector_type& internal_data(const T& v)
{
  return v.mData;
}

namespace Common
{

/// Proxy returned by `vec(mask)`: applies an operation only to the lanes of
/// the vector whose mask bit is set.
///
/// Holds references to both the vector and the mask, so it must not outlive
/// the expression that created it.
template <typename V, typename M>
class WriteMaskVector
{
 private:
  const M& mMask;
  V& mVec;

 public:
  using value_type = typename V::value_type;

  WriteMaskVector(V& v, const M& m) : mMask(m), mVec(v) {}

  /// Adds 1 to every lane selected by the mask (adds the mask bit, i.e. 0 or 1, to each lane).
  WriteMaskVector& operator++(int)
  {
    for (std::size_t lane = 0; lane < mVec.size(); lane++) {
      mVec[lane] += value_type(mMask[lane]);
    }
    return *this;
  }

  /// Assigns \p v to every lane selected by the mask; other lanes are untouched.
  WriteMaskVector& operator=(const value_type& v)
  {
    for (std::size_t lane = 0; lane < mVec.size(); lane++) {
      if (mMask[lane]) {
        mVec[lane] = v;
      }
    }
    return *this;
  }
};

// The fallback issues no prefetch instructions: these are empty stand-ins so
// that callers compile unchanged.
inline void prefetchMid(const void*) {}
inline void prefetchFar(const void*) {}
inline void prefetchForOneRead(const void*) {}

} // namespace Common

/// Per-lane boolean mask with N lanes, stored in a std::bitset.
template <std::size_t N>
class fixed_size_simd_mask
{
 private:
  std::bitset<N> mData;

 public:
  /// True if at least one lane is set.
  bool isNotEmpty() const { return mData.any(); }

  // `typename` is required for the dependent type before C++20.
  typename std::bitset<N>::reference operator[](std::size_t i) { return mData[i]; }
  bool operator[](std::size_t i) const { return mData[i]; }

  /// Returns a copy with every lane inverted.
  fixed_size_simd_mask operator!() const
  {
    fixed_size_simd_mask inverted = *this;
    inverted.mData.flip();
    return inverted;
  }
};

/// Fixed-width vector of N values of type T, stored in a std::array and
/// processed lane by lane.
template <typename T, std::size_t N>
class fixed_size_simd
{
 private:
  std::array<T, N> mData;

 public:
  using vector_type = std::array<T, N>;
  using value_type = T;
  using mask_type = fixed_size_simd_mask<N>;

  static constexpr std::size_t size() { return N; }

  /// Leaves the lanes uninitialised.
  fixed_size_simd() = default;

  /// Sets every lane to a value-initialised T.
  explicit fixed_size_simd(VectorSpecialInitializerZero) : mData{} {}

  /// Lane-wise converting copy from a vector with the same width but another element type.
  template <typename U>
  fixed_size_simd(const fixed_size_simd<U, N>& w)
  {
    std::copy_n(internal_data(w).begin(), N, mData.begin());
  }

  /// Loads N consecutive values starting at \p d.
  fixed_size_simd(const T* d, AlignedTag) { std::copy_n(d, N, mData.begin()); }

  T& operator[](std::size_t i) { return mData[i]; }
  const T& operator[](std::size_t i) const { return mData[i]; }

  /// Returns a write-masked proxy: operations on it affect only lanes set in \p m.
  Common::WriteMaskVector<fixed_size_simd, mask_type> operator()(const mask_type& m) { return {*this, m}; }

  /// Broadcasts \p v to every lane.
  fixed_size_simd& operator=(const T& v)
  {
    const T value = v; // copy first: v may refer to one of our own lanes
    mData.fill(value);
    return *this;
  }

  /// Adds \p v to every lane.
  fixed_size_simd& operator+=(const T& v)
  {
    const T addend = v; // copy first: v may refer to a lane modified by this loop
    for (auto& lane : mData) {
      lane += addend;
    }
    return *this;
  }

  /// Divides every lane by \p v.
  fixed_size_simd& operator/=(const T& v)
  {
    const T divisor = v; // copy first: v may refer to a lane modified by this loop
    for (auto& lane : mData) {
      lane /= divisor;
    }
    return *this;
  }

  /// Returns a copy with every lane divided by \p v.
  fixed_size_simd operator/(const T& v) const
  {
    fixed_size_simd quotient = *this;
    quotient /= v;
    return quotient;
  }

  /// Lane-wise comparison against the scalar \p v.
  mask_type operator==(const T& v) const
  {
    mask_type equal;
    for (std::size_t lane = 0; lane < N; lane++) {
      equal[lane] = (mData[lane] == v);
    }
    return equal;
  }

  /// Lane-wise negation of operator==.
  mask_type operator!=(const T& v) const { return !(*this == v); }

  // internal_data needs access to mData.
  template <typename U>
  friend typename U::vector_type& internal_data(U& v);
  template <typename U>
  friend const typename U::vector_type& internal_data(const U& v);
};

/// Lane-wise maximum of \p a and \p b.
template <typename V>
V max(const V& a, const V& b)
{
  V result;
  for (std::size_t lane = 0; lane < a.size(); lane++) {
    result[lane] = std::max(a[lane], b[lane]);
  }
  return result;
}

} // namespace Vc

#endif // ifndef GPUCA_NO_VC

#endif