From 728b78a1aef2b7e518045b00177b049fae1d0c3d Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <felix.weiglhofer@cern.ch>
Date: Fri, 24 Oct 2025 14:41:57 +0200
Subject: [PATCH] GPU: Add a fallback implementation for Vc.

---
 .../Global/GPUChainTrackingClusterizer.cxx    |  16 +-
 .../GPUTPCCFCheckPadBaseline.cxx              |  30 +--
 GPU/GPUTracking/utils/VcShim.h                | 192 ++++++++++++++++++
 3 files changed, 199 insertions(+), 39 deletions(-)
 create mode 100644 GPU/GPUTracking/utils/VcShim.h
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index b0d466f13e5ef..fdd70e150adcb 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -53,13 +53,10 @@
 #include "CommonDataFormat/InteractionRecord.h"
 #endif
 
+#include "utils/VcShim.h"
 #include "utils/strtag.h"
 #include <fstream>
 
-#ifndef GPUCA_NO_VC
-#include <Vc/Vc>
-#endif
-
 using namespace o2::gpu;
 using namespace o2::tpc;
 using namespace o2::tpc::constants;
@@ -173,7 +170,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
   int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
 
   for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
-#ifndef GPUCA_NO_VC
+
     if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
       for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
         for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
@@ -182,7 +179,6 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
         }
       }
     }
-#endif
 
     std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
     fragments.reserve(mCFContext->nFragments);
@@ -201,12 +197,12 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
       }
       nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
       for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
-#ifndef GPUCA_NO_VC
+
         if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
           Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
           Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
         }
-#endif
+
         const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
         const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page;
         if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
@@ -510,7 +506,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
           return 1;
         }
       }
-#ifndef GPUCA_NO_VC
+
       if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
         for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
           for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
@@ -521,7 +517,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
           }
         }
       }
-#endif
+
       const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
       nDigitsFragmentMax[iSector] = x.first;
       processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
index ec084c308312e..3248e517ff465 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
@@ -19,11 +19,7 @@
 #include "clusterFinderDefs.h"
 
 #ifndef GPUCA_GPUCODE
-#ifndef GPUCA_NO_VC
-#include <Vc/Vc>
-#else
-#include <array>
-#endif
+#include "utils/VcShim.h"
 #endif
 
 using namespace o2::gpu;
@@ -80,7 +76,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
 
   constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
 
-#ifndef GPUCA_NO_VC
   using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
   using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
 
@@ -88,12 +83,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
   UShort8 consecCharges{Vc::Zero};
   UShort8 maxConsecCharges{Vc::Zero};
   Charge8 maxCharge{Vc::Zero};
-#else
-  std::array<uint16_t, PadsPerCacheline> totalCharges{0};
-  std::array<uint16_t, PadsPerCacheline> consecCharges{0};
-  std::array<uint16_t, PadsPerCacheline> maxConsecCharges{0};
-  std::array<Charge, PadsPerCacheline> maxCharge{0};
-#endif
 
   tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
 
@@ -102,7 +91,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
 
   for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
     for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
-#ifndef GPUCA_NO_VC
       const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
       const UShort8::mask_type isCharge = packedCharges != 0;
 
@@ -123,22 +111,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
       } else {
         consecCharges = 0;
       }
-#else // Vc not available
-      for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
-        const uint16_t packedCharge = packedChargeStart[PadsPerCacheline * localtime + localpad];
-        const bool isCharge = packedCharge != 0;
-        if (isCharge) {
-          totalCharges[localpad]++;
-          consecCharges[localpad]++;
-          maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]);
-
-          const Charge unpackedCharge = Charge(packedCharge) / Charge(1 << PackedCharge::DecimalBits);
-          maxCharge[localpad] = CAMath::Max<Charge>(maxCharge[localpad], unpackedCharge);
-        } else {
-          consecCharges[localpad] = 0;
-        }
-      }
-#endif
     }
 
     packedChargeStart += ElemsInTileRow;
diff --git a/GPU/GPUTracking/utils/VcShim.h b/GPU/GPUTracking/utils/VcShim.h
new file mode 100644
index 0000000000000..21a9a6a5c95c2
--- /dev/null
+++ b/GPU/GPUTracking/utils/VcShim.h
@@ -0,0 +1,192 @@
+// Copyright 2020-2025 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file VcShim.h
+/// \brief Provides a basic fallback implementation for Vc
+///
+/// \author Felix Weiglhofer
+
+#ifndef GPU_UTILS_VCSHIM_H
+#define GPU_UTILS_VCSHIM_H
+
+#ifndef GPUCA_NO_VC
+
+#include <Vc/Vc>
+
+#else
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+namespace Vc
+{
+
+constexpr struct VectorSpecialInitializerZero { // tag type accepted by the explicit fixed_size_simd constructor
+} Zero;
+constexpr struct AlignedTag { // tag type accepted by the pointer-loading fixed_size_simd constructor
+} Aligned;
+
+template <typename T> // returns a mutable reference to v.mData; T must provide vector_type and grant access to mData
+typename T::vector_type& internal_data(T& v)
+{
+  return v.mData;
+}
+
+template <typename T> // read-only overload: returns a const reference to v.mData
+const typename T::vector_type& internal_data(const T& v)
+{
+  return v.mData;
+}
+
+namespace Common
+{
+
+template <typename V, typename M>
+class WriteMaskVector
+{
+ public:
+  using value_type = typename V::value_type;
+
+  WriteMaskVector(V& v, const M& m) : mMask(m), mVec(v) {} // vector and mask are both held by reference
+  // Adds value_type(mask entry) to every lane, so each lane with a set (bool) entry grows by one.
+  WriteMaskVector& operator++(int)
+  {
+    for (size_t lane = 0; lane < mVec.size(); ++lane)
+      mVec[lane] += value_type(mMask[lane]);
+    return *this;
+  }
+  // Writes v to the lanes whose mask entry is set and leaves all other lanes as they are.
+  WriteMaskVector& operator=(const value_type& v)
+  {
+    for (size_t lane = 0; lane < mVec.size(); ++lane) {
+      if (mMask[lane])
+        mVec[lane] = v;
+    }
+    return *this;
+  }
+
+ private:
+  const M& mMask;
+  V& mVec;
+};
+
+inline void prefetchMid(const void*) {}        // no-op in this fallback: the address is ignored
+inline void prefetchFar(const void*) {}        // no-op in this fallback: the address is ignored
+inline void prefetchForOneRead(const void*) {} // no-op in this fallback: the address is ignored
+
+} // namespace Common
+
+template <typename T, size_t N> // T is not used by the fallback; presumably kept to match Vc's mask signature
+class fixed_size_simd_mask
+{
+ private:
+  std::bitset<N> mData; // one bit per lane; a default-constructed std::bitset has every bit cleared
+
+ public:
+  bool isNotEmpty() const { return mData.any(); } // true if at least one lane is set
+  // Lane access. 'typename' is needed for the dependent return type before C++20 and on compilers lacking P0634.
+  typename std::bitset<N>::reference operator[](size_t i) { return mData[i]; }
+  bool operator[](size_t i) const { return mData[i]; }
+
+  fixed_size_simd_mask operator!() const // lane-wise negation; *this is left unchanged
+  {
+    auto o = *this;
+    o.mData.flip();
+    return o;
+  }
+};
+
+template <typename T, size_t N>
+class fixed_size_simd
+{
+ public:
+  using value_type = T;
+  using vector_type = std::array<T, N>;
+  using mask_type = fixed_size_simd_mask<T, N>;
+
+  static constexpr size_t size() { return N; } // number of lanes
+
+  fixed_size_simd() = default; // does not explicitly initialise the lanes
+  explicit fixed_size_simd(VectorSpecialInitializerZero) : mData{} {}
+
+  template <typename U> // lane-wise converting copy from a vector with element type U
+  fixed_size_simd(const fixed_size_simd<U, N>& w)
+  {
+    std::copy(internal_data(w).begin(), internal_data(w).end(), mData.begin());
+  }
+
+  fixed_size_simd(const T* d, AlignedTag) { std::copy(d, d + N, mData.begin()); } // reads d[0] .. d[N - 1]
+
+  T& operator[](size_t i) { return mData[i]; }
+  const T& operator[](size_t i) const { return mData[i]; }
+
+  Common::WriteMaskVector<fixed_size_simd, mask_type> operator()(const mask_type& m) { return {*this, m}; }
+
+  fixed_size_simd& operator=(const T& v) // broadcasts v into every lane
+  {
+    mData.fill(v);
+    return *this;
+  }
+
+  fixed_size_simd& operator+=(const T& v)
+  {
+    for (auto& lane : mData) {
+      lane += v;
+    }
+    return *this;
+  }
+
+  fixed_size_simd& operator/=(const T& v)
+  {
+    for (auto& lane : mData) {
+      lane /= v;
+    }
+    return *this;
+  }
+
+  fixed_size_simd operator/(const T& v) const
+  {
+    return fixed_size_simd(*this) /= v;
+  }
+
+  mask_type operator==(const T& v) const // lane-wise comparison against a scalar
+  {
+    mask_type equalLanes;
+    for (size_t lane = 0; lane < N; ++lane) {
+      equalLanes[lane] = (mData[lane] == v);
+    }
+    return equalLanes;
+  }
+  mask_type operator!=(const T& v) const { return !(*this == v); }
+
+  friend vector_type& internal_data<>(fixed_size_simd& x);
+  friend const vector_type& internal_data<>(const fixed_size_simd& x);
+
+ private:
+  vector_type mData;
+};
+
+template <typename V> // lane-wise maximum; the loop runs over a.size() lanes
+V max(const V& a, const V& b)
+{
+  V result;
+  for (size_t lane = 0; lane < a.size(); ++lane)
+    result[lane] = std::max(a[lane], b[lane]);
+  return result;
+}
+
+} // namespace Vc
+
+#endif // ifndef GPUCA_NO_VC
+
+#endif