diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index b4dac39ae1cd2..acca74e57a80e 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -263,7 +263,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
   }
   if (mProcessingSettings.deterministicGPUReconstruction) {
 #ifndef GPUCA_DETERMINISTIC_MODE
-    GPUError("Warning, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
+    GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE to be fully deterministic; without it, only most concurrency-induced indeterminism is removed, but floating point effects remain!");
 #endif
     mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
     param().rec.tpc.nWaysOuter = true;
@@ -274,6 +274,10 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
       mProcessingSettings.createO2Output = 1;
     }
     mProcessingSettings.rtc.deterministic = 1;
+  } else {
+#ifdef GPUCA_DETERMINISTIC_MODE
+    GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set; only compile-time determinism and deterministic math are enforced, not full determinism!");
+#endif
   }
   if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
     mProcessingSettings.nTPCClustererLanes = 1;
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
index b212abbcd2707..4ee6b23d46b51 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
@@ -516,6 +516,12 @@
   #ifndef GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
     #define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6
   #endif
+  #ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL
+    #define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 4
+  #endif
+  #ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED
+    #define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 1
+  #endif
   #ifndef GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE
     #define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 12
   #endif
@@ -544,6 +550,12 @@
   #ifndef GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
     #define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0
   #endif
+  #ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL
+    #define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 0
+  #endif
+  #ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED
+    #define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 0
+  #endif
   #ifndef GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE
     #define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 0
   #endif
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 34fac6514851c..9d1772379f6bd 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -284,7 +284,7 @@ AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for m
 AddOption(debugMask, int32_t, 262143, "", 0, "Mask for debug output dumps to file")
 AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
 AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
-AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6")
+AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1))
 AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics")
 AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression")
 AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event")
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.cxx
index d76c079bb406f..54ce7f12c655f 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.cxx
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.cxx
@@ -75,14 +75,11 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
     return;
   }

-#define UnrollGlobal 4
-#define MaxShared GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
-#if MaxShared < GPUCA_MAXN
-#define MaxGlobal ((GPUCA_MAXN - MaxShared - 1) / UnrollGlobal + 1) * UnrollGlobal
-#else
-#define MaxGlobal 0
-#endif
-#define MaxTotal MaxShared + MaxGlobal
+  static constexpr uint32_t UNROLL_GLOBAL = GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL > 1 ? GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL : 1;
+  static_assert(GPUCA_MAXN % UNROLL_GLOBAL == 0);
+  static constexpr uint32_t MAX_SHARED = GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP;
+  static constexpr uint32_t MAX_GLOBAL = (MAX_SHARED < GPUCA_MAXN) ? (((GPUCA_MAXN - MAX_SHARED - 1) / UNROLL_GLOBAL + 1) * UNROLL_GLOBAL) : 0;
+  static constexpr uint32_t MAX_TOTAL = MAX_SHARED + MAX_GLOBAL;

   const float chi2Cut = 3.f * 3.f * 4 * (s.mUpDx * s.mUpDx + s.mDnDx * s.mDnDx);
   // float chi2Cut = 3.f*3.f*(s.mUpDx*s.mUpDx + s.mDnDx*s.mDnDx ); //SG
@@ -117,10 +114,8 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
   const float kAreaSlopeZUp = kAngularMultiplier != 0.f ? 1.f : s.mUpTx;
   const float kAreaSlopeZDn = kAngularMultiplier != 0.f ? 1.f : s.mDnTx;

-#if MaxGlobal > 0
-  calink neighUp[MaxGlobal];
-  float yzUp[2 * MaxGlobal];
-#endif
+  calink neighUp[MAX_GLOBAL];
+  float yzUp[2 * MAX_GLOBAL];

   for (int32_t ih = iThread; ih < s.mNHits; ih += nThreads) {
@@ -128,7 +123,7 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
     const float y = y0 + hitData.x * stepY;
     const float z = z0 + hitData.y * stepZ;

-    int32_t nNeighUp = 0;
+    uint32_t nNeighUp = 0;
     float minZ, maxZ, minY, maxY;
     int32_t binYmin, binYmax, binZmin, binZmax;
     int32_t nY;
@@ -145,11 +140,11 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
       nY = rowUp.Grid().Ny();
     }

-    for (int32_t k1 = binZmin; k1 <= binZmax && (nNeighUp < MaxTotal); k1++) {
+    for (int32_t k1 = binZmin; k1 <= binZmax && (nNeighUp < MAX_TOTAL); k1++) {
       int32_t iMin = lFirstHitInBin[lFirstHitInBinOffsetUp + k1 * nY + binYmin];
       int32_t iMax = lFirstHitInBin[lFirstHitInBinOffsetUp + k1 * nY + binYmax + 1];
       GPUCA_UNROLL(U(4), U(2))
-      for (int32_t i = iMin; i < iMax && (nNeighUp < MaxTotal); i++) {
+      for (int32_t i = iMin; i < iMax && (nNeighUp < MAX_TOTAL); i++) {
         const GPUglobalref() cahit2& hitDataUp = pHitData[lHitNumberOffsetUp + i];
         GPUTPCHit h;
         h.mY = y0Up + (hitDataUp.x) * stepYUp;
@@ -159,51 +154,48 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
           continue;
         }

-#if MaxGlobal > 0
-#if MaxShared == 0
-        if (true) {
-#else
-        if (nNeighUp >= MaxShared) {
-#endif
-          neighUp[nNeighUp - MaxShared] = (calink)i;
-          yzUp[2 * (nNeighUp - MaxShared)] = s.mDnDx * (h.Y() - y);
-          yzUp[2 * (nNeighUp - MaxShared) + 1] = s.mDnDx * (h.Z() - z);
-        } else
-#endif
-        {
-#if MaxShared > 0
-          s.mB[nNeighUp][iThread] = (calink)i;
-          s.mA1[nNeighUp][iThread] = s.mDnDx * (h.Y() - y);
-          s.mA2[nNeighUp][iThread] = s.mDnDx * (h.Z() - z);
-#endif
+        const bool inGlobal = nNeighUp >= MAX_SHARED;
+        if constexpr (MAX_GLOBAL > 0) {
+          if (inGlobal) {
+            neighUp[nNeighUp - MAX_SHARED] = (calink)i;
+            yzUp[2 * (nNeighUp - MAX_SHARED)] = s.mDnDx * (h.Y() - y);
+            yzUp[2 * (nNeighUp - MAX_SHARED) + 1] = s.mDnDx * (h.Z() - z);
+          }
+        }
+        if constexpr (MAX_SHARED > 0) {
+          if (!inGlobal) {
+            s.mB[nNeighUp][iThread] = (calink)i;
+            s.mA1[nNeighUp][iThread] = s.mDnDx * (h.Y() - y);
+            s.mA2[nNeighUp][iThread] = s.mDnDx * (h.Z() - z);
+          }
         }
         nNeighUp++;
       }
     }

-#if MaxShared > 0 // init a rest of the shared array
-    for (int32_t iUp = nNeighUp; iUp < MaxShared; iUp++) {
-      s.mA1[iUp][iThread] = -1.e10f;
-      s.mA2[iUp][iThread] = -1.e10f;
-      s.mB[iUp][iThread] = (calink)-1;
+    if constexpr (MAX_SHARED > 0 && GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED) { // init the rest of the shared array
+      for (uint32_t iUp = nNeighUp; iUp < MAX_SHARED; iUp++) {
+        s.mA1[iUp][iThread] = -1.e10f;
+        s.mA2[iUp][iThread] = -1.e10f;
+        s.mB[iUp][iThread] = (calink)-1;
+      }
     }
-#endif

-#if MaxGlobal > 0 // init a rest of the UnrollGlobal chunk of the global array
-    int32_t Nrest = nNeighUp - MaxShared;
-    int32_t N4 = (Nrest / UnrollGlobal) * UnrollGlobal;
-    if (N4 < Nrest) {
-      N4 += UnrollGlobal;
-      GPUCA_UNROLL(U(UnrollGlobal - 1), U(UnrollGlobal - 1))
-      for (int32_t k = 0; k < UnrollGlobal - 1; k++) {
-        if (Nrest + k < N4) {
-          yzUp[2 * (Nrest + k)] = -1.e10f;
-          yzUp[2 * (Nrest + k) + 1] = -1.e10f;
-          neighUp[Nrest + k] = (calink)-1;
+    const uint32_t nRest = nNeighUp - MAX_SHARED;
+    uint32_t nRestUnrolled = (nRest / UNROLL_GLOBAL) * UNROLL_GLOBAL;
+    if constexpr (MAX_GLOBAL > 1) { // init the rest of the UNROLL_GLOBAL chunk of the global array
+      if (nNeighUp > MAX_SHARED && nRestUnrolled < nRest) {
+        nRestUnrolled += UNROLL_GLOBAL;
+        GPUCA_UNROLL(U(UNROLL_GLOBAL - 1), U(UNROLL_GLOBAL - 1))
+        for (uint32_t k = 0; k + 1 < UNROLL_GLOBAL; k++) {
+          if (nRest + k < nRestUnrolled) {
+            yzUp[2 * (nRest + k)] = -1.e10f;
+            yzUp[2 * (nRest + k) + 1] = -1.e10f;
+            neighUp[nRest + k] = (calink)-1;
+          }
         }
       }
     }
-#endif

     { // area in the lower row
       const float yy = y * s.mDnTx;
@@ -236,47 +228,50 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
           float yDnProjUp = s.mUpDx * (yDn - y);
           float zDnProjUp = s.mUpDx * (zDn - z);

-#if MaxShared > 0
-          GPUCA_UNROLL(U(MaxShared), U(MaxShared))
-          for (int32_t iUp = 0; iUp < MaxShared; iUp++) {
-            const float dy = yDnProjUp - s.mA1[iUp][iThread];
-            const float dz = zDnProjUp - s.mA2[iUp][iThread];
-            const float d = dy * dy + dz * dz;
-            if (d < bestD) {
-              bestD = d;
-              linkDn = i;
-              linkUp = iUp;
-            }
-          }
-#endif
-
-#if MaxGlobal > 0
-          for (int32_t iUp = 0; iUp < N4; iUp += UnrollGlobal) {
-            GPUCA_UNROLL(U(UnrollGlobal), U(UnrollGlobal))
-            for (int32_t k = 0; k < UnrollGlobal; k++) {
-              int32_t jUp = iUp + k;
-              const float dy = yDnProjUp - yzUp[2 * jUp];
-              const float dz = zDnProjUp - yzUp[2 * jUp + 1];
+          if constexpr (MAX_SHARED > 0) {
+            const uint32_t maxSharedUp = GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED ? MAX_SHARED : CAMath::Min(nNeighUp, MAX_SHARED);
+            GPUCA_UNROLL(U(MAX_SHARED), U(MAX_SHARED))
+            for (uint32_t iUp = 0; iUp < maxSharedUp; iUp++) {
+              const float dy = yDnProjUp - s.mA1[iUp][iThread];
+              const float dz = zDnProjUp - s.mA2[iUp][iThread];
               const float d = dy * dy + dz * dz;
               if (d < bestD) {
                 bestD = d;
                 linkDn = i;
-                linkUp = MaxShared + jUp;
+                linkUp = iUp;
+              }
+            }
+          }
+
+          if constexpr (MAX_GLOBAL > 0) {
+            if (nNeighUp > MAX_SHARED) {
+              for (uint32_t iUp = 0; iUp < nRestUnrolled; iUp += UNROLL_GLOBAL) {
+                GPUCA_UNROLL(U(UNROLL_GLOBAL), U(UNROLL_GLOBAL))
+                for (uint32_t k = 0; k < UNROLL_GLOBAL; k++) {
+                  const uint32_t jUp = iUp + k;
+                  const float dy = yDnProjUp - yzUp[2 * jUp];
+                  const float dz = zDnProjUp - yzUp[2 * jUp + 1];
+                  const float d = dy * dy + dz * dz;
+                  if (d < bestD) {
+                    bestD = d;
+                    linkDn = i;
+                    linkUp = MAX_SHARED + jUp;
+                  }
+                }
              }
            }
          }
-#endif
        }
      }

    if (linkUp >= 0) {
-#if MaxShared > 0 && MaxGlobal > 0
-      linkUp = (linkUp >= MaxShared) ? neighUp[linkUp - MaxShared] : s.mB[linkUp][iThread];
-#elif MaxShared > 0
-      linkUp = s.mB[linkUp][iThread];
-#else
-      linkUp = neighUp[linkUp];
-#endif
+      if constexpr (MAX_SHARED > 0 && MAX_GLOBAL > 0) {
+        linkUp = ((uint32_t)linkUp >= MAX_SHARED) ? neighUp[linkUp - MAX_SHARED] : s.mB[linkUp][iThread];
+      } else if constexpr (MAX_SHARED > 0) {
+        linkUp = s.mB[linkUp][iThread];
+      } else {
+        linkUp = neighUp[linkUp];
+      }
    }

    tracker.mData.mLinkUpData[lHitNumberOffset + ih] = linkUp;
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
index 0ecd230a67415..6bdc637b6bad6 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
@@ -40,12 +40,10 @@ class GPUTPCNeighboursFinder : public GPUKernelTemplate
     int32_t mIRow;   // row number
     int32_t mIRowUp; // next row number
     int32_t mIRowDn; // previous row number
-#if GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
     static_assert(GPUCA_MAXN >= GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP);
     float mA1[GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
     float mA2[GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
     calink mB[GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
-#endif
     GPUTPCRow mRow, mRowUp, mRowDown;
   };
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCStartHitsFinder.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCStartHitsFinder.cxx
index 20dfd69864816..06dac4a68c540 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCStartHitsFinder.cxx
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCStartHitsFinder.cxx
@@ -39,36 +39,38 @@ GPUdii() void GPUTPCStartHitsFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nThr
     uint32_t linkUpData = tracker.mData.mLinkUpData[lHitNumberOffset + ih];

     if (tracker.mData.mLinkDownData[lHitNumberOffset + ih] == CALINK_INVAL && linkUpData != CALINK_INVAL && tracker.mData.mLinkUpData[rowUp.mHitNumberOffset + linkUpData] != CALINK_INVAL) {
-#if GPUCA_PAR_SORT_STARTHITS > 0
-      GPUglobalref() GPUTPCHitId* const GPUrestrict() startHits = tracker.mTrackletTmpStartHits + s.mIRow * tracker.mNMaxRowStartHits;
-      uint32_t nextRowStartHits = CAMath::AtomicAddShared(&s.mNRowStartHits, 1u);
-      if (nextRowStartHits >= tracker.mNMaxRowStartHits) {
-        tracker.raiseError(GPUErrors::ERROR_ROWSTARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxRowStartHits);
-        CAMath::AtomicExchShared(&s.mNRowStartHits, tracker.mNMaxRowStartHits);
-        break;
+      GPUglobalref() GPUTPCHitId* GPUrestrict() startHits;
+      uint32_t nextRowStartHits;
+      if constexpr (GPUCA_PAR_SORT_STARTHITS > 0) {
+        startHits = tracker.mTrackletTmpStartHits + s.mIRow * tracker.mNMaxRowStartHits;
+        nextRowStartHits = CAMath::AtomicAddShared(&s.mNRowStartHits, 1u);
+        if (nextRowStartHits >= tracker.mNMaxRowStartHits) {
+          tracker.raiseError(GPUErrors::ERROR_ROWSTARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxRowStartHits);
+          CAMath::AtomicExchShared(&s.mNRowStartHits, tracker.mNMaxRowStartHits);
+          break;
+        }
+      } else {
+        startHits = tracker.mTrackletStartHits;
+        nextRowStartHits = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, 1u);
+        if (nextRowStartHits >= tracker.mNMaxStartHits) {
+          tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxStartHits);
+          CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
+          break;
+        }
       }
-#else
-      GPUglobalref() GPUTPCHitId* const GPUrestrict() startHits = tracker.mTrackletStartHits;
-      uint32_t nextRowStartHits = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, 1u);
-      if (nextRowStartHits >= tracker.mNMaxStartHits) {
-        tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxStartHits);
-        CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
-        break;
-      }
-#endif
       startHits[nextRowStartHits].Set(s.mIRow, ih);
     }
   }
   GPUbarrier();

-#if GPUCA_PAR_SORT_STARTHITS > 0
-  if (iThread == 0) {
-    uint32_t nOffset = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, s.mNRowStartHits);
-    tracker.mRowStartHitCountOffset[s.mIRow] = s.mNRowStartHits;
-    if (nOffset + s.mNRowStartHits > tracker.mNMaxStartHits) {
-      tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nOffset + s.mNRowStartHits, tracker.mNMaxStartHits);
-      CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
+  if constexpr (GPUCA_PAR_SORT_STARTHITS > 0) {
+    if (iThread == 0) {
+      uint32_t nOffset = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, s.mNRowStartHits);
+      tracker.mRowStartHitCountOffset[s.mIRow] = s.mNRowStartHits;
+      if (nOffset + s.mNRowStartHits > tracker.mNMaxStartHits) {
+        tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nOffset + s.mNRowStartHits, tracker.mNMaxStartHits);
+        CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
+      }
     }
   }
-#endif
 }
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.cxx
index e27a8f66ae754..0bf3448bed730 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.cxx
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.cxx
@@ -48,11 +48,11 @@ GPUdii() void GPUTPCTrackletSelector::Thread<0>(int32_t nBlocks, int32_t nThread

     int32_t irow = firstRow;

-    int32_t gap = 0;
-    int32_t nShared = 0;
-    int32_t nHits = 0;
-    const int32_t minHits = tracker.Param().rec.tpc.minNClustersTrackSeed == -1 ? GPUCA_TRACKLET_SELECTOR_MIN_HITS_B5(tracklet.Param().QPt() * tracker.Param().qptB5Scaler) : tracker.Param().rec.tpc.minNClustersTrackSeed;
-    const int32_t sharingMinNorm = minHits * tracker.Param().rec.tpc.trackletMinSharedNormFactor;
+    uint32_t gap = 0;
+    uint32_t nShared = 0;
+    uint32_t nHits = 0;
+    const uint32_t minHits = tracker.Param().rec.tpc.minNClustersTrackSeed == -1 ? GPUCA_TRACKLET_SELECTOR_MIN_HITS_B5(tracklet.Param().QPt() * tracker.Param().qptB5Scaler) : tracker.Param().rec.tpc.minNClustersTrackSeed;
+    const uint32_t sharingMinNorm = minHits * tracker.Param().rec.tpc.trackletMinSharedNormFactor;
     float maxShared = maxSharedFrac * sharingMinNorm;

     GPUCA_UNROLL(, U(1))
@@ -63,16 +63,20 @@ GPUdii() void GPUTPCTrackletSelector::Thread<0>(int32_t nBlocks, int32_t nThread
       }
       if (ih != CALINK_INVAL && ih != CALINK_DEAD_CHANNEL) {
         GPUglobalref() const GPUTPCRow& row = tracker.Row(irow);
-        bool own = (tracker.HitWeight(row, ih) <= w);
-        bool sharedOK = nShared <= (nHits < sharingMinNorm ? maxShared : nHits * maxSharedFrac);
+        const bool own = (tracker.HitWeight(row, ih) <= w);
+        const bool sharedOK = nShared <= (nHits < sharingMinNorm ? maxShared : nHits * maxSharedFrac);
         if (own || sharedOK) { // SG!!!
           gap = 0;
-#if GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
-          if (nHits < GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE) {
-            s.mHits[nHits][iThread].Set(irow, ih);
-          } else
-#endif // GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
-          {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtype-limits"
+          const bool inShared = nHits < (uint32_t)GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE;
+#pragma GCC diagnostic pop
+          if constexpr (GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE > 0) {
+            if (inShared) {
+              s.mHits[nHits][iThread].Set(irow, ih);
+            }
+          }
+          if (!inShared) {
             trackHits[nHits - GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE].Set(irow, ih);
           }
           nHits++;
@@ -100,13 +104,17 @@ GPUdii() void GPUTPCTrackletSelector::Thread<0>(int32_t nBlocks, int32_t nThread
       tracker.Tracks()[itrout].SetParam(tracklet.Param());
       tracker.Tracks()[itrout].SetFirstHitID(nFirstTrackHit);
       tracker.Tracks()[itrout].SetNHits(nHits);
-      for (int32_t jh = 0; jh < nHits; jh++) {
-#if GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
-        if (jh < GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE) {
-          tracker.TrackHits()[nFirstTrackHit + jh] = s.mHits[jh][iThread];
-        } else
-#endif // GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
-        {
+      for (uint32_t jh = 0; jh < nHits; jh++) {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtype-limits"
+        const bool inShared = jh < (uint32_t)GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE;
+#pragma GCC diagnostic pop
+        if constexpr (GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE > 0) {
+          if (inShared) {
+            tracker.TrackHits()[nFirstTrackHit + jh] = s.mHits[jh][iThread];
+          }
+        }
+        if (!inShared) {
           tracker.TrackHits()[nFirstTrackHit + jh] = trackHits[jh - GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE];
         }
       }
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h
index e5a28c80f37f9..070e02fad8222 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h
@@ -36,10 +36,8 @@ class GPUTPCTrackletSelector : public GPUKernelTemplate
     int32_t mNThreadsTotal; // total n threads
     int32_t mNTracklets;    // n of tracklets
     int32_t mReserved;      // for alignment reasons
-#if GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
     static_assert(GPUCA_ROW_COUNT >= GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE);
     GPUTPCHitId mHits[GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCTrackletSelector)];
-#endif // GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
   };

   typedef GPUconstantref() GPUTPCTracker processorType;
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index ee3af2b87d925..fcf576d828b7f 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -136,6 +136,8 @@ o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALR
 o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB)

 o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP
+                            NEIGHBOURS_FINDER_UNROLL_GLOBAL
+                            NEIGHBOURS_FINDER_UNROLL_SHARED
                             TRACKLET_SELECTOR_HITS_REG_SIZE
                             ALTERNATE_BORDER_SORT
                             SORT_BEFORE_FIT
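
The recurring change in this patch replaces preprocessor #if guards around the optional per-thread buffers (controlled by GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP, GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE and GPUCA_PAR_SORT_STARTHITS) with compile-time constants tested via C++17 if constexpr, so both code paths stay visible to the compiler while the disabled branch generates no code. Below is a minimal standalone sketch of that pattern; REG_SIZE, storeHit, regHits and globalHits are illustrative names invented for the sketch and are not identifiers from the patch.

// Sketch of the "#if guard" -> "if constexpr" pattern applied throughout the patch above.
// All names here are illustrative, not taken from the GPUTracking code.
#include <cstdint>
#include <cstdio>

template <uint32_t REG_SIZE>
void storeHit(uint32_t nHits, int32_t value, int32_t* regHits, int32_t* globalHits)
{
  // With REG_SIZE == 0 this comparison is constant-false; the kernels in the patch
  // silence the resulting -Wtype-limits warning with a pragma instead of an #if.
  const bool inShared = nHits < REG_SIZE;
  if constexpr (REG_SIZE > 0) { // branch is discarded at compile time when REG_SIZE == 0
    if (inShared) {
      regHits[nHits] = value; // fits into the fixed-size register/shared buffer
    }
  }
  if (!inShared) {
    globalHits[nHits - REG_SIZE] = value; // spill past the buffer into global memory
  }
}

int main()
{
  int32_t reg[4] = {};
  int32_t global[8] = {};
  storeHit<4>(2, 42, reg, global);     // lands in reg[2]
  storeHit<4>(6, 43, reg, global);     // lands in global[2]
  storeHit<0>(0, 44, nullptr, global); // REG_SIZE == 0: always goes to global[0]
  std::printf("%d %d %d\n", reg[2], global[2], global[0]);
  return 0;
}

Unlike the old #if version, the discarded branch must still name declared entities, which is why the header-side #if guards around s.mHits, s.mA1, s.mA2 and s.mB are dropped and those member arrays are now declared unconditionally.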