Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 109 additions & 150 deletions GPU/Common/GPUCommonMath.h

Large diffs are not rendered by default.

12 changes: 0 additions & 12 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -270,15 +270,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
}
if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
mProcessingSettings.nTPCClustererLanes = 1;
if (mProcessingSettings.trackletConstructorInPipeline < 0) {
mProcessingSettings.trackletConstructorInPipeline = 1;
}
if (mProcessingSettings.trackletSelectorInPipeline < 0) {
mProcessingSettings.trackletSelectorInPipeline = 1;
}
if (mProcessingSettings.trackletSelectorSectors < 0) {
mProcessingSettings.trackletSelectorSectors = 1;
}
}
if (mProcessingSettings.createO2Output > 1 && mProcessingSettings.runQA && mProcessingSettings.qcRunFraction == 100.f) {
mProcessingSettings.createO2Output = 1;
Expand All @@ -296,9 +287,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()

UpdateAutomaticProcessingSettings();
GPUCA_GPUReconstructionUpdateDefaults();
if (!mProcessingSettings.trackletConstructorInPipeline) {
mProcessingSettings.trackletSelectorInPipeline = false;
}
if (!mProcessingSettings.rtc.enable) {
mProcessingSettings.rtc.optConstexpr = false;
}
Expand Down
1 change: 0 additions & 1 deletion GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
#include "GPUOutputControl.h"
#include "GPUMemoryResource.h"
#include "GPUConstantMem.h"
#include "GPUTPCSectorOutput.h"
#include "GPULogging.h"

namespace o2::its
Expand Down
38 changes: 17 additions & 21 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include "GPUChain.h"

#include "GPUTPCClusterData.h"
#include "GPUTPCSectorOutput.h"
#include "GPUTPCSectorOutCluster.h"
#include "GPUTPCGMMergedTrack.h"
#include "GPUTPCGMMergedTrackHit.h"
Expand Down Expand Up @@ -66,28 +65,25 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
if (x.nThreads != 1) {
throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
}
uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
for (uint32_t k = 0; k < num; k++) {
int32_t nThreads = getNKernelHostThreads(false);
if (nThreads > 1) {
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d Threads\n", nThreads);
}
tbb::this_task_arena::isolate([&] {
mThreading->activeThreads->execute([&] {
tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
typename T::GPUSharedMemory smem;
for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
}
});
int32_t nThreads = getNKernelHostThreads(false);
if (nThreads > 1) {
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d Threads\n", nThreads);
}
tbb::this_task_arena::isolate([&] {
mThreading->activeThreads->execute([&] {
tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
typename T::GPUSharedMemory smem;
for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
}
});
});
} else {
for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
typename T::GPUSharedMemory smem;
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
}
});
} else {
for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
typename T::GPUSharedMemory smem;
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
}
}
}
Expand Down
7 changes: 4 additions & 3 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP

public:
~GPUReconstructionCPU() override;
static constexpr krnlRunRange krnlRunRangeNone{0, -1};
static constexpr krnlRunRange krnlRunRangeNone{0};
static constexpr krnlEvent krnlEventNone = krnlEvent{nullptr, nullptr, 0};

template <class S, int32_t I = 0, typename... Args>
Expand Down Expand Up @@ -77,7 +77,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP

GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) {}

#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument<GPUCA_M_KRNL_TEMPLATE(x_class)>, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
{ \
if (cpuFallback) { \
Expand Down Expand Up @@ -161,7 +161,8 @@ inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
throw std::runtime_error("GPUCA_MAX_THREADS exceeded");
}
if (mProcessingSettings.debugLevel >= 3) {
GPUInfo("Running kernel %s (Stream %d, Range %d/%d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.start, setup.y.num, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)"
: mDeviceName.c_str());
}
if (nThreads == 0 || nBlocks == 0) {
return;
Expand Down
1 change: 0 additions & 1 deletion GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#include "GPUReconstructionIncludes.h"

#include "GPUTPCTracker.h"
#include "GPUTPCSectorOutput.h"

using namespace o2::gpu;

Expand Down
9 changes: 0 additions & 9 deletions GPU/GPUTracking/Base/GPUReconstructionIncludes.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,6 @@
#include <fstream>

#define GPUCA_GPUReconstructionUpdateDefaults() \
if (mProcessingSettings.trackletConstructorInPipeline < 0) { \
mProcessingSettings.trackletConstructorInPipeline = GPUCA_CONSTRUCTOR_IN_PIPELINE; \
} \
if (mProcessingSettings.trackletSelectorInPipeline < 0) { \
mProcessingSettings.trackletSelectorInPipeline = GPUCA_SELECTOR_IN_PIPELINE; \
} \
if (mProcessingSettings.trackletSelectorSectors < 0) { \
mProcessingSettings.trackletSelectorSectors = GPUCA_TRACKLET_SELECTOR_SECTOR_COUNT; \
} \
if (mProcessingSettings.alternateBorderSort < 0) { \
mProcessingSettings.alternateBorderSort = GPUCA_ALTERNATE_BORDER_SORT; \
} \
Expand Down
106 changes: 26 additions & 80 deletions GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,113 +35,59 @@
#ifndef GPUCA_KRNL_REG
#define GPUCA_KRNL_REG(...)
#endif
#define GPUCA_KRNL_REG_INTERNAL_PROP(...) GPUCA_M_STRIP(__VA_ARGS__)
#ifndef GPUCA_KRNL_CUSTOM
#define GPUCA_KRNL_CUSTOM(...)
#endif
#define GPUCA_KRNL_CUSTOM_INTERNAL_PROP(...)
#define GPUCA_ATTRRES_REG(XX, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, XX))(num) GPUCA_ATTRRES2(XX, __VA_ARGS__)
#define GPUCA_ATTRRES2_REG(XX, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, XX))(num) GPUCA_ATTRRES3(XX, __VA_ARGS__)
#define GPUCA_ATTRRES_CUSTOM(XX, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, XX))(args) GPUCA_ATTRRES2(XX, __VA_ARGS__)
#define GPUCA_ATTRRES2_CUSTOM(XX, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, XX))(args) GPUCA_ATTRRES3(XX, __VA_ARGS__)
#define GPUCA_ATTRRES_NONE(XX, ...)
#define GPUCA_ATTRRES2_NONE(XX, ...)
#define GPUCA_ATTRRES_(XX, ...)
#define GPUCA_ATTRRES2_(XX, ...)
#define GPUCA_ATTRRES3(XX) // 3 attributes not supported
#define GPUCA_ATTRRES2(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES2_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
#define GPUCA_ATTRRES(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
// GPU Kernel entry point for single sector
#define GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, ...) \
GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t iSector_internal GPUCA_M_STRIP(x_arguments))
#ifdef GPUCA_KRNL_DEFONLY
#define GPUCA_KRNLGPU_SINGLE(...) GPUCA_KRNLGPU_SINGLE_DEF(__VA_ARGS__);
#else
#define GPUCA_KRNLGPU_SINGLE(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
{ \
GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[iSector_internal] GPUCA_M_STRIP(x_forward)); \
}
#endif
#define GPUCA_KRNL_REG_EXTRREG(...) GPUCA_M_STRIP(__VA_ARGS__)
#define GPUCA_KRNL_CUSTOM_EXTRREG(MODE, ...) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__)
#define GPUCA_KRNL_NONE_EXTRREG(MODE, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__)
#define GPUCA_ATTRRES_REG(MODE, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, MODE))(num) GPUCA_ATTRRES_XREG (MODE, __VA_ARGS__)
#define GPUCA_ATTRRES_CUSTOM(MODE, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, MODE))(args) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__)
#define GPUCA_ATTRRES_NONE(MODE, none, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__)
#define GPUCA_ATTRRES_(MODE, ...)
#define GPUCA_ATTRRES_XNONE(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
#define GPUCA_ATTRRES_XCUSTOM(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
#define GPUCA_ATTRRES_XREG(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
#define GPUCA_ATTRRES(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)

// GPU Kernel entry point
#define GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, ...) \
GPUg() void GPUCA_ATTRRES(, GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments))

// GPU Kernel entry point for multiple sector
#define GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, ...) \
GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)(GPUCA_CONSMEM_PTR int32_t firstSector, int32_t nSectorCount GPUCA_M_STRIP(x_arguments))
#ifdef GPUCA_KRNL_DEFONLY
#define GPUCA_KRNLGPU_MULTI(...) GPUCA_KRNLGPU_MULTI_DEF(__VA_ARGS__);
#define GPUCA_KRNLGPU(...) GPUCA_KRNLGPU_DEF(__VA_ARGS__);
#else
#define GPUCA_KRNLGPU_MULTI(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
#define GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, ...) \
GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
{ \
const int32_t iSector_internal = nSectorCount * (get_group_id(0) + (get_num_groups(0) % nSectorCount != 0 && nSectorCount * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); \
const int32_t nSectorBlockOffset = get_num_groups(0) * iSector_internal / nSectorCount; \
const int32_t sectorBlockId = get_group_id(0) - nSectorBlockOffset; \
const int32_t sectorGridDim = get_num_groups(0) * (iSector_internal + 1) / nSectorCount - get_num_groups(0) * (iSector_internal) / nSectorCount; \
GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(sectorGridDim, get_local_size(0), sectorBlockId, get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[firstSector + iSector_internal] GPUCA_M_STRIP(x_forward)); \
GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[_iSector_internal] GPUCA_M_STRIP(x_forward)); \
}
#endif

// GPU Host wrapper pre- and post-parts
#define GPUCA_KRNL_PRE(x_class, ...) \
// GPU Host wrappers for kernel
#define GPUCA_KRNL_HOST(x_class, ...) \
GPUCA_KRNLGPU(x_class, __VA_ARGS__) \
template <> class GPUCA_KRNL_BACKEND_CLASS::backendInternal<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
public: \
template <typename T, typename... Args> \
static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \
{ \
auto& x = _xyz.x; \
auto& y = _xyz.y;

#define GPUCA_KRNL_POST() \
auto& y = _xyz.y; \
GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \
} \
};

// GPU Host wrappers for single kernel, multi-sector, or auto-detection
#define GPUCA_KRNL_single(...) \
GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
GPUCA_KRNL_PRE(__VA_ARGS__) \
if (y.num > 1) { \
throw std::runtime_error("Kernel called with invalid number of sectors"); \
} else { \
GPUCA_KRNL_CALL_single(__VA_ARGS__) \
} \
GPUCA_KRNL_POST()

#define GPUCA_KRNL_multi(...) \
GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
GPUCA_KRNL_PRE(__VA_ARGS__) \
GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
GPUCA_KRNL_POST()

#define GPUCA_KRNL_(...) GPUCA_KRNL_single(__VA_ARGS__)
#define GPUCA_KRNL_simple(...) GPUCA_KRNL_single(__VA_ARGS__)
#define GPUCA_KRNL_both(...) \
GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
GPUCA_KRNL_PRE(__VA_ARGS__) \
if (y.num <= 1) { \
GPUCA_KRNL_CALL_single(__VA_ARGS__) \
} else { \
GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
} \
GPUCA_KRNL_POST()

#define GPUCA_KRNL_LOAD_(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
#define GPUCA_KRNL_LOAD_simple(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
#define GPUCA_KRNL_LOAD_both(...) \
GPUCA_KRNL_LOAD_single(__VA_ARGS__) \
GPUCA_KRNL_LOAD_multi(__VA_ARGS__)

#define GPUCA_KRNL_PROP(x_class, x_attributes) \
template <> gpu_reconstruction_kernels::krnlProperties GPUCA_KRNL_BACKEND_CLASS::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>() { \
gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_INTERNAL_PROP,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes)))}; \
gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_EXTRREG, GPUCA_M_STRIP(x_attributes))}; \
return ret.nThreads > 0 ? ret : gpu_reconstruction_kernels::krnlProperties{(int32_t)mThreadCount}; \
}

// Generate GPU kernel and host wrapper
#define GPUCA_KRNL_WRAP(x_func, x_class, x_attributes, ...) GPUCA_M_CAT(x_func, GPUCA_M_STRIP_FIRST(x_attributes))(x_class, x_attributes, __VA_ARGS__)
#endif // GPUCA_GPUCODE

#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (GPUCA_M_STRIP(x_attributes), REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class)))), __VA_ARGS__)
#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)

#endif // O2_GPU_GPURECONSTRUCTIONKERNELMACROS_H
// clang-format on
11 changes: 4 additions & 7 deletions GPU/GPUTracking/Base/GPUReconstructionKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,8 @@ struct krnlExec {
};
struct krnlRunRange {
constexpr krnlRunRange() = default;
constexpr krnlRunRange(uint32_t a) : start(a), num(0) {}
constexpr krnlRunRange(uint32_t s, int32_t n) : start(s), num(n) {}

uint32_t start = 0;
int32_t num = 0;
constexpr krnlRunRange(uint32_t v) : index(v) {}
uint32_t index = 0;
};
struct krnlEvent {
constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {}
Expand All @@ -63,7 +60,7 @@ struct krnlProperties {
};

struct krnlSetup {
krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0, -1}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
krnlExec x;
krnlRunRange y;
krnlEvent z;
Expand Down Expand Up @@ -98,7 +95,7 @@ class GPUReconstructionKernels : public T
template <class S, int32_t I = 0, typename... Args>
using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs<S, I, Args...>;

#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
virtual void runKernelImpl(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args) \
{ \
T::template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args); \
Expand Down
Loading