2 changes: 0 additions & 2 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -88,8 +88,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
int32_t ExitDevice() override;
int32_t GetThread();

virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }

// Pointers to tracker classes
GPUProcessorProcessors mProcShadow; // Host copy of tracker objects that will be used on the GPU
GPUConstantMem*& mProcessorsShadow = mProcShadow.mProcessorsProc;
21 changes: 0 additions & 21 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -470,27 +470,6 @@ void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEve

void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList<cl_event>())); }

int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event)
{
if (GetProcessingSettings().stuckProtection) {
cl_int tmp = 0;
for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection / 50; i++) {
usleep(50);
clGetEventInfo(event.get<cl_event>(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr);
if (tmp == CL_COMPLETE) {
break;
}
}
if (tmp != CL_COMPLETE) {
mGPUStuck = 1;
GPUErrorReturn("GPU Stuck, future processing in this component is disabled, skipping event (GPU Event State %d)", (int32_t)tmp);
}
} else {
clFinish(mInternals->command_queue[stream]);
}
return 0;
}

void GPUReconstructionOCL::SynchronizeGPU()
{
for (int32_t i = 0; i < mNStreams; i++) {
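For reference, the removed DoStuckProtection relied on polling the OpenCL event status instead of blocking in clFinish. A minimal standalone sketch of that polling pattern, with placeholder names (queue, ev, timeoutUs) and not part of this diff:

#include <CL/cl.h>
#include <unistd.h>

// Poll the event status in 50 us steps instead of blocking; give up once roughly
// timeoutUs microseconds have elapsed. Returns true if the event completed in time,
// false if the device appears stuck.
static bool waitOrTimeout(cl_command_queue queue, cl_event ev, int32_t timeoutUs)
{
  if (timeoutUs <= 0) {
    clFinish(queue); // protection disabled: block until the queue drains
    return true;
  }
  cl_int status = 0;
  for (int32_t i = 0; i <= timeoutUs / 50; i++) {
    usleep(50);
    clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, nullptr);
    if (status == CL_COMPLETE) {
      break;
    }
  }
  return status == CL_COMPLETE;
}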
1 change: 0 additions & 1 deletion GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -43,7 +43,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface
virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override;

void SynchronizeGPU() override;
int32_t DoStuckProtection(int32_t stream, deviceEvent event) override;
int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override;
void SynchronizeStream(int32_t stream) override;
void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) override;
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -301,7 +301,6 @@ BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Proces
AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)")
AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances")
AddOption(stuckProtection, int32_t, 0, "", 0, "Timeout in us, When AMD GPU is stuck, just continue processing and skip tracking, do not crash or stall the chain")
AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidates for single input track during propagation")
AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking")
AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))")
@@ -383,7 +382,6 @@ AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run th
AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder")
AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug")
AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
AddSubConfig(GPUSettingsProcessingRTC, rtc)
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Global/GPUChain.h
@@ -224,8 +224,6 @@ class GPUChain

inline GPUChain* GetNextChainInQueue() { return mRec->GetNextChainInQueue(); }

virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }

template <class T, class S, typename... Args>
bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::*func, Args&&... args)
{
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -475,7 +475,7 @@ int32_t GPUChainTracking::ForceInitQA()
qa.reset(new GPUQA(this));
}
if (!GetQA()->IsInitialized()) {
return GetQA()->InitQA();
return GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? -GetProcessingSettings().runQA : GPUQA::tasksAutomatic);
}
return 0;
}
@@ -690,7 +690,7 @@ int32_t GPUChainTracking::RunChain()
}
const bool needQA = GPUQA::QAAvailable() && (GetProcessingSettings().runQA || (GetProcessingSettings().eventDisplay && (mIOPtrs.nMCInfosTPC || GetProcessingSettings().runMC)));
if (needQA && GetQA()->IsInitialized() == false) {
if (GetQA()->InitQA(GetProcessingSettings().runQA ? -GetProcessingSettings().runQA : -1)) {
if (GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? -GetProcessingSettings().runQA : GPUQA::tasksAutomatic)) {
return 1;
}
}
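Both call sites above now derive the InitQA argument the same way: a non-positive runQA value is passed negated, while a positive value selects GPUQA::tasksAutomatic. An illustrative helper capturing that mapping (hypothetical name qaTasksFromRunQA; not part of this diff):

// Hypothetical helper mirroring the ternary now used in ForceInitQA() and RunChain().
static int32_t qaTasksFromRunQA(int32_t runQA)
{
  // runQA <= 0: pass the negated value as the explicit task selection (0 stays 0).
  // runQA  > 0: fall back to the automatic task set.
  return runQA <= 0 ? -runQA : GPUQA::tasksAutomatic;
}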
1 change: 1 addition & 0 deletions GPU/GPUTracking/Global/GPUChainTracking.h
@@ -306,6 +306,7 @@ class GPUChainTracking : public GPUChain
void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
bool NeedTPCClustersOnGPU();
void WriteReducedClusters();
void SortClusters(bool buildNativeGPU, bool propagateMCLabels, o2::tpc::ClusterNativeAccess* clusterAccess, o2::tpc::ClusterNative* clusters);
template <int32_t I>
int32_t RunTRDTrackingInternal();
uint32_t StreamForSector(uint32_t sector) const;
92 changes: 71 additions & 21 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -57,6 +57,8 @@
#include "utils/VcShim.h"
#include "utils/strtag.h"
#include <fstream>
#include <numeric>
#include <vector>

using namespace o2::gpu;
using namespace o2::tpc;
@@ -762,14 +764,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
ClusterNative* tmpNativeClusters = nullptr;
std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;

// setup MC Labels
bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
const bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
const bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4);

auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;

bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output

mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
if (buildNativeGPU) {
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
@@ -1281,21 +1282,20 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}

ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr;
if (propagateMCLabels) {
// TODO: write to buffer directly
if (propagateMCLabels) { // TODO: write to buffer directly
o2::dataformats::MCTruthContainer<o2::MCCompLabel> mcLabels;
std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
if (!GetProcessingSettings().tpcWriteClustersAfterRejection && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)] && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->useExternal()) {
if (!mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator) {
auto& labelOutputControl = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)];
if (!GetProcessingSettings().tpcWriteClustersAfterRejection && !sortClusters && labelOutputControl && labelOutputControl->useExternal()) {
if (!labelOutputControl->allocator) {
throw std::runtime_error("Cluster MC Label buffer missing");
}
ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator(0));
ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(labelOutputControl->allocator(0));
buffer = {&container->first, &container->second};
} else {
mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
buffer.first = mIOMem.clusterNativeMCBuffer.get();
buffer.second = mIOMem.clusterNativeMCView.get();
buffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()};
}

assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
@@ -1350,15 +1350,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if (doGPU && synchronizeCalibUpdate) {
SynchronizeStream(0);
}
if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) {
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]);
}
}
if (buildNativeGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
}
if (sortClusters) {
SortClusters(buildNativeGPU, propagateMCLabels, tmpNativeAccess, tmpNativeClusters);
}
mRec->MemoryScalers()->nTPCHits = nClsTotal;
mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST"));
@@ -1374,3 +1367,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
#endif
return 0;
}

void GPUChainTracking::SortClusters(bool buildNativeGPU, bool propagateMCLabels, ClusterNativeAccess* clusterAccess, ClusterNative* clusters)
{
if (propagateMCLabels) {
std::vector<uint32_t> clsOrder(clusterAccess->nClustersTotal);
std::iota(clsOrder.begin(), clsOrder.end(), 0);
std::vector<ClusterNative> tmpClusters;
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
const uint32_t offset = clusterAccess->clusterOffset[i][j];
std::sort(&clsOrder[offset], &clsOrder[offset + clusterAccess->nClusters[i][j]], [&clusters](const uint32_t a, const uint32_t b) {
return clusters[a] < clusters[b];
});
tmpClusters.resize(clusterAccess->nClusters[i][j]);
memcpy(tmpClusters.data(), &clusters[offset], clusterAccess->nClusters[i][j] * sizeof(tmpClusters[0]));
for (uint32_t k = 0; k < tmpClusters.size(); k++) {
clusters[offset + k] = tmpClusters[clsOrder[offset + k] - offset];
}
}
}
tmpClusters.clear();

std::pair<o2::dataformats::ConstMCLabelContainer*, o2::dataformats::ConstMCLabelContainerView*> labelBuffer;
GPUOutputControl* labelOutput = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)];
std::unique_ptr<ConstMCLabelContainerView> tmpUniqueContainerView;
std::unique_ptr<ConstMCLabelContainer> tmpUniqueContainerBuffer;
if (labelOutput && labelOutput->allocator) {
ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* labelContainer = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(labelOutput->allocator(0));
labelBuffer = {&labelContainer->first, &labelContainer->second};
} else {
tmpUniqueContainerView = std::move(mIOMem.clusterNativeMCView);
tmpUniqueContainerBuffer = std::move(mIOMem.clusterNativeMCBuffer);
mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
labelBuffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()};
}

o2::dataformats::MCLabelContainer tmpContainer;
for (uint32_t i = 0; i < clusterAccess->nClustersTotal; i++) {
for (const auto& element : clusterAccess->clustersMCTruth->getLabels(clsOrder[i])) {
tmpContainer.addElement(i, element);
}
}
tmpContainer.flatten_to(*labelBuffer.first);
*labelBuffer.second = *labelBuffer.first;
clusterAccess->clustersMCTruth = labelBuffer.second;
} else {
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
std::sort(&clusters[clusterAccess->clusterOffset[i][j]], &clusters[clusterAccess->clusterOffset[i][j] + clusterAccess->nClusters[i][j]]);
}
}
}
if (buildNativeGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)clusters, clusterAccess->nClustersTotal * sizeof(clusters[0]), -1, true);
}
}
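The MC-label branch of SortClusters above sorts an index permutation per (sector, row) range, applies it to the clusters, and then rebuilds the label container through the same permutation. A minimal, self-contained sketch of that pattern with generic names (items, labels, sortRangeWithLabels; not part of this diff):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Sort items[offset, offset + n) and keep the per-item labels attached by routing
// both through the same index permutation, as SortClusters does for each row.
template <class T, class Label>
void sortRangeWithLabels(std::vector<T>& items, std::vector<Label>& labels, uint32_t offset, uint32_t n)
{
  std::vector<uint32_t> order(items.size());
  std::iota(order.begin(), order.end(), 0); // order[k] = original global index
  std::sort(order.begin() + offset, order.begin() + offset + n,
            [&items](uint32_t a, uint32_t b) { return items[a] < items[b]; });
  std::vector<T> tmpItems(items.begin() + offset, items.begin() + offset + n);
  std::vector<Label> tmpLabels(labels.begin() + offset, labels.begin() + offset + n);
  for (uint32_t k = 0; k < n; k++) {
    items[offset + k] = tmpItems[order[offset + k] - offset];   // reordered payload
    labels[offset + k] = tmpLabels[order[offset + k] - offset]; // labels follow the same mapping
  }
}

In the merged code the labels live in an MCTruthContainer rather than a plain vector, so the second step is done once over all rows by copying getLabels(clsOrder[i]) into a fresh container and flattening it into the output label buffer.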
3 changes: 0 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
@@ -149,9 +149,6 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
GPUTPCTracker& trk = processors()->tpcTrackers[iSector];
GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSector] : trk;
int32_t useStream = StreamForSector(iSector);
if (GetProcessingSettings().amdMI100SerializationWorkaround) {
SynchronizeStream(useStream); // TODO: Remove this workaround once fixed on MI100
}

if (GetProcessingSettings().debugLevel >= 3) {
GPUInfo("Creating Sector Data (Sector %d)", iSector);
3 changes: 3 additions & 0 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -197,6 +197,9 @@ int32_t ReadConfiguration(int argc, char** argv)
printf("Can only produce QA pdf output when input files are specified!\n");
return 1;
}
if (configStandalone.QA.enableLocalOutput && !configStandalone.QA.inputHistogramsOnly && configStandalone.QA.output == "" && configStandalone.QA.plotsDir != "") {
configStandalone.QA.output = configStandalone.QA.plotsDir + "/output.root";
}
if (configStandalone.QA.inputHistogramsOnly) {
configStandalone.rundEdx = false;
configStandalone.noEvents = true;