4 changes: 2 additions & 2 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -113,7 +113,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
constexpr int32_t reqVerMin = 0;
#endif
if (GetProcessingSettings().rtc.enable && GetProcessingSettings().rtctech.runTest == 2) {
mWarpSize = GPUCA_WARP_SIZE;
mWarpSize = GetProcessingSettings().rtc.overrideWarpSize != -1 ? GetProcessingSettings().rtc.overrideWarpSize : GPUCA_WARP_SIZE;
genAndLoadRTC();
exit(0);
}
@@ -245,7 +245,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment);
GPUInfo(" ");
}
if (deviceProp.warpSize != GPUCA_WARP_SIZE && !GetProcessingSettings().rtc.enable) {
if (GetProcessingSettings().rtc.enable ? (GetProcessingSettings().rtc.overrideWarpSize != -1 && deviceProp.warpSize != GetProcessingSettings().rtc.overrideWarpSize) : (deviceProp.warpSize != GPUCA_WARP_SIZE)) {
throw std::runtime_error("Invalid warp size on GPU");
}
mWarpSize = deviceProp.warpSize;
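The two hunks above apply the same rule in two places: when RTC is enabled, the new `rtc.overrideWarpSize` option (default -1, meaning unset) takes precedence over the compile-time `GPUCA_WARP_SIZE`, and the device warp size is only validated against an explicitly requested override. A minimal sketch of that selection and validation logic, using hypothetical stand-in types and treating `GPUCA_WARP_SIZE` as a plain constant rather than the real macro:

```cpp
#include <cstdint>
#include <stdexcept>

// Hypothetical stand-in for the relevant part of the processing settings.
struct RtcSettings {
  bool enable = false;
  int32_t overrideWarpSize = -1; // -1 = no override, fall back to GPUCA_WARP_SIZE
};

constexpr int32_t GPUCA_WARP_SIZE = 32; // compile-time default, stand-in for the real macro

// Warp size the RTC build should target.
int32_t resolveWarpSize(const RtcSettings& rtc)
{
  return rtc.overrideWarpSize != -1 ? rtc.overrideWarpSize : GPUCA_WARP_SIZE;
}

// Mirrors the check in InitDevice_Runtime(): without RTC the device must match the
// compile-time warp size; with RTC only an explicitly requested override is enforced.
void validateWarpSize(const RtcSettings& rtc, int32_t deviceWarpSize)
{
  const bool mismatch = rtc.enable
                          ? (rtc.overrideWarpSize != -1 && deviceWarpSize != rtc.overrideWarpSize)
                          : (deviceWarpSize != GPUCA_WARP_SIZE);
  if (mismatch) {
    throw std::runtime_error("Invalid warp size on GPU");
  }
}
```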
4 changes: 4 additions & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -216,6 +216,7 @@ AddOption(optSpecialCode, int8_t, -1, "", 0, "Insert GPUCA_RTC_SPECIAL_CODE spec
AddOption(deterministic, bool, false, "", 0, "Compile RTC in deterministic mode, with NO_FAST_MATH flags and GPUCA_DETERMINISTIC_MODE define")
AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel")
AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code")
AddOption(overrideWarpSize, int32_t, -1, "", 0, "Override the warp size to be used for RTC")
AddHelp("help", 'h')
EndConfig()

@@ -624,6 +625,9 @@ AddOption(deviceType, std::string, "CPU", "", 0, "Device type, CPU | CUDA | HIP
AddOption(forceDeviceType, bool, true, "", 0, "force device type, otherwise allows fall-back to CPU")
AddOption(synchronousProcessing, bool, false, "", 0, "Apply performance shortcuts for synchronous processing, disable unneeded steps")
AddOption(dump, int32_t, 0, "", 0, "Dump events for standalone benchmark: 1 = dump events, 2 = dump events and skip processing in workflow")
AddOption(dumpFirst, int32_t, 0, "", 0, "First event to dump (referring to tfCounter)")
AddOption(dumpLast, int32_t, -1, "", 0, "Last event to dump (-1 = all)")
AddOption(dumpFolder, std::string, "", "", 0, "Folder to which to write dump files, [P] is replaced by process id")
AddOption(display, bool, false, "", 0, "Enable standalone gpu tracking visualization")
AddOption(rundEdx, int32_t, -1, "", 0, "Enable/disable dEdx processing (-1 for autoselect)")
AddOption(dEdxSplineTopologyCorrFile, std::string, "", "", 0, "File name of the dE/dx spline track topology correction file")
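Elsewhere in this PR the new options are read as plain settings fields (`GetProcessingSettings().rtc.overrideWarpSize`, `mConfParam->dumpFirst`, `mConfParam->dumpFolder`, and so on). A small sketch, with hypothetical struct and helper names, of the dump window these defaults define: dumping starts at `dumpFirst` and continues through `dumpLast`, with -1 meaning no upper bound:

```cpp
#include <cstdint>

// Hypothetical mirror of the dump-related global settings added above.
struct DumpSettings {
  int32_t dump = 0;      // 0 = off, 1 = dump events, 2 = dump events and skip processing
  int32_t dumpFirst = 0; // first tfCounter to dump
  int32_t dumpLast = -1; // last tfCounter to dump, -1 = no upper bound
};

bool shouldDumpTF(const DumpSettings& s, int64_t tfCounter)
{
  return s.dump != 0 && tfCounter >= s.dumpFirst && (s.dumpLast == -1 || tfCounter <= s.dumpLast);
}
```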
23 changes: 12 additions & 11 deletions GPU/GPUTracking/Interface/GPUO2Interface.cxx
@@ -137,29 +137,30 @@ void GPUO2Interface::Deinitialize()
mNContexts = 0;
}

void GPUO2Interface::DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data)
void GPUO2Interface::DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data, uint32_t iThread, const char* dir)
{
mCtx[0].mChain->ClearIOPointers();
mCtx[0].mChain->mIOPtrs = *data;
const auto oldPtrs = mCtx[iThread].mChain->mIOPtrs;
mCtx[iThread].mChain->mIOPtrs = *data;
char fname[1024];
snprintf(fname, 1024, "event.%d.dump", nEvent);
mCtx[0].mChain->DumpData(fname);
snprintf(fname, 1024, "%sevent.%d.dump", dir, nEvent);
mCtx[iThread].mChain->DumpData(fname);
if (nEvent == 0) {
#ifdef GPUCA_BUILD_QA
if (mConfig->configProcessing.runMC) {
mCtx[0].mChain->ForceInitQA();
mCtx[iThread].mChain->ForceInitQA();
snprintf(fname, 1024, "mc.%d.dump", nEvent);
mCtx[0].mChain->GetQA()->UpdateChain(mCtx[0].mChain);
mCtx[0].mChain->GetQA()->DumpO2MCData(fname);
mCtx[iThread].mChain->GetQA()->UpdateChain(mCtx[iThread].mChain);
mCtx[iThread].mChain->GetQA()->DumpO2MCData(fname);
}
#endif
}
mCtx[iThread].mChain->mIOPtrs = oldPtrs;
}

void GPUO2Interface::DumpSettings()
void GPUO2Interface::DumpSettings(uint32_t iThread, const char* dir)
{
mCtx[0].mChain->DoQueuedUpdates(-1);
mCtx[0].mRec->DumpSettings();
mCtx[iThread].mChain->DoQueuedUpdates(-1);
mCtx[iThread].mRec->DumpSettings(dir);
}

int32_t GPUO2Interface::RunTracking(GPUTrackingInOutPointers* data, GPUInterfaceOutputs* outputs, uint32_t iThread, GPUInterfaceInputUpdate* inputUpdateCallback)
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Interface/GPUO2Interface.h
@@ -77,8 +77,8 @@ class GPUO2Interface

int32_t RunTracking(GPUTrackingInOutPointers* data, GPUInterfaceOutputs* outputs = nullptr, uint32_t iThread = 0, GPUInterfaceInputUpdate* inputUpdateCallback = nullptr);
void Clear(bool clearOutputs, uint32_t iThread = 0);
void DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data);
void DumpSettings();
void DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data, uint32_t iThread, const char* dir = "");
void DumpSettings(uint32_t iThread, const char* dir = "");

void GetITSTraits(o2::its::TrackerTraits<7>*& trackerTraits, o2::its::VertexerTraits<7>*& vertexerTraits, o2::its::TimeFrame<7>*& timeFrame);
const o2::base::Propagator* GetDeviceO2Propagator(int32_t iThread = 0) const;
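A caller-side sketch of the extended interface, assuming an initialized `GPUO2Interface` and a filled `GPUTrackingInOutPointers` (the helper name, namespace, and include path are assumptions, and `dir` must end with '/' because DumpEvent/DumpSettings prepend it verbatim to the dump file names):

```cpp
#include <cstdint>
#include "GPUO2Interface.h" // assumed include path

using namespace o2::gpu; // assumption: the interface lives in the o2::gpu namespace

// Hypothetical helper: dump one timeframe with the per-thread context that processes it.
void dumpTimeframe(GPUO2Interface& interface, GPUTrackingInOutPointers& ptrs,
                   int32_t dumpCounter, uint32_t threadIndex, const char* dir)
{
  if (dumpCounter == 0) {
    interface.DumpSettings(threadIndex, dir); // settings go out once, with the first dumped TF
  }
  interface.DumpEvent(dumpCounter, &ptrs, threadIndex, dir);
}
```

Note that the .cxx side now saves and restores the chain's mIOPtrs around the dump, so calling DumpEvent on a context that will process data does not clobber its IO pointers.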
10 changes: 5 additions & 5 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -214,11 +214,11 @@ int32_t ReadConfiguration(int argc, char** argv)
}
}
if (configStandalone.setO2Settings) {
if (!(configStandalone.inputcontrolmem && configStandalone.outputcontrolmem)) {
printf("setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n");
return 1;
}
if (configStandalone.runGPU) {
if (configStandalone.runGPU && configStandalone.proc.debugLevel <= 1) {
if (!(configStandalone.inputcontrolmem && configStandalone.outputcontrolmem)) {
printf("setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n");
return 1;
}
configStandalone.proc.forceHostMemoryPoolSize = 1024 * 1024 * 1024;
}
configStandalone.rec.tpc.trackReferenceX = 83;
1 change: 1 addition & 0 deletions GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
@@ -225,6 +225,7 @@ class GPURecoWorkflowSpec : public o2::framework::Task
int64_t mCreationForCalib = -1; ///< creation time for calib manipulation
int32_t mVerbosity = 0;
uint32_t mNTFs = 0;
uint32_t mNTFDumps = 0;
uint32_t mNDebugDumps = 0;
uint32_t mNextThreadIndex = 0;
bool mUpdateGainMapCCDB = true;
1 change: 1 addition & 0 deletions GPU/Workflow/src/GPUWorkflowInternal.h
@@ -47,6 +47,7 @@ struct GPURecoWorkflow_QueueObject {
bool jobSubmitted = false;
bool jobFinished = false;
int32_t jobReturnValue = 0;
volatile int32_t jobThreadIndex = -1;
std::mutex jobFinishedMutex;
std::condition_variable jobFinishedNotify;
bool jobInputFinal = false;
4 changes: 2 additions & 2 deletions GPU/Workflow/src/GPUWorkflowPipeline.cxx
@@ -90,6 +90,7 @@ void GPURecoWorkflowSpec::RunWorkerThread(int32_t id)
context = workerContext.inputQueue.front();
workerContext.inputQueue.pop();
}
context->jobThreadIndex = id;
context->jobReturnValue = runMain(nullptr, context->jobPtrs, context->jobOutputRegions, id, context->jobInputUpdateCallback.get());
{
std::lock_guard lk(context->jobFinishedMutex);
@@ -179,8 +180,7 @@ int32_t GPURecoWorkflowSpec::handlePipeline(ProcessingContext& pc, GPUTrackingIn
}
mPipeline->completionPolicyQueue.pop();
}
}
if (mSpecConfig.enableDoublePipeline == 2) {
} else if (mSpecConfig.enableDoublePipeline == 2) {
auto prepareDummyMessage = pc.outputs().make<DataAllocator::UninitializedVector<char>>(Output{gDataOriginGPU, "PIPELINEPREPARE", 0}, 0u);

size_t ptrsTotal = 0;
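The `jobThreadIndex` field added in GPUWorkflowInternal.h and set here forms a one-shot handshake: the worker publishes its id before running the job, and the submitting side (the `while (pipelineContext->jobThreadIndex == -1) {}` loop in GPUWorkflowSpec.cxx below) spin-waits until the id becomes visible so the dump can use the correct thread index. A reduced, hedged sketch of the pattern; the PR uses a `volatile int32_t`, while this standalone example uses `std::atomic`, the portable way to express the same handshake:

```cpp
#include <atomic>
#include <cstdint>
#include <thread>

// Reduced stand-in for GPURecoWorkflow_QueueObject.
struct QueueObject {
  std::atomic<int32_t> jobThreadIndex{-1}; // -1 = job not yet picked up by a worker
};

// Worker side: publish the worker id before starting the job.
void workerPicksUpJob(QueueObject& ctx, int32_t workerId)
{
  ctx.jobThreadIndex.store(workerId);
  // ... the actual job (runMain) would execute here ...
}

// Submitting side: spin until a worker has claimed the job, then use its id.
int32_t waitForWorkerIndex(QueueObject& ctx)
{
  int32_t id;
  while ((id = ctx.jobThreadIndex.load()) == -1) {
    std::this_thread::yield();
  }
  return id;
}
```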
31 changes: 26 additions & 5 deletions GPU/Workflow/src/GPUWorkflowSpec.cxx
@@ -825,11 +825,31 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)

lockDecodeInput.reset();

uint32_t threadIndex;
if (mConfParam->dump) {
if (mNTFs == 1) {
mGPUReco->DumpSettings();
if (mSpecConfig.enableDoublePipeline && pipelineContext->jobSubmitted) {
while (pipelineContext->jobThreadIndex == -1) {
}
threadIndex = pipelineContext->jobThreadIndex;
} else {
threadIndex = 0; // TODO: Not sure if this is safe, but it is not yet known which threadIndex will pick up the enqueued job
}

std::string dir = "";
if (mConfParam->dumpFolder != "") {
dir = std::regex_replace(mConfParam->dumpFolder, std::regex("\\[P\\]"), std::to_string(getpid()));
if (mNTFs == 1) {
mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
}
dir += "/";
}
if (mNTFs == 1) { // Must dump with the first TF, since it will enforce the enqueued calib updates
mGPUReco->DumpSettings(threadIndex, dir.c_str());
}
if (tinfo.tfCounter >= mConfParam->dumpFirst && (mConfParam->dumpLast == -1 || tinfo.tfCounter <= mConfParam->dumpLast)) {
mGPUReco->DumpEvent(mNTFDumps, &ptrs, threadIndex, dir.c_str());
mNTFDumps++;
}
mGPUReco->DumpEvent(mNTFs - 1, &ptrs);
}
std::unique_ptr<GPUTrackingInOutPointers> ptrsDump;
if (mConfParam->dumpBadTFMode == 2) {
@@ -847,9 +867,10 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)
std::unique_lock lk(pipelineContext->jobFinishedMutex);
pipelineContext->jobFinishedNotify.wait(lk, [context = pipelineContext.get()]() { return context->jobFinished; });
retVal = pipelineContext->jobReturnValue;
threadIndex = pipelineContext->jobThreadIndex;
} else {
// uint32_t threadIndex = pc.services().get<ThreadPool>().threadIndex;
uint32_t threadIndex = mNextThreadIndex;
threadIndex = mNextThreadIndex;
if (mConfig->configProcessing.doublePipeline) {
mNextThreadIndex = (mNextThreadIndex + 1) % 2;
}
@@ -879,7 +900,7 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)
}
fclose(fp);
} else if (mConfParam->dumpBadTFMode == 2) {
mGPUReco->DumpEvent(mNDebugDumps - 1, ptrsDump.get());
mGPUReco->DumpEvent(mNDebugDumps - 1, ptrsDump.get(), threadIndex);
}
}

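The dump path construction above does three things once per process: replace the literal `[P]` in `dumpFolder` with the process id, create the directory together with the first TF, and append a trailing slash so the result can be prepended directly to the dump file names. A condensed sketch of just that logic (the helper name is hypothetical, and the code assumes POSIX `mkdir`/`getpid`):

```cpp
#include <regex>
#include <string>
#include <sys/stat.h>  // mkdir
#include <sys/types.h>
#include <unistd.h>    // getpid

// Expand "[P]" to the current pid, optionally create the directory,
// and return a prefix that can be prepended to "event.<n>.dump".
std::string makeDumpPrefix(const std::string& dumpFolder, bool createDir)
{
  if (dumpFolder.empty()) {
    return ""; // default: dump into the working directory
  }
  std::string dir = std::regex_replace(dumpFolder, std::regex("\\[P\\]"), std::to_string(getpid()));
  if (createDir) {
    mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
  }
  return dir + "/";
}
```

With `GPU_global.dumpFolder=gpu_dump_[P]`, as dpl-workflow.sh sets below for multi-GPU runs, each GPU process then writes its dumps into its own gpu_dump_<pid>/ directory.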
3 changes: 3 additions & 0 deletions prodtests/full-system-test/dpl-workflow.sh
@@ -235,6 +235,9 @@ if [[ $EPNSYNCMODE == 1 ]]; then
fi
fi
fi
if [[ $GPUTYPE != "CPU" && $NGPUS > 1 ]]; then
GPU_CONFIG_KEY+="GPU_global.dumpFolder=gpu_dump_[P];"
fi
if [[ $SYNCRAWMODE == 1 ]]; then
GPU_CONFIG_KEY+="GPU_proc.tpcIncreasedMinClustersPerRow=500000;GPU_proc.ignoreNonFatalGPUErrors=1;GPU_proc.throttleAlarms=1;"
if [[ $RUNTYPE == "PHYSICS" || $RUNTYPE == "COSMICS" || $RUNTYPE == "TECHNICAL" ]]; then