Skip to content

Commit 498cf5e

Browse files
Implement GPU hang detection
This change uses DRM_IOCTL_I915_GET_RESET_STATS to detect GPU hangs. When such a situation is encountered, zeCommandQueueSynchronize returns ZE_RESULT_ERROR_DEVICE_LOST. Related-To: NEO-5313 Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
1 parent 543c854 commit 498cf5e

37 files changed

+556
-101
lines changed

level_zero/core/source/cmdqueue/cmdqueue.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,11 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res
9797
ze_result_t CommandQueueImp::synchronize(uint64_t timeout) {
9898
if ((timeout == std::numeric_limits<uint64_t>::max()) && useKmdWaitFunction) {
9999
auto &waitPair = buffers.getCurrentFlushStamp();
100-
csr->waitForTaskCountWithKmdNotifyFallback(waitPair.first, waitPair.second, false, false);
100+
const auto waitStatus = csr->waitForTaskCountWithKmdNotifyFallback(waitPair.first, waitPair.second, false, false);
101+
if (waitStatus == NEO::WaitStatus::GpuHang) {
102+
return ZE_RESULT_ERROR_DEVICE_LOST;
103+
}
104+
101105
postSyncOperations();
102106
return ZE_RESULT_SUCCESS;
103107
} else {
@@ -116,12 +120,15 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout)
116120
timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
117121
}
118122

119-
bool ready = csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait);
120-
if (!ready) {
123+
const auto waitStatus = csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait);
124+
if (waitStatus == NEO::WaitStatus::NotReady) {
121125
return ZE_RESULT_NOT_READY;
122126
}
123-
postSyncOperations();
127+
if (waitStatus == NEO::WaitStatus::GpuHang) {
128+
return ZE_RESULT_ERROR_DEVICE_LOST;
129+
}
124130

131+
postSyncOperations();
125132
return ZE_RESULT_SUCCESS;
126133
}
127134

level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -137,23 +137,22 @@ using MultiTileCommandQueueSynchronizeTest = Test<SingleRootMultiSubDeviceFixtur
137137

138138
template <typename GfxFamily>
139139
struct SynchronizeCsr : public NEO::UltCommandStreamReceiver<GfxFamily> {
140-
141140
SynchronizeCsr(const NEO::ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield)
142141
: NEO::UltCommandStreamReceiver<GfxFamily>(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment), 0, deviceBitfield) {
143142
CommandStreamReceiver::tagAddress = &tagAddressData[0];
144143
memset(const_cast<uint32_t *>(CommandStreamReceiver::tagAddress), 0xFFFFFFFF, tagSize * sizeof(uint32_t));
145144
}
146145

147-
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
146+
WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
148147
enableTimeoutSet = enableTimeout;
149148
waitForComplitionCalledTimes++;
150149
partitionCountSet = this->activePartitions;
151-
return true;
150+
return waitForCompletionWithTimeoutResult;
152151
}
153152

154-
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override {
153+
WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override {
155154
waitForTaskCountWithKmdNotifyFallbackCalled++;
156-
NEO::UltCommandStreamReceiver<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode);
155+
return NEO::UltCommandStreamReceiver<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode);
157156
}
158157

159158
static constexpr size_t tagSize = 128;
@@ -162,6 +161,7 @@ struct SynchronizeCsr : public NEO::UltCommandStreamReceiver<GfxFamily> {
162161
uint32_t waitForTaskCountWithKmdNotifyFallbackCalled = 0;
163162
uint32_t partitionCountSet = 0;
164163
bool enableTimeoutSet = false;
164+
WaitStatus waitForCompletionWithTimeoutResult = WaitStatus::Ready;
165165
};
166166

167167
template <typename GfxFamily>
@@ -201,6 +201,61 @@ HWTEST_F(CommandQueueSynchronizeTest, givenCallToSynchronizeThenCorrectEnableTim
201201
L0::CommandQueue::fromHandle(commandQueue)->destroy();
202202
}
203203

204+
HWTEST_F(CommandQueueSynchronizeTest, givenGpuHangWhenCallingSynchronizeThenErrorIsPropagated) {
205+
auto csr = std::unique_ptr<SynchronizeCsr<FamilyType>>(new SynchronizeCsr<FamilyType>(*device->getNEODevice()->getExecutionEnvironment(),
206+
device->getNEODevice()->getDeviceBitfield()));
207+
csr->waitForCompletionWithTimeoutResult = NEO::WaitStatus::GpuHang;
208+
209+
ze_command_queue_desc_t desc{};
210+
ze_command_queue_handle_t commandQueue{};
211+
ze_result_t res = context->createCommandQueue(device, &desc, &commandQueue);
212+
213+
ASSERT_EQ(ZE_RESULT_SUCCESS, res);
214+
ASSERT_NE(nullptr, commandQueue);
215+
216+
auto queue = whitebox_cast(L0::CommandQueue::fromHandle(commandQueue));
217+
queue->csr = csr.get();
218+
219+
constexpr auto timeout{std::numeric_limits<uint64_t>::max()};
220+
const auto synchronizationResult{queue->synchronize(timeout)};
221+
222+
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, synchronizationResult);
223+
EXPECT_EQ(1u, csr->waitForComplitionCalledTimes);
224+
EXPECT_EQ(0u, csr->waitForTaskCountWithKmdNotifyFallbackCalled);
225+
EXPECT_FALSE(csr->enableTimeoutSet);
226+
227+
L0::CommandQueue::fromHandle(commandQueue)->destroy();
228+
}
229+
230+
HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledAndGpuHangWhenCallingSynchronizeThenErrorIsPropagated) {
231+
DebugManagerStateRestore restore;
232+
NEO::DebugManager.flags.OverrideUseKmdWaitFunction.set(1);
233+
234+
auto csr = std::unique_ptr<SynchronizeCsr<FamilyType>>(new SynchronizeCsr<FamilyType>(*device->getNEODevice()->getExecutionEnvironment(),
235+
device->getNEODevice()->getDeviceBitfield()));
236+
csr->waitForCompletionWithTimeoutResult = NEO::WaitStatus::GpuHang;
237+
238+
ze_command_queue_desc_t desc{};
239+
ze_command_queue_handle_t commandQueue{};
240+
ze_result_t res = context->createCommandQueue(device, &desc, &commandQueue);
241+
242+
ASSERT_EQ(ZE_RESULT_SUCCESS, res);
243+
ASSERT_NE(nullptr, commandQueue);
244+
245+
auto queue = whitebox_cast(L0::CommandQueue::fromHandle(commandQueue));
246+
queue->csr = csr.get();
247+
248+
constexpr auto timeout{std::numeric_limits<uint64_t>::max()};
249+
const auto synchronizationResult{queue->synchronize(timeout)};
250+
251+
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, synchronizationResult);
252+
EXPECT_EQ(1u, csr->waitForComplitionCalledTimes);
253+
EXPECT_EQ(1u, csr->waitForTaskCountWithKmdNotifyFallbackCalled);
254+
EXPECT_FALSE(csr->enableTimeoutSet);
255+
256+
L0::CommandQueue::fromHandle(commandQueue)->destroy();
257+
}
258+
204259
HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchronizeThenCorrectEnableTimeoutAndTimeoutValuesAreUsed) {
205260
DebugManagerStateRestore restore;
206261
NEO::DebugManager.flags.OverrideUseKmdWaitFunction.set(1);
@@ -349,7 +404,8 @@ struct TestCmdQueueCsr : public NEO::UltCommandStreamReceiver<GfxFamily> {
349404
TestCmdQueueCsr(const NEO::ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield)
350405
: NEO::UltCommandStreamReceiver<GfxFamily>(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment), 0, deviceBitfield) {
351406
}
352-
ADDMETHOD_NOBASE(waitForCompletionWithTimeout, bool, false, (bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait));
407+
408+
ADDMETHOD_NOBASE(waitForCompletionWithTimeout, NEO::WaitStatus, NEO::WaitStatus::NotReady, (bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait));
353409
};
354410

355411
HWTEST_F(CommandQueueSynchronizeTest, givenSinglePartitionCountWhenWaitFunctionFailsThenReturnNotReady) {

opencl/test/unit_test/command_queue/command_queue_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -942,8 +942,8 @@ class CommandStreamReceiverHwMock : public CommandStreamReceiverHw<GfxFamily> {
942942
: CommandStreamReceiverHw<GfxFamily>(executionEnvironment, rootDeviceIndex, deviceBitfield) {}
943943
bool wiatForTaskCountCalled = false;
944944

945-
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override {
946-
return;
945+
WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override {
946+
return WaitStatus::Ready;
947947
}
948948

949949
void waitForTaskCount(uint32_t requiredTaskCount) override {

opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1886,5 +1886,5 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenWaitForCompletionWithTimeoutI
18861886
mockCsr.latestSentTaskCount = 1;
18871887
auto cmdBuffer = std::make_unique<CommandBuffer>(*pDevice);
18881888
mockCsr.submissionAggregator->recordCommandBuffer(cmdBuffer.release());
1889-
EXPECT_FALSE(mockCsr.waitForCompletionWithTimeout(false, 0, 1));
1889+
EXPECT_EQ(NEO::WaitStatus::NotReady, mockCsr.waitForCompletionWithTimeout(false, 0, 1));
18901890
}

opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,6 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenTagValueNotMeetingTaskCountTo
738738
CpuIntrinsicsTests::pauseAddress = mockCsr->tagAddress;
739739
CpuIntrinsicsTests::pauseValue = taskCountToWait;
740740

741-
bool ret = mockCsr->waitForCompletionWithTimeout(false, 1, taskCountToWait);
742-
EXPECT_TRUE(ret);
741+
const auto ret = mockCsr->waitForCompletionWithTimeout(false, 1, taskCountToWait);
742+
EXPECT_EQ(NEO::WaitStatus::Ready, ret);
743743
}

opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2021 Intel Corporation
2+
* Copyright (C) 2020-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -468,7 +468,7 @@ HWTEST_F(UltCommandStreamReceiverTest, givenComputeOverrideDisableWhenComputeSup
468468
HWTEST_F(UltCommandStreamReceiverTest, givenSinglePartitionWhenCallingWaitKmdNotifyThenExpectImplicitBusyLoopWaitCalled) {
469469
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
470470
commandStreamReceiver.callBaseWaitForCompletionWithTimeout = false;
471-
commandStreamReceiver.returnWaitForCompletionWithTimeout = false;
471+
commandStreamReceiver.returnWaitForCompletionWithTimeout = NEO::WaitStatus::NotReady;
472472

473473
commandStreamReceiver.waitForTaskCountWithKmdNotifyFallback(0, 0, false, false);
474474
EXPECT_EQ(2u, commandStreamReceiver.waitForCompletionWithTimeoutTaskCountCalled);
@@ -477,7 +477,7 @@ HWTEST_F(UltCommandStreamReceiverTest, givenSinglePartitionWhenCallingWaitKmdNot
477477
HWTEST_F(UltCommandStreamReceiverTest, givenMultiplePartitionsWhenCallingWaitKmdNotifyThenExpectExplicitBusyLoopWaitCalled) {
478478
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
479479
commandStreamReceiver.callBaseWaitForCompletionWithTimeout = false;
480-
commandStreamReceiver.returnWaitForCompletionWithTimeout = false;
480+
commandStreamReceiver.returnWaitForCompletionWithTimeout = NEO::WaitStatus::NotReady;
481481

482482
commandStreamReceiver.waitForTaskCountWithKmdNotifyFallback(0, 0, false, false);
483483
EXPECT_EQ(2u, commandStreamReceiver.waitForCompletionWithTimeoutTaskCountCalled);

opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2021 Intel Corporation
2+
* Copyright (C) 2020-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -610,13 +610,14 @@ HWTEST_F(BcsTests, whenBlitFromHostPtrCalledThenCallWaitWithKmdFallback) {
610610
public:
611611
using UltCommandStreamReceiver<FamilyType>::UltCommandStreamReceiver;
612612

613-
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait,
614-
bool useQuickKmdSleep, bool forcePowerSavingMode) override {
613+
WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait,
614+
bool useQuickKmdSleep, bool forcePowerSavingMode) override {
615615
waitForTaskCountWithKmdNotifyFallbackCalled++;
616616
taskCountToWaitPassed = taskCountToWait;
617617
flushStampToWaitPassed = flushStampToWait;
618618
useQuickKmdSleepPassed = useQuickKmdSleep;
619619
forcePowerSavingModePassed = forcePowerSavingMode;
620+
return WaitStatus::Ready;
620621
}
621622

622623
FlushStamp flushStampToWaitPassed = 0;

opencl/test/unit_test/event/event_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,7 +1487,7 @@ struct TestEventCsr : public UltCommandStreamReceiver<GfxFamily> {
14871487
TestEventCsr(const ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield)
14881488
: UltCommandStreamReceiver<GfxFamily>(const_cast<ExecutionEnvironment &>(executionEnvironment), 0, deviceBitfield) {}
14891489

1490-
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
1490+
WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
14911491
waitForCompletionWithTimeoutCalled++;
14921492
waitForCompletionWithTimeoutParamsPassed.push_back({enableTimeout, timeoutMs, taskCountToWait});
14931493
return waitForCompletionWithTimeoutResult;
@@ -1500,7 +1500,7 @@ struct TestEventCsr : public UltCommandStreamReceiver<GfxFamily> {
15001500
};
15011501

15021502
uint32_t waitForCompletionWithTimeoutCalled = 0u;
1503-
bool waitForCompletionWithTimeoutResult = true;
1503+
WaitStatus waitForCompletionWithTimeoutResult = WaitStatus::Ready;
15041504
StackVec<WaitForCompletionWithTimeoutParams, 1> waitForCompletionWithTimeoutParamsPassed{};
15051505
};
15061506

opencl/test/unit_test/helpers/kmd_notify_tests.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ struct KmdNotifyTests : public ::testing::Test {
8888
bool waitForFlushStampResult = true;
8989
StackVec<WaitForFlushStampParams, 1> waitForFlushStampParamsPassed{};
9090

91-
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
91+
WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
9292
waitForCompletionWithTimeoutCalled++;
9393
waitForCompletionWithTimeoutParamsPassed.push_back({enableTimeout, timeoutMs, taskCountToWait});
9494
return waitForCompletionWithTimeoutResult;
@@ -101,7 +101,7 @@ struct KmdNotifyTests : public ::testing::Test {
101101
};
102102

103103
uint32_t waitForCompletionWithTimeoutCalled = 0u;
104-
bool waitForCompletionWithTimeoutResult = true;
104+
WaitStatus waitForCompletionWithTimeoutResult = WaitStatus::Ready;
105105
StackVec<WaitForCompletionWithTimeoutParams, 2> waitForCompletionWithTimeoutParamsPassed{};
106106
};
107107

@@ -127,7 +127,6 @@ struct KmdNotifyTests : public ::testing::Test {
127127

128128
HWTEST_F(KmdNotifyTests, givenTaskCountWhenWaitUntilCompletionCalledThenAlwaysTryCpuPolling) {
129129
auto csr = createMockCsr<FamilyType>();
130-
131130
cmdQ->waitUntilComplete(taskCountToWait, {}, flushStampToWait, false);
132131
EXPECT_EQ(1u, csr->waitForCompletionWithTimeoutCalled);
133132
EXPECT_EQ(true, csr->waitForCompletionWithTimeoutParamsPassed[0].enableTimeout);
@@ -138,7 +137,6 @@ HWTEST_F(KmdNotifyTests, givenTaskCountWhenWaitUntilCompletionCalledThenAlwaysTr
138137
HWTEST_F(KmdNotifyTests, givenTaskCountAndKmdNotifyDisabledWhenWaitUntilCompletionCalledThenTryCpuPollingWithoutTimeout) {
139138
overrideKmdNotifyParams(false, 0, false, 0, false, 0, false, 0);
140139
auto csr = createMockCsr<FamilyType>();
141-
142140
cmdQ->waitUntilComplete(taskCountToWait, {}, flushStampToWait, false);
143141
EXPECT_EQ(0u, csr->waitForFlushStampCalled);
144142
EXPECT_EQ(1u, csr->waitForCompletionWithTimeoutCalled);
@@ -152,7 +150,8 @@ HWTEST_F(KmdNotifyTests, givenNotReadyTaskCountWhenWaitUntilCompletionCalledThen
152150
*csr->getTagAddress() = taskCountToWait - 1;
153151

154152
::testing::InSequence is;
155-
csr->waitForCompletionWithTimeoutResult = false;
153+
154+
csr->waitForCompletionWithTimeoutResult = WaitStatus::NotReady;
156155

157156
// waitUntilComplete treats this case as unrecoverable, so the call below is expected to throw.
158157
EXPECT_THROW(cmdQ->waitUntilComplete(taskCountToWait, {}, flushStampToWait, false), std::exception);
@@ -220,7 +219,7 @@ HWTEST_F(KmdNotifyTests, givenDisabledQuickSleepWhenWaitUntilCompleteWithQuickSl
220219
HWTEST_F(KmdNotifyTests, givenNotReadyTaskCountWhenPollForCompletionCalledThenTimeout) {
221220
*device->getDefaultEngine().commandStreamReceiver->getTagAddress() = taskCountToWait - 1;
222221
auto success = device->getUltCommandStreamReceiver<FamilyType>().waitForCompletionWithTimeout(true, 1, taskCountToWait);
223-
EXPECT_FALSE(success);
222+
EXPECT_NE(NEO::WaitStatus::Ready, success);
224223
}
225224

226225
HWTEST_F(KmdNotifyTests, givenZeroFlushStampWhenWaitIsCalledThenDisableTimeout) {
@@ -263,6 +262,7 @@ HWTEST_F(KmdNotifyTests, givenNonQuickSleepRequestWhenItsNotSporadicWaitThenOver
263262
HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenPowerSavingModeIsRequestedThenTimeoutIsEnabled) {
264263
overrideKmdNotifyParams(false, 3, false, 2, false, 9999999, false, 0);
265264
auto csr = createMockCsr<FamilyType>();
265+
266266
csr->waitForTaskCountWithKmdNotifyFallback(taskCountToWait, 1, false, true);
267267
EXPECT_EQ(1u, csr->waitForCompletionWithTimeoutCalled);
268268
EXPECT_EQ(true, csr->waitForCompletionWithTimeoutParamsPassed[0].enableTimeout);

opencl/test/unit_test/kernel/kernel_tests.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,8 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
484484
return NEO::SubmissionStatus::SUCCESS;
485485
}
486486

487-
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override {
487+
WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override {
488+
return WaitStatus::Ready;
488489
}
489490
uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) override { return taskCount; };
490491

0 commit comments

Comments
 (0)