Skip to content

Commit 4ae994b

Browse files
Added retries on internal channel acquire errors.
Changed internal channel acquire to do internal retries on acquire errors. Fixed error logging messages and exception messages on timeout errors to give more information and correct cause. Co-authored-by: John Connelly <john.connelly@oracle.com>
1 parent 09102c8 commit 4ae994b

File tree

2 files changed

+75
-23
lines changed

2 files changed

+75
-23
lines changed

driver/src/main/java/oracle/nosql/driver/http/Client.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -401,9 +401,12 @@ public Result execute(Request kvRequest) {
401401
kvRequest.setStartTimeMs(startTime);
402402
final String requestClass = kvRequest.getClass().getSimpleName();
403403

404+
String requestId = "";
405+
int thisIterationTimeoutMs = 0;
406+
404407
do {
405408
long thisTime = System.currentTimeMillis();
406-
int thisIterationTimeoutMs = timeoutMs - (int)(thisTime - startTime);
409+
thisIterationTimeoutMs = timeoutMs - (int)(thisTime - startTime);
407410

408411
/*
409412
* Check rate limiters before executing the request.
@@ -462,7 +465,7 @@ public Result execute(Request kvRequest) {
462465
* operations in the loop.
463466
*/
464467
Channel channel = httpClient.getChannel(thisIterationTimeoutMs);
465-
String requestId = Long.toString(nextRequestId());
468+
requestId = Long.toString(nextRequestId());
466469
responseHandler =
467470
new ResponseHandler(httpClient, logger, channel, requestId);
468471

@@ -521,7 +524,7 @@ public Result execute(Request kvRequest) {
521524
responseHandler.await(thisIterationTimeoutMs);
522525
if (isTimeout) {
523526
throw new TimeoutException("Request timed out after " +
524-
timeoutMs + " milliseconds");
527+
timeoutMs + " milliseconds: requestId=" + requestId);
525528
}
526529

527530
if (isLoggable(logger, Level.FINE)) {
@@ -687,6 +690,7 @@ public Result execute(Request kvRequest) {
687690
throw new NoSQLException(
688691
"Unable to execute request: " + ee.getCause().getMessage());
689692
} catch (TimeoutException te) {
693+
exception = te;
690694
logInfo(logger, "Timeout exception: " + te);
691695
break; /* fall through to exception below */
692696
} catch (Throwable t) {
@@ -721,8 +725,9 @@ public Result execute(Request kvRequest) {
721725
kvRequest.setRateLimitDelayedMs(rateDelayedMs);
722726
statsControl.observeError(kvRequest);
723727
throw new RequestTimeoutException(timeoutMs,
724-
"Request timed out after " + kvRequest.getNumRetries() +
725-
(kvRequest.getNumRetries() == 1 ? " retry. " : " retries. ") +
728+
requestClass + " timed out: requestId=" + requestId + " " +
729+
" nextRequestId=" + nextRequestId() +
730+
" iterationTimeout=" + thisIterationTimeoutMs + "ms " +
726731
(kvRequest.getRetryStats() != null ?
727732
kvRequest.getRetryStats() : ""), exception);
728733
}

driver/src/main/java/oracle/nosql/driver/httpclient/HttpClient.java

Lines changed: 65 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ public class HttpClient {
9797
private final int port;
9898
private final String name;
9999

100+
/*
101+
* Amount of time to wait for acquiring a channel before timing
102+
* out and possibly retrying
103+
*/
104+
private final int acquireRetryIntervalMs;
105+
100106
/*
101107
* Non-null if using SSL
102108
*/
@@ -225,6 +231,11 @@ public HttpClient(String host,
225231
connectionPoolSize,
226232
poolMaxPending,
227233
true); /* do health check on release */
234+
235+
/* TODO: eventually add this to Config? */
236+
acquireRetryIntervalMs = Integer.getInteger(
237+
"oracle.nosql.driver.acquire.retryinterval",
238+
1000);
228239
}
229240

230241
SslContext getSslContext() {
@@ -304,31 +315,67 @@ public void shutdown() {
304315
public Channel getChannel(int timeoutMs)
305316
throws InterruptedException, ExecutionException, TimeoutException {
306317

318+
long startMs = System.currentTimeMillis();
319+
long now = startMs;
320+
int retries = 0;
321+
307322
while (true) {
323+
long msDiff = now - startMs;
324+
325+
/* retry loop with at most (retryInterval) ms timeouts */
326+
long thisTimeoutMs = (timeoutMs - msDiff);
327+
if (thisTimeoutMs <= 0) {
328+
String msg = "Timed out after " + msDiff +
329+
"ms (" + retries + " retries) trying " +
330+
"to acquire channel";
331+
logInfo(logger, "HttpClient " + name + " " + msg);
332+
throw new TimeoutException(msg);
333+
}
334+
if (thisTimeoutMs > acquireRetryIntervalMs) {
335+
thisTimeoutMs = acquireRetryIntervalMs;
336+
}
308337
Future<Channel> fut = pool.acquire();
309-
Channel retChan = fut.get(timeoutMs, TimeUnit.MILLISECONDS);
338+
Channel retChan = null;
339+
try {
340+
retChan = fut.get(thisTimeoutMs, TimeUnit.MILLISECONDS);
341+
} catch (TimeoutException e) {
342+
if (retries == 0) {
343+
logInfo(logger, "Timed out after " + msDiff +
344+
"ms trying to acquire channel: retrying");
345+
}
346+
/* fall through */
347+
}
348+
310349
/*
311-
* Ensure that the channel is in good shape
350+
* Ensure that the channel is in good shape. retChan is null
351+
* on a timeout exception from above; that path will retry.
312352
*/
313-
if (fut.isSuccess() && retChan.isActive()) {
314-
/*
315-
* Clear out any previous state. The channel should not
316-
* have any state associated with it, but this code is here
317-
* just in case it does.
318-
*/
319-
if (retChan.attr(STATE_KEY).get() != null) {
320-
if (isFineEnabled(logger)) {
321-
logFine(logger, "HttpClient acquired a channel with " +
322-
"a still-active state: clearing.");
353+
if (retChan != null) {
354+
if (fut.isSuccess() && retChan.isActive()) {
355+
/*
356+
* Clear out any previous state. The channel should not
357+
* have any state associated with it, but this code is here
358+
* just in case it does.
359+
*/
360+
if (retChan.attr(STATE_KEY).get() != null) {
361+
if (isFineEnabled(logger)) {
362+
logFine(logger,
363+
"HttpClient acquired a channel with " +
364+
"a still-active state: clearing.");
365+
}
366+
retChan.attr(STATE_KEY).set(null);
323367
}
324-
retChan.attr(STATE_KEY).set(null);
368+
return retChan;
325369
}
326-
return retChan;
370+
logFine(logger,
371+
"HttpClient " + name + ", acquired an inactive " +
372+
"channel, releasing it and retrying, reason: " +
373+
fut.cause());
374+
releaseChannel(retChan);
327375
}
328-
logInfo(logger,
329-
"HttpClient " + name + ", acquired an inactive channel, " +
330-
"releasing it and retrying, reason: " + fut.cause());
331-
releaseChannel(retChan);
376+
/* reset "now" and increment retries */
377+
now = System.currentTimeMillis();
378+
retries++;
332379
}
333380
}
334381

0 commit comments

Comments
 (0)