-
Notifications
You must be signed in to change notification settings - Fork 124
Description
When an NFS client and server share a file, and the file is deleted on the NFS server while it is still open (referenced) by a process on the NFS client, there is currently no reliable way to use lsof on the client to filter and list those file descriptors that correspond to deleted / “stale file handle” (ESTALE) files.
This makes troubleshooting difficult: for example, du -sh on the server and df -h on the client (or across hosts) can diverge more and more over time, and it is hard to identify the root cause because the open-but-deleted NFS files cannot be easily discovered with lsof.
I wrote a patch as an experiment, but I am not an lsof expert and I do not think the implementation quality is good enough for submission. I’m sharing this problem description to ask whether the lsof community can provide a proper solution for detecting and filtering these NFS “deleted but still open” / ESTALE file descriptors on clients.
NFS client side:
root@virtme-ng:/home/jeffxie/vng-share/nfs# exec 3<./test.txt
NFS server side:
# rm ./test.txt
root@virtme-ng:/home/jeffxie/vng-share# lsof +L1
(no output)
root@virtme-ng:/home/jeffxie/vng-share# stat -L /proc/399/fd/3
stat: cannot statx '/proc/399/fd/3': Stale file handle
root@virtme-ng:/home/jeffxie/vng-share# echo $?
1
Experimental patch:
diff --git a/Lsof.8 b/Lsof.8
index b254dd4..1da4e00 100644
--- a/Lsof.8
+++ b/Lsof.8
@@ -1302,6 +1302,10 @@ It is also useful when host name lookup is not working properly.
.B \-N
selects the listing of NFS files.
.TP \w'names'u+4
+.B \-y
+selects the listing of files whose stat(2)/lstat(2) fails with
+ESTALE ("Stale file handle"), e.g., stale NFS file handles.
+.TP \w'names'u+4
.BI \-o
directs
.I lsof
diff --git a/lib/common.h b/lib/common.h
index 1e6c67e..88cd4fa 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -569,14 +569,15 @@ extern int ZoneColW;
# define SELEVTFDINFO \
0x200000 /* selected for evetnfd info; \
* cleared in link_lfile() */
+# define SELSTALE 0x400000 /* select files with ESTALE (stale file handle) */
# define SELALL \
(SELCMD | SELCNTX | SELFD | SELNA | SELNET | SELNM | SELNFS | SELPID | \
- SELUID | SELUNX | SELZONE | SELTASK)
+ SELUID | SELUNX | SELZONE | SELTASK | SELSTALE)
# define SELPROC \
(SELCMD | SELCNTX | SELPGID | SELPID | SELUID | SELZONE | SELTASK)
/* process selecters */
-# define SELFILE (SELFD | SELNFS | SELNLINK | SELNM) /* file selecters */
+# define SELFILE (SELFD | SELNFS | SELNLINK | SELNM | SELSTALE) /* file selecters */
# define SELNW (SELNA | SELNET | SELUNX) /* network selecters */
/*
diff --git a/lib/dialects/linux/dproc.c b/lib/dialects/linux/dproc.c
index 8dc7c27..38dc7f8 100644
--- a/lib/dialects/linux/dproc.c
+++ b/lib/dialects/linux/dproc.c
@@ -917,6 +917,8 @@ static int process_id(struct lsof_context *ctx, /* context */
static int pathil = 0;
char *rest;
int txts = 0;
+ int enss_fd = 0;
+ int enls_fd = 0;
#if defined(HASSELINUX)
cntxlist_t *cntxp;
@@ -1206,6 +1208,7 @@ static int process_id(struct lsof_context *ctx, /* context */
} else {
if (HasNFS) {
if (lstatsafely(ctx, path, &lsb)) {
+ enls_fd = errno;
(void)statEx(ctx, pbuf, &lsb, &ls);
enls = errno;
} else {
@@ -1213,6 +1216,7 @@ static int process_id(struct lsof_context *ctx, /* context */
ls = SB_ALL;
}
if (statsafely(ctx, path, &sb)) {
+ enss_fd = errno;
(void)statEx(ctx, pbuf, &sb, &ss);
enss = errno;
} else {
@@ -1352,7 +1356,11 @@ static int process_id(struct lsof_context *ctx, /* context */
"[pidfd:%d]", fi.pid);
enter_nm(ctx, rest);
}
-
+ if ((Selflags & SELSTALE) &&
+ (enss_fd == ESTALE || enls_fd == ESTALE)) {
+ Lf->sf |= SELSTALE;
+ (void)add_nma(ctx, " (STALE)", 8);
+ }
if (Lf->sf)
link_lfile(ctx);
}
diff --git a/src/main.c b/src/main.c
index 5d91a6e..ec79d8a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -151,7 +151,7 @@ int main(int argc, char *argv[]) {
* Create option mask.
*/
(void)snpf(options, sizeof(options),
- "?a%sbc:%sD:d:%s%sf:F:g:hHi:%s%slL:%s%snNo:Op:QPr:%ss:S:tT:u:"
+ "?a%sbc:%sD:d:%s%sf:F:g:hHi:%s%slL:%s%snNyo:Op:QPr:%ss:S:tT:u:"
"UvVwx:%s%s%s",
#if defined(HAS_AFS) && defined(HASAOPT)
@@ -700,6 +700,9 @@ int main(int argc, char *argv[]) {
case 'N':
Fnfs = 1;
break;
+ case 'y':
+ Selflags |= SELSTALE;
+ break;
case 'o':
if (!GOv || *GOv == '-' || *GOv == '+') {
Foffset = 1;
diff --git a/src/usage.c b/src/usage.c
index 546b3ae..df604a8 100644
--- a/src/usage.c
+++ b/src/usage.c
@@ -299,7 +299,7 @@ void usage(struct lsof_context *ctx, /* context */
(void)fprintf(stderr, " latest FAQ: %s\n", LSOF_FAQ_URL);
(void)fprintf(stderr, " latest (non-formatted) man page: %s\n",
LSOF_MAN_URL);
- (void)fprintf(stderr, " usage: [-?ab%shH%slnNoOP%s%stUvV%s]",
+ (void)fprintf(stderr, " usage: [-?ab%shH%slnNoyOP%s%stUvV%s]",
#if defined(HASNCACHE)
"C",
@@ -513,6 +513,7 @@ void usage(struct lsof_context *ctx, /* context */
col = print_in_col(col, "-l list UID numbers");
col = print_in_col(col, "-n no host names");
col = print_in_col(col, "-N select NFS files");
+ col = print_in_col(col, "-y select stale file handles (ESTALE)");
col = print_in_col(col, "-o list file offset");
col = print_in_col(col, "-O no overhead *RISKY*");
col = print_in_col(col, "-P no port names");