Skip to content

[FEATURE] Request: Add lsof Support to Identify Open-but-Deleted NFS Files (ESTALE / “Stale file handle”) #343

@x-lugoo

Description

@x-lugoo

When a file shared over NFS is deleted on the NFS server while a process on the NFS client still holds it open, lsof on the client currently offers no reliable way to filter and list the file descriptors that refer to such deleted (“stale file handle”, ESTALE) files.

This makes troubleshooting difficult: for example, the usage reported by du -sh on the server and by df -h on the client (or across hosts) can diverge more and more over time, and the root cause is hard to identify because the open-but-deleted NFS files cannot easily be discovered with lsof.

I wrote a patch as an experiment, but I am not an lsof expert and I do not think the implementation quality is good enough for submission. I’m sharing this problem description to ask whether the lsof community can provide a proper solution for detecting and filtering these NFS “deleted but still open” / ESTALE file descriptors on clients.

NFS client side:

root@virtme-ng:/home/jeffxie/vng-share/nfs# exec 3<./test.txt 

NFS server side:

# rm ./test.txt

root@virtme-ng:/home/jeffxie/vng-share# lsof +L1
(no output)

root@virtme-ng:/home/jeffxie/vng-share# stat -L /proc/399/fd/3 
stat: cannot statx '/proc/399/fd/3': Stale file handle
root@virtme-ng:/home/jeffxie/vng-share# echo $?
1

Experimental patch (adds a -y option that selects files whose stat(2)/lstat(2) fails with ESTALE):

diff --git a/Lsof.8 b/Lsof.8
index b254dd4..1da4e00 100644
--- a/Lsof.8
+++ b/Lsof.8
@@ -1302,6 +1302,10 @@ It is also useful when host name lookup is not working properly.
 .B \-N
 selects the listing of NFS files.
 .TP \w'names'u+4
+.B \-y
+selects the listing of files whose stat(2)/lstat(2) fails with
+ESTALE ("Stale file handle"), e.g., stale NFS file handles.
+.TP \w'names'u+4
 .BI \-o
 directs
 .I lsof
diff --git a/lib/common.h b/lib/common.h
index 1e6c67e..88cd4fa 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -569,14 +569,15 @@ extern int ZoneColW;
 #    define SELEVTFDINFO                                                       \
         0x200000 /* selected for evetnfd info;                                 \
                   * cleared in link_lfile() */
+#    define SELSTALE 0x400000 /* select files with ESTALE (stale file handle) */
 
 #    define SELALL                                                             \
         (SELCMD | SELCNTX | SELFD | SELNA | SELNET | SELNM | SELNFS | SELPID | \
-         SELUID | SELUNX | SELZONE | SELTASK)
+         SELUID | SELUNX | SELZONE | SELTASK | SELSTALE)
 #    define SELPROC                                                            \
         (SELCMD | SELCNTX | SELPGID | SELPID | SELUID | SELZONE | SELTASK)
 /* process selecters */
-#    define SELFILE (SELFD | SELNFS | SELNLINK | SELNM) /* file selecters */
+#    define SELFILE (SELFD | SELNFS | SELNLINK | SELNM | SELSTALE) /* file selecters */
 #    define SELNW (SELNA | SELNET | SELUNX)             /* network selecters */
 
 /*
diff --git a/lib/dialects/linux/dproc.c b/lib/dialects/linux/dproc.c
index 8dc7c27..38dc7f8 100644
--- a/lib/dialects/linux/dproc.c
+++ b/lib/dialects/linux/dproc.c
@@ -917,6 +917,8 @@ static int process_id(struct lsof_context *ctx, /* context */
     static int pathil = 0;
     char *rest;
     int txts = 0;
+    int enss_fd = 0;
+    int enls_fd = 0;
 
 #if defined(HASSELINUX)
     cntxlist_t *cntxp;
@@ -1206,6 +1208,7 @@ static int process_id(struct lsof_context *ctx, /* context */
             } else {
                 if (HasNFS) {
                     if (lstatsafely(ctx, path, &lsb)) {
+                        enls_fd = errno;
                         (void)statEx(ctx, pbuf, &lsb, &ls);
                         enls = errno;
                     } else {
@@ -1213,6 +1216,7 @@ static int process_id(struct lsof_context *ctx, /* context */
                         ls = SB_ALL;
                     }
                     if (statsafely(ctx, path, &sb)) {
+                        enss_fd = errno;
                         (void)statEx(ctx, pbuf, &sb, &ss);
                         enss = errno;
                     } else {
@@ -1352,7 +1356,11 @@ static int process_id(struct lsof_context *ctx, /* context */
                                 "[pidfd:%d]", fi.pid);
                     enter_nm(ctx, rest);
                 }
-
+                if ((Selflags & SELSTALE) &&
+                    (enss_fd == ESTALE || enls_fd == ESTALE)) {
+                    Lf->sf |= SELSTALE;
+                    (void)add_nma(ctx, " (STALE)", 8);
+                }
                 if (Lf->sf)
                     link_lfile(ctx);
             }
diff --git a/src/main.c b/src/main.c
index 5d91a6e..ec79d8a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -151,7 +151,7 @@ int main(int argc, char *argv[]) {
      * Create option mask.
      */
     (void)snpf(options, sizeof(options),
-               "?a%sbc:%sD:d:%s%sf:F:g:hHi:%s%slL:%s%snNo:Op:QPr:%ss:S:tT:u:"
+               "?a%sbc:%sD:d:%s%sf:F:g:hHi:%s%slL:%s%snNyo:Op:QPr:%ss:S:tT:u:"
                "UvVwx:%s%s%s",
 
 #if defined(HAS_AFS) && defined(HASAOPT)
@@ -700,6 +700,9 @@ int main(int argc, char *argv[]) {
         case 'N':
             Fnfs = 1;
             break;
+        case 'y':
+            Selflags |= SELSTALE;
+            break;
         case 'o':
             if (!GOv || *GOv == '-' || *GOv == '+') {
                 Foffset = 1;
diff --git a/src/usage.c b/src/usage.c
index 546b3ae..df604a8 100644
--- a/src/usage.c
+++ b/src/usage.c
@@ -299,7 +299,7 @@ void usage(struct lsof_context *ctx, /* context */
         (void)fprintf(stderr, " latest FAQ: %s\n", LSOF_FAQ_URL);
         (void)fprintf(stderr, " latest (non-formatted) man page: %s\n",
                       LSOF_MAN_URL);
-        (void)fprintf(stderr, " usage: [-?ab%shH%slnNoOP%s%stUvV%s]",
+        (void)fprintf(stderr, " usage: [-?ab%shH%slnNoyOP%s%stUvV%s]",
 
 #if defined(HASNCACHE)
                       "C",
@@ -513,6 +513,7 @@ void usage(struct lsof_context *ctx, /* context */
         col = print_in_col(col, "-l list UID numbers");
         col = print_in_col(col, "-n no host names");
         col = print_in_col(col, "-N select NFS files");
+        col = print_in_col(col, "-y select stale file handles (ESTALE)");
         col = print_in_col(col, "-o list file offset");
         col = print_in_col(col, "-O no overhead *RISKY*");
         col = print_in_col(col, "-P no port names");

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions