From a84a1cb276aded58830b9d4e91877fde70901c8a Mon Sep 17 00:00:00 2001
From: ANtutov
Date: Tue, 9 Dec 2025 22:09:52 +0000
Subject: [PATCH 1/2] fix: remove dead restart_failed_sync from custody backfill

---
 .../src/sync/custody_backfill_sync/mod.rs | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
index bb2c6799f1d..48a89917a13 100644
--- a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
+++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
@@ -113,10 +113,6 @@ pub struct CustodyBackFillSync<T: BeaconChainTypes> {
     /// These are batches that we've skipped because we have no columns to fetch for the epoch.
     skipped_batches: HashSet<Epoch>,
 
-    /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined.
-    /// This signifies that we are able to attempt to restart a failed chain.
-    restart_failed_sync: bool,
-
     /// Reference to the beacon chain to obtain initial starting points for custody backfill sync.
     beacon_chain: Arc<BeaconChain<T>>,
 
@@ -141,7 +137,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             skipped_batches: HashSet::new(),
             current_processing_batch: None,
             validated_batches: 0,
-            restart_failed_sync: false,
             beacon_chain,
             network_globals,
         }
@@ -201,7 +196,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         // Remove all batches and active requests.
         self.batches.clear();
         self.skipped_batches.clear();
-        self.restart_failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -734,7 +728,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             "Custody backfill sync completed"
         );
         self.batches.clear();
-        self.restart_failed_sync = false;
         self.processing_target = self.current_start;
         self.to_be_downloaded = self.current_start;
         self.last_batch_downloaded = false;
@@ -1093,7 +1086,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         self.pause("Sync has failed".to_string());
         // Remove all batches and active requests.
         self.batches.clear();
-        self.restart_failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -1115,12 +1107,4 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         *self.network_globals.custody_sync_state.write() = state;
     }
 
-    /// A fully synced peer has joined us.
-    /// If we are in a failed state, update a local variable to indicate we are able to restart
-    /// the failed sync on the next attempt.
-    pub fn fully_synced_peer_joined(&mut self) {
-        if matches!(self.state(), CustodyBackFillState::Pending(_)) {
-            self.restart_failed_sync = true;
-        }
-    }
 }

From 28b32592b9c2518ccfc19c2750aebb3de3f23e75 Mon Sep 17 00:00:00 2001
From: ANtutov
Date: Wed, 10 Dec 2025 23:27:24 +0200
Subject: [PATCH 2/2] fix: gate custody backfill restart on new synced peer

---
 .../src/sync/custody_backfill_sync/mod.rs | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
index 48a89917a13..d073d19ebca 100644
--- a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
+++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
@@ -113,6 +113,13 @@ pub struct CustodyBackFillSync<T: BeaconChainTypes> {
     /// These are batches that we've skipped because we have no columns to fetch for the epoch.
     skipped_batches: HashSet<Epoch>,
 
+    /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined.
+    /// This signifies that we are able to attempt to restart a failed sync.
+    restart_failed_sync: bool,
+
+    /// Indicates that the custody backfill sync has failed and is waiting to be retried.
+    failed_sync: bool,
+
     /// Reference to the beacon chain to obtain initial starting points for custody backfill sync.
     beacon_chain: Arc<BeaconChain<T>>,
 
@@ -137,6 +144,8 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             skipped_batches: HashSet::new(),
             current_processing_batch: None,
             validated_batches: 0,
+            restart_failed_sync: false,
+            failed_sync: false,
             beacon_chain,
             network_globals,
         }
@@ -196,6 +205,8 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         // Remove all batches and active requests.
         self.batches.clear();
         self.skipped_batches.clear();
+        self.restart_failed_sync = false;
+        self.failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -238,12 +249,16 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
                 }
 
                 if self.check_completed() {
+                    self.failed_sync = false;
+                    self.restart_failed_sync = false;
                     self.set_state(CustodyBackFillState::Completed);
                     return Ok(SyncStart::NotSyncing);
                 }
             }
             CustodyBackFillState::Pending(_) | CustodyBackFillState::Completed => {
                 if self.check_completed() {
+                    self.failed_sync = false;
+                    self.restart_failed_sync = false;
                     self.set_state(CustodyBackFillState::Completed);
                     return Ok(SyncStart::NotSyncing);
                 }
@@ -252,7 +267,18 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         if !self.should_start_custody_backfill_sync() {
             return Ok(SyncStart::NotSyncing);
         }
-        self.set_start_epoch();
+
+        // If the last custody backfill attempt failed, only restart once a new fully
+        // synced peer has joined and set `restart_failed_sync`.
+        if self.failed_sync {
+            if !self.restart_failed_sync {
+                return Ok(SyncStart::NotSyncing);
+            }
+            // We can now safely restart a failed sync with a fresh run id.
+            self.restart_sync();
+        } else {
+            self.set_start_epoch();
+        }
         if self
             .network_globals
             .peers
@@ -1082,10 +1108,15 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             return Ok(());
         }
 
+        // Mark this sync as failed and wait for a new fully synced peer before restarting.
+        self.failed_sync = true;
+        self.restart_failed_sync = false;
+
         // Set the state
         self.pause("Sync has failed".to_string());
         // Remove all batches and active requests.
         self.batches.clear();
+        self.skipped_batches.clear();
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -1093,7 +1124,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         self.to_be_downloaded = self.current_start;
         self.last_batch_downloaded = false;
         self.current_processing_batch = None;
-        self.restart_sync();
 
         Err(error)
     }
@@ -1107,4 +1137,12 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         *self.network_globals.custody_sync_state.write() = state;
     }
 
+    /// A fully synced peer has joined us.
+    /// If the last custody backfill sync failed, update a local variable to indicate we are able
+    /// to restart the failed sync on the next attempt.
+    pub fn fully_synced_peer_joined(&mut self) {
+        if self.failed_sync && matches!(self.state(), CustodyBackFillState::Pending(_)) {
+            self.restart_failed_sync = true;
+        }
+    }
 }
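
The two commits above amount to a small state machine over the new flags: a failed sync sets `failed_sync` and clears `restart_failed_sync`; `fully_synced_peer_joined` arms `restart_failed_sync` only while a failed sync is pending; and `start` restarts a failed sync only once both flags are set, otherwise it either stays idle or begins a fresh sync via `set_start_epoch`. The standalone sketch below models just that gate so the intended transitions can be checked in isolation. It is illustrative only: the names `RestartGate`, `on_failure`, `on_fully_synced_peer_joined`, and `decide_start` do not exist in the patched module, and the `CustodyBackFillState::Pending` check is omitted for brevity.

// Illustrative, self-contained model of the restart gate introduced in PATCH 2/2.
#[derive(Debug, PartialEq)]
enum StartDecision {
    NotSyncing,
    FreshStart,
    Restart,
}

#[derive(Default)]
struct RestartGate {
    /// Set when a sync attempt fails; cleared on completion or a full reset.
    failed_sync: bool,
    /// Set when a fully synced peer joins while a failed sync is waiting.
    restart_failed_sync: bool,
}

impl RestartGate {
    /// Mirrors the failure path: mark the sync failed and wait for a new synced peer.
    fn on_failure(&mut self) {
        self.failed_sync = true;
        self.restart_failed_sync = false;
    }

    /// Mirrors `fully_synced_peer_joined`: only arm the restart if a sync has failed.
    fn on_fully_synced_peer_joined(&mut self) {
        if self.failed_sync {
            self.restart_failed_sync = true;
        }
    }

    /// Mirrors the gate in `start`: a failed sync restarts only once armed.
    fn decide_start(&self) -> StartDecision {
        if self.failed_sync {
            if self.restart_failed_sync {
                StartDecision::Restart
            } else {
                StartDecision::NotSyncing
            }
        } else {
            StartDecision::FreshStart
        }
    }
}

fn main() {
    let mut gate = RestartGate::default();
    // No failure recorded: a normal fresh start is allowed.
    assert_eq!(gate.decide_start(), StartDecision::FreshStart);

    // After a failure, starting is blocked until a new fully synced peer joins.
    gate.on_failure();
    assert_eq!(gate.decide_start(), StartDecision::NotSyncing);

    // A fully synced peer joining arms the gate, so the failed sync may restart.
    gate.on_fully_synced_peer_joined();
    assert_eq!(gate.decide_start(), StartDecision::Restart);
}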