From a84a1cb276aded58830b9d4e91877fde70901c8a Mon Sep 17 00:00:00 2001
From: ANtutov
Date: Tue, 9 Dec 2025 22:09:52 +0000
Subject: [PATCH 1/2] fix: remove dead restart_failed_sync from custody backfill

---
 .../src/sync/custody_backfill_sync/mod.rs | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
index bb2c6799f1d..48a89917a13 100644
--- a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
+++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
@@ -113,10 +113,6 @@ pub struct CustodyBackFillSync<T: BeaconChainTypes> {
     /// These are batches that we've skipped because we have no columns to fetch for the epoch.
     skipped_batches: HashSet<Epoch>,
 
-    /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined.
-    /// This signifies that we are able to attempt to restart a failed chain.
-    restart_failed_sync: bool,
-
     /// Reference to the beacon chain to obtain initial starting points for custody backfill sync.
     beacon_chain: Arc<BeaconChain<T>>,
 
@@ -141,7 +137,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             skipped_batches: HashSet::new(),
             current_processing_batch: None,
             validated_batches: 0,
-            restart_failed_sync: false,
             beacon_chain,
             network_globals,
         }
@@ -201,7 +196,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         // Remove all batches and active requests.
         self.batches.clear();
         self.skipped_batches.clear();
-        self.restart_failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -734,7 +728,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             "Custody backfill sync completed"
         );
         self.batches.clear();
-        self.restart_failed_sync = false;
         self.processing_target = self.current_start;
         self.to_be_downloaded = self.current_start;
         self.last_batch_downloaded = false;
@@ -1093,7 +1086,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         self.pause("Sync has failed".to_string());
         // Remove all batches and active requests.
         self.batches.clear();
-        self.restart_failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -1115,12 +1107,4 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         *self.network_globals.custody_sync_state.write() = state;
     }
 
-    /// A fully synced peer has joined us.
-    /// If we are in a failed state, update a local variable to indicate we are able to restart
-    /// the failed sync on the next attempt.
-    pub fn fully_synced_peer_joined(&mut self) {
-        if matches!(self.state(), CustodyBackFillState::Pending(_)) {
-            self.restart_failed_sync = true;
-        }
-    }
 }

From 28b32592b9c2518ccfc19c2750aebb3de3f23e75 Mon Sep 17 00:00:00 2001
From: ANtutov
Date: Wed, 10 Dec 2025 23:27:24 +0200
Subject: [PATCH 2/2] fix: gate custody backfill restart on new synced peer

---
 .../src/sync/custody_backfill_sync/mod.rs | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
index 48a89917a13..d073d19ebca 100644
--- a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
+++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
@@ -113,6 +113,13 @@ pub struct CustodyBackFillSync<T: BeaconChainTypes> {
     /// These are batches that we've skipped because we have no columns to fetch for the epoch.
     skipped_batches: HashSet<Epoch>,
 
+    /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined.
+    /// This signifies that we are able to attempt to restart a failed sync.
+    restart_failed_sync: bool,
+
+    /// Indicates that the custody backfill sync has failed and is waiting to be retried.
+    failed_sync: bool,
+
     /// Reference to the beacon chain to obtain initial starting points for custody backfill sync.
     beacon_chain: Arc<BeaconChain<T>>,
 
@@ -137,6 +144,8 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             skipped_batches: HashSet::new(),
             current_processing_batch: None,
             validated_batches: 0,
+            restart_failed_sync: false,
+            failed_sync: false,
             beacon_chain,
             network_globals,
         }
@@ -196,6 +205,8 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         // Remove all batches and active requests.
         self.batches.clear();
         self.skipped_batches.clear();
+        self.restart_failed_sync = false;
+        self.failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -238,12 +249,16 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
                 }
 
                 if self.check_completed() {
+                    self.failed_sync = false;
+                    self.restart_failed_sync = false;
                     self.set_state(CustodyBackFillState::Completed);
                     return Ok(SyncStart::NotSyncing);
                 }
             }
             CustodyBackFillState::Pending(_) | CustodyBackFillState::Completed => {
                 if self.check_completed() {
+                    self.failed_sync = false;
+                    self.restart_failed_sync = false;
                     self.set_state(CustodyBackFillState::Completed);
                     return Ok(SyncStart::NotSyncing);
                 }
@@ -252,7 +267,18 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         if !self.should_start_custody_backfill_sync() {
             return Ok(SyncStart::NotSyncing);
         }
-        self.set_start_epoch();
+
+        // If the last custody backfill attempt failed, only restart once a new fully
+        // synced peer has joined and set `restart_failed_sync`.
+        if self.failed_sync {
+            if !self.restart_failed_sync {
+                return Ok(SyncStart::NotSyncing);
+            }
+            // We can now safely restart a failed sync with a fresh run id.
+            self.restart_sync();
+        } else {
+            self.set_start_epoch();
+        }
         if self
             .network_globals
             .peers
@@ -1082,10 +1108,15 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             return Ok(());
         }
 
+        // Mark this sync as failed and wait for a new fully synced peer before restarting.
+        self.failed_sync = true;
+        self.restart_failed_sync = false;
+
         // Set the state
         self.pause("Sync has failed".to_string());
         // Remove all batches and active requests.
         self.batches.clear();
+        self.skipped_batches.clear();
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -1093,7 +1124,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         self.to_be_downloaded = self.current_start;
         self.last_batch_downloaded = false;
         self.current_processing_batch = None;
-        self.restart_sync();
 
         Err(error)
     }
@@ -1107,4 +1137,12 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         *self.network_globals.custody_sync_state.write() = state;
     }
 
+    /// A fully synced peer has joined us.
+    /// If the last custody backfill sync failed, update a local variable to indicate we are able
+    /// to restart the failed sync on the next attempt.
+    pub fn fully_synced_peer_joined(&mut self) {
+        if self.failed_sync && matches!(self.state(), CustodyBackFillState::Pending(_)) {
+            self.restart_failed_sync = true;
+        }
+    }
 }
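
The two commits above amount to a small state machine over the new flags: a failed sync sets `failed_sync` and clears `restart_failed_sync`; `fully_synced_peer_joined` arms `restart_failed_sync` only while a failed sync is pending; and `start` restarts a failed sync only once both flags are set, otherwise it either stays idle or begins a fresh sync via `set_start_epoch`. The standalone sketch below models just that gate so the intended transitions can be checked in isolation. It is illustrative only: the names `RestartGate`, `on_failure`, `on_fully_synced_peer_joined`, and `decide_start` do not exist in the patched module, and the `CustodyBackFillState::Pending` check is omitted for brevity.

// Illustrative, self-contained model of the restart gate introduced in PATCH 2/2.
#[derive(Debug, PartialEq)]
enum StartDecision {
    NotSyncing,
    FreshStart,
    Restart,
}

#[derive(Default)]
struct RestartGate {
    /// Set when a sync attempt fails; cleared on completion or a full reset.
    failed_sync: bool,
    /// Set when a fully synced peer joins while a failed sync is waiting.
    restart_failed_sync: bool,
}

impl RestartGate {
    /// Mirrors the failure path: mark the sync failed and wait for a new synced peer.
    fn on_failure(&mut self) {
        self.failed_sync = true;
        self.restart_failed_sync = false;
    }

    /// Mirrors `fully_synced_peer_joined`: only arm the restart if a sync has failed.
    fn on_fully_synced_peer_joined(&mut self) {
        if self.failed_sync {
            self.restart_failed_sync = true;
        }
    }

    /// Mirrors the gate in `start`: a failed sync restarts only once armed.
    fn decide_start(&self) -> StartDecision {
        if self.failed_sync {
            if self.restart_failed_sync {
                StartDecision::Restart
            } else {
                StartDecision::NotSyncing
            }
        } else {
            StartDecision::FreshStart
        }
    }
}

fn main() {
    let mut gate = RestartGate::default();
    // No failure recorded: a normal fresh start is allowed.
    assert_eq!(gate.decide_start(), StartDecision::FreshStart);

    // After a failure, starting is blocked until a new fully synced peer joins.
    gate.on_failure();
    assert_eq!(gate.decide_start(), StartDecision::NotSyncing);

    // A fully synced peer joining arms the gate, so the failed sync may restart.
    gate.on_fully_synced_peer_joined();
    assert_eq!(gate.decide_start(), StartDecision::Restart);
}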