From 970d4d9459aa159ae8b92ae4b09bb0a7923d9f81 Mon Sep 17 00:00:00 2001 From: Joost Jager Date: Thu, 26 Feb 2026 10:09:31 +0100 Subject: [PATCH 1/3] Replace dual-sync-async persistence panic with Watch contract Commit 0760f99 ("Disallow dual-sync-async persistence without restarting") added a panic in non-test builds when a Persist implementation returns both Completed and InProgress from the same ChannelManager instance. However, this check runs against the status that ChainMonitor returns to ChannelManager, not the raw Persist result. When ChannelMonitor::update_monitor fails (e.g. a counterparty commitment_signed arrives after a funding spend confirms), ChainMonitor persists the full monitor successfully but overrides the return value to InProgress. If the user's Persist impl only ever returns Completed, this override triggers a false mode-mismatch panic. This replaces the panic with a per-channel contract at the Watch trait level: a Watch implementation must not return Completed for a channel update while prior InProgress updates are still pending. Switching from Completed to InProgress is always allowed, but switching back is impractical because the Watch implementation cannot observe when ChannelManager has finished processing a MonitorEvent::Completed. The documentation on ChannelMonitorUpdateStatus is updated to describe these rules. The mode tracking and panic checks from 0760f99 are removed and replaced with a panic that validates the new contract directly on the in-flight update state. Legacy tests that switch the persister between modes mid-flight can opt out via Node::disable_monitor_completeness_assertion(). Co-Authored-By: Claude Opus 4.6 --- lightning/src/chain/channelmonitor.rs | 1 + lightning/src/chain/mod.rs | 12 ++---- lightning/src/ln/chanmon_update_fail_tests.rs | 6 +++ lightning/src/ln/channelmanager.rs | 41 +++++++++---------- lightning/src/ln/functional_test_utils.rs | 8 ++++ lightning/src/ln/monitor_tests.rs | 1 + lightning/src/ln/reload_tests.rs | 3 ++ 7 files changed, 43 insertions(+), 29 deletions(-) diff --git a/lightning/src/chain/channelmonitor.rs b/lightning/src/chain/channelmonitor.rs index 8e7b6035523..1eb1484d07d 100644 --- a/lightning/src/chain/channelmonitor.rs +++ b/lightning/src/chain/channelmonitor.rs @@ -7048,6 +7048,7 @@ mod tests { let legacy_cfg = test_legacy_channel_config(); let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[Some(legacy_cfg.clone()), Some(legacy_cfg.clone()), Some(legacy_cfg)]); let nodes = create_network(3, &node_cfgs, &node_chanmgrs); + nodes[1].disable_monitor_completeness_assertion(); let channel = create_announced_chan_between_nodes(&nodes, 0, 1); create_announced_chan_between_nodes(&nodes, 1, 2); diff --git a/lightning/src/chain/mod.rs b/lightning/src/chain/mod.rs index bc47f1b1db6..99e184d8fda 100644 --- a/lightning/src/chain/mod.rs +++ b/lightning/src/chain/mod.rs @@ -233,11 +233,10 @@ pub enum ChannelMonitorUpdateStatus { /// This includes performing any `fsync()` calls required to ensure the update is guaranteed to /// be available on restart even if the application crashes. /// - /// If you return this variant, you cannot later return [`InProgress`] from the same instance of - /// [`Persist`]/[`Watch`] without first restarting. + /// You cannot switch from [`InProgress`] to this variant for the same channel without first + /// restarting. However, switching from this variant to [`InProgress`] is always allowed. /// /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress - /// [`Persist`]: chainmonitor::Persist Completed, /// Indicates that the update will happen asynchronously in the background or that a transient /// failure occurred which is being retried in the background and will eventually complete. @@ -263,12 +262,7 @@ pub enum ChannelMonitorUpdateStatus { /// reliable, this feature is considered beta, and a handful of edge-cases remain. Until the /// remaining cases are fixed, in rare cases, *using this feature may lead to funds loss*. /// - /// If you return this variant, you cannot later return [`Completed`] from the same instance of - /// [`Persist`]/[`Watch`] without first restarting. - /// /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress - /// [`Completed`]: ChannelMonitorUpdateStatus::Completed - /// [`Persist`]: chainmonitor::Persist InProgress, /// Indicates that an update has failed and will not complete at any point in the future. /// @@ -328,6 +322,8 @@ pub trait Watch { /// cannot be retried, the node should shut down immediately after returning /// [`ChannelMonitorUpdateStatus::UnrecoverableError`], see its documentation for more info. /// + /// See [`ChannelMonitorUpdateStatus`] for requirements on when each variant may be returned. + /// /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager fn update_channel( &self, channel_id: ChannelId, update: &ChannelMonitorUpdate, diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs index 623d028560f..11fc8ac0ea3 100644 --- a/lightning/src/ln/chanmon_update_fail_tests.rs +++ b/lightning/src/ln/chanmon_update_fail_tests.rs @@ -176,6 +176,7 @@ fn do_test_simple_monitor_temporary_update_fail(disconnect: bool) { let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); + nodes[0].disable_monitor_completeness_assertion(); let node_a_id = nodes[0].node.get_our_node_id(); let node_b_id = nodes[1].node.get_our_node_id(); @@ -317,6 +318,7 @@ fn do_test_monitor_temporary_update_fail(disconnect_count: usize) { let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); + nodes[0].disable_monitor_completeness_assertion(); let node_a_id = nodes[0].node.get_our_node_id(); let node_b_id = nodes[1].node.get_our_node_id(); @@ -970,6 +972,7 @@ fn do_test_monitor_update_fail_raa(test_ignore_second_cs: bool) { let node_cfgs = create_node_cfgs(3, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[None, None, None]); let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs); + nodes[1].disable_monitor_completeness_assertion(); let node_a_id = nodes[0].node.get_our_node_id(); let node_b_id = nodes[1].node.get_our_node_id(); @@ -1501,6 +1504,7 @@ fn claim_while_disconnected_monitor_update_fail() { let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); + nodes[1].disable_monitor_completeness_assertion(); let node_a_id = nodes[0].node.get_our_node_id(); let node_b_id = nodes[1].node.get_our_node_id(); @@ -1728,6 +1732,7 @@ fn first_message_on_recv_ordering() { let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs); + nodes[1].disable_monitor_completeness_assertion(); let node_a_id = nodes[0].node.get_our_node_id(); let node_b_id = nodes[1].node.get_our_node_id(); @@ -3850,6 +3855,7 @@ fn do_test_durable_preimages_on_closed_channel( // Now reload node B let manager_b = nodes[1].node.encode(); reload_node!(nodes[1], &manager_b, &[&mon_ab, &mon_bc], persister, chain_mon, node_b_reload); + nodes[1].disable_monitor_completeness_assertion(); nodes[0].node.peer_disconnected(node_b_id); nodes[2].node.peer_disconnected(node_b_id); diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 70617b20894..3ec174c58e1 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -2870,12 +2870,12 @@ pub struct ChannelManager< #[cfg(any(test, feature = "_test_utils"))] pub(super) per_peer_state: FairRwLock>>>, - /// We only support using one of [`ChannelMonitorUpdateStatus::InProgress`] and - /// [`ChannelMonitorUpdateStatus::Completed`] without restarting. Because the API does not - /// otherwise directly enforce this, we enforce it in non-test builds here by storing which one - /// is in use. - #[cfg(not(any(test, feature = "_externalize_tests")))] - monitor_update_type: AtomicUsize, + /// When set, disables the panic when `Watch::update_channel` returns `Completed` while + /// prior updates are still `InProgress`. Some legacy tests switch the persister between + /// `InProgress` and `Completed` mid-flight, which violates this contract but is otherwise + /// harmless in a test context. + #[cfg(test)] + pub(crate) skip_monitor_update_assertion: AtomicBool, /// The set of events which we need to give to the user to handle. In some cases an event may /// require some further action after the user handles it (currently only blocking a monitor @@ -3618,8 +3618,8 @@ impl< per_peer_state: FairRwLock::new(new_hash_map()), - #[cfg(not(any(test, feature = "_externalize_tests")))] - monitor_update_type: AtomicUsize::new(0), + #[cfg(test)] + skip_monitor_update_assertion: AtomicBool::new(false), pending_events: Mutex::new(VecDeque::new()), pending_events_processor: AtomicBool::new(false), @@ -10380,6 +10380,15 @@ This indicates a bug inside LDK. Please report this error at https://github.com/ if update_completed { let _ = in_flight_updates.remove(update_idx); } + // A Watch implementation must not return Completed while prior updates are + // still InProgress, as this would violate the async persistence contract. + #[cfg(test)] + let skip_check = self.skip_monitor_update_assertion.load(Ordering::Relaxed); + #[cfg(not(test))] + let skip_check = false; + if !skip_check && update_completed && !in_flight_updates.is_empty() { + panic!("Watch::update_channel returned Completed while prior updates are still InProgress"); + } (update_completed, update_completed && in_flight_updates.is_empty()) } else { // We blindly assume that the ChannelMonitorUpdate will be regenerated on startup if we @@ -10445,23 +10454,13 @@ This indicates a bug inside LDK. Please report this error at https://github.com/ panic!("{}", err_str); }, ChannelMonitorUpdateStatus::InProgress => { - #[cfg(not(any(test, feature = "_externalize_tests")))] - if self.monitor_update_type.swap(1, Ordering::Relaxed) == 2 { - panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart"); - } log_debug!( logger, "ChannelMonitor update in flight, holding messages until the update completes.", ); false }, - ChannelMonitorUpdateStatus::Completed => { - #[cfg(not(any(test, feature = "_externalize_tests")))] - if self.monitor_update_type.swap(2, Ordering::Relaxed) == 1 { - panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart"); - } - true - }, + ChannelMonitorUpdateStatus::Completed => true, } } @@ -20112,8 +20111,8 @@ impl< per_peer_state: FairRwLock::new(per_peer_state), - #[cfg(not(any(test, feature = "_externalize_tests")))] - monitor_update_type: AtomicUsize::new(0), + #[cfg(test)] + skip_monitor_update_assertion: AtomicBool::new(false), pending_events: Mutex::new(pending_events_read), pending_events_processor: AtomicBool::new(false), diff --git a/lightning/src/ln/functional_test_utils.rs b/lightning/src/ln/functional_test_utils.rs index 596b2420ca2..a16adf88b51 100644 --- a/lightning/src/ln/functional_test_utils.rs +++ b/lightning/src/ln/functional_test_utils.rs @@ -598,6 +598,14 @@ impl<'a, 'b, 'c> Node<'a, 'b, 'c> { self.node.init_features() | self.onion_messenger.provided_init_features(peer_node_id) }) } + + /// Disables the panic when `Watch::update_channel` returns `Completed` while prior updates + /// are still `InProgress`. Some legacy tests switch the persister between modes mid-flight, + /// which violates this contract but is otherwise harmless. + #[cfg(test)] + pub fn disable_monitor_completeness_assertion(&self) { + self.node.skip_monitor_update_assertion.store(true, core::sync::atomic::Ordering::Relaxed); + } } impl<'a, 'b, 'c> std::panic::UnwindSafe for Node<'a, 'b, 'c> {} diff --git a/lightning/src/ln/monitor_tests.rs b/lightning/src/ln/monitor_tests.rs index 2368776dd3f..efd2084a38e 100644 --- a/lightning/src/ln/monitor_tests.rs +++ b/lightning/src/ln/monitor_tests.rs @@ -3384,6 +3384,7 @@ fn test_claim_event_never_handled() { let chan_0_monitor_serialized = get_monitor!(nodes[1], chan.2).encode(); let mons = &[&chan_0_monitor_serialized[..]]; reload_node!(nodes[1], &init_node_ser, mons, persister, new_chain_mon, nodes_1_reload); + nodes[1].disable_monitor_completeness_assertion(); expect_payment_claimed!(nodes[1], payment_hash_a, 1_000_000); // The reload logic spuriously generates a redundant payment preimage-containing diff --git a/lightning/src/ln/reload_tests.rs b/lightning/src/ln/reload_tests.rs index bb730f8fba8..8d9eac5c001 100644 --- a/lightning/src/ln/reload_tests.rs +++ b/lightning/src/ln/reload_tests.rs @@ -823,12 +823,14 @@ fn do_test_partial_claim_before_restart(persist_both_monitors: bool, double_rest // Now restart nodes[3]. reload_node!(nodes[3], original_manager.clone(), &[&updated_monitor.0, &original_monitor.0], persist_d_1, chain_d_1, node_d_1); + nodes[3].disable_monitor_completeness_assertion(); if double_restart { // Previously, we had a bug where we'd fail to reload if we re-persist the `ChannelManager` // without updating any `ChannelMonitor`s as we'd fail to double-initiate the claim replay. // We test that here ensuring that we can reload again. reload_node!(nodes[3], node_d_1.encode(), &[&updated_monitor.0, &original_monitor.0], persist_d_2, chain_d_2, node_d_2); + nodes[3].disable_monitor_completeness_assertion(); } // Until the startup background events are processed (in `get_and_clear_pending_events`, @@ -2216,6 +2218,7 @@ fn test_reload_with_mpp_claims_on_same_channel() { nodes_1_deserialized, Some(true) ); + nodes[1].disable_monitor_completeness_assertion(); // When the claims are reconstructed during reload, PaymentForwarded events are regenerated. let events = nodes[1].node.get_and_clear_pending_events(); From 1d3704d53252bc6aa714df604d01bf74f8f2abf7 Mon Sep 17 00:00:00 2001 From: Joost Jager Date: Thu, 5 Mar 2026 13:49:57 +0100 Subject: [PATCH 2/3] Add test for monitor update after funding spend Add a regression test that reproduces the panic when a commitment_signed is processed after the counterparty commitment transaction has confirmed. The ChannelMonitor's no_further_updates_allowed() returns true, causing update_monitor to fail, which ChainMonitor overrides to InProgress. A subsequent preimage claim returning Completed then triggers the per-channel assertion that Completed must not follow InProgress. AI tools were used in preparing this commit. --- lightning/src/ln/chanmon_update_fail_tests.rs | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs index 11fc8ac0ea3..87856c950d1 100644 --- a/lightning/src/ln/chanmon_update_fail_tests.rs +++ b/lightning/src/ln/chanmon_update_fail_tests.rs @@ -16,7 +16,7 @@ use crate::chain::chaininterface::LowerBoundedFeeEstimator; use crate::chain::chainmonitor::ChainMonitor; use crate::chain::channelmonitor::{ChannelMonitor, MonitorEvent, ANTI_REORG_DELAY}; use crate::chain::transaction::OutPoint; -use crate::chain::{ChannelMonitorUpdateStatus, Listen, Watch}; +use crate::chain::{ChannelMonitorUpdateStatus, Confirm, Listen, Watch}; use crate::events::{ClosureReason, Event, HTLCHandlingFailureType, PaymentPurpose}; use crate::ln::channel::AnnouncementSigsState; use crate::ln::channelmanager::{PaymentId, RAACommitmentOrder}; @@ -5421,3 +5421,72 @@ fn test_late_counterparty_commitment_update_after_holder_commitment_spend() { fn test_late_counterparty_commitment_update_after_holder_commitment_spend_dust() { do_test_late_counterparty_commitment_update_after_holder_commitment_spend(true); } + +#[test] +#[should_panic( + expected = "Watch::update_channel returned Completed while prior updates are still InProgress" +)] +fn test_monitor_update_fail_after_funding_spend() { + // When a counterparty commitment transaction confirms (funding spend), the + // ChannelMonitor sets funding_spend_seen. If a commitment_signed from the + // counterparty is then processed (a race between chain events and message + // processing), update_monitor returns Err because no_further_updates_allowed() + // is true. ChainMonitor overrides the result to InProgress, permanently + // freezing the channel. A subsequent preimage claim returning Completed then + // triggers the per-channel assertion. + let chanmon_cfgs = create_chanmon_cfgs(2); + let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); + let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); + let nodes = create_network(2, &node_cfgs, &node_chanmgrs); + + let node_a_id = nodes[0].node.get_our_node_id(); + + let (_, _, chan_id, _) = create_announced_chan_between_nodes(&nodes, 0, 1); + + // Route payment 1 fully so B can claim it later. + let (payment_preimage_1, _payment_hash_1, ..) = + route_payment(&nodes[0], &[&nodes[1]], 1_000_000); + + // Get A's commitment tx (this is the "counterparty" commitment from B's perspective). + let as_commitment_tx = get_local_commitment_txn!(nodes[0], chan_id); + assert_eq!(as_commitment_tx.len(), 1); + + // Confirm A's commitment tx on B's chain_monitor ONLY (not on B's ChannelManager). + // This sets funding_spend_seen in the monitor, making no_further_updates_allowed() true. + let (block_hash, height) = nodes[1].best_block_info(); + let block = create_dummy_block(block_hash, height + 1, vec![as_commitment_tx[0].clone()]); + let txdata: Vec<_> = block.txdata.iter().enumerate().collect(); + nodes[1].chain_monitor.chain_monitor.transactions_confirmed(&block.header, &txdata, height + 1); + + // Send payment 2 from A to B. + let (route, payment_hash_2, _, payment_secret_2) = + get_route_and_payment_hash!(&nodes[0], nodes[1], 1_000_000); + nodes[0] + .node + .send_payment_with_route( + route, + payment_hash_2, + RecipientOnionFields::secret_only(payment_secret_2, 1_000_000), + PaymentId(payment_hash_2.0), + ) + .unwrap(); + check_added_monitors(&nodes[0], 1); + + let mut events = nodes[0].node.get_and_clear_pending_msg_events(); + assert_eq!(events.len(), 1); + let payment_event = SendEvent::from_event(events.remove(0)); + + nodes[1].node.handle_update_add_htlc(node_a_id, &payment_event.msgs[0]); + + // B processes commitment_signed. The monitor's update_monitor succeeds on the + // update steps, but returns Err at the end because no_further_updates_allowed() + // is true (funding_spend_seen). ChainMonitor overrides the result to InProgress. + nodes[1].node.handle_commitment_signed(node_a_id, &payment_event.commitment_msg[0]); + check_added_monitors(&nodes[1], 1); + assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty()); + + // B claims payment 1. The PaymentPreimage monitor update returns Completed + // (update_monitor succeeds for preimage, and persister returns Completed), + // but the prior InProgress from the commitment_signed is still pending. + nodes[1].node.claim_funds(payment_preimage_1); +} From 533f37fb4859c8d093dc74e4d13a6f291f689697 Mon Sep 17 00:00:00 2001 From: Joost Jager Date: Mon, 9 Mar 2026 06:27:30 -0400 Subject: [PATCH 3/3] Defer monitor update completions after funding spend When no_further_updates_allowed() is true and the persister returns Completed, ChainMonitor now overrides the return to InProgress and queues the update_id in deferred_monitor_update_completions. These are resolved in release_pending_monitor_events, so ChannelManager sees them complete together with the force-close MonitorEvents. This eliminates phantom InProgress entries that would never complete: previously, a rejected pre-close update (e.g. commitment_signed arriving after funding spend) returned InProgress with no completion path, blocking MonitorUpdateCompletionActions (PaymentClaimed, PaymentForwarded) indefinitely. A subsequent post-close update returning Completed would then violate the in-order completion invariant. AI tools were used in preparing this commit. --- lightning/src/chain/chainmonitor.rs | 94 +++++++++++- lightning/src/ln/chanmon_update_fail_tests.rs | 144 +++++++++++++----- lightning/src/ln/channelmanager.rs | 5 +- 3 files changed, 197 insertions(+), 46 deletions(-) diff --git a/lightning/src/chain/chainmonitor.rs b/lightning/src/chain/chainmonitor.rs index 396ee277067..612002d11f6 100644 --- a/lightning/src/chain/chainmonitor.rs +++ b/lightning/src/chain/chainmonitor.rs @@ -238,6 +238,11 @@ struct MonitorHolder { /// [`ChannelMonitorUpdate`] which was already applied. While this isn't an issue for the /// LDK-provided update-based [`Persist`], it is somewhat surprising for users so we avoid it. pending_monitor_updates: Mutex>, + /// Monitor update IDs for which the persister returned `Completed` but we overrode the return + /// to `InProgress` because the channel is post-close (`no_further_updates_allowed()` is true). + /// Resolved during the next monitor event release so that `ChannelManager` sees them complete + /// together with the force-close `MonitorEvent`s. + deferred_monitor_update_completions: Mutex>, } impl MonitorHolder { @@ -1100,7 +1105,11 @@ where if let Some(ref chain_source) = self.chain_source { monitor.load_outputs_to_watch(chain_source, &self.logger); } - entry.insert(MonitorHolder { monitor, pending_monitor_updates: Mutex::new(Vec::new()) }); + entry.insert(MonitorHolder { + monitor, + pending_monitor_updates: Mutex::new(Vec::new()), + deferred_monitor_update_completions: Mutex::new(Vec::new()), + }); Ok(ChannelMonitorUpdateStatus::Completed) } @@ -1141,6 +1150,7 @@ where entry.insert(MonitorHolder { monitor, pending_monitor_updates: Mutex::new(pending_monitor_updates), + deferred_monitor_update_completions: Mutex::new(Vec::new()), }); Ok(persist_res) } @@ -1171,6 +1181,12 @@ where let logger = WithChannelMonitor::from(&self.logger, &monitor, None); log_trace!(logger, "Updating ChannelMonitor to id {}", update.update_id,); + // We hold `deferred_monitor_update_completions` through the entire + // update process so that `release_pending_monitor_events` either + // sees both the `MonitorEvent`s generated by `update_monitor` and + // the corresponding deferral entry, or neither. + let mut deferred = + monitor_state.deferred_monitor_update_completions.lock().unwrap(); // We hold a `pending_monitor_updates` lock through `update_monitor` to ensure we // have well-ordered updates from the users' point of view. See the // `pending_monitor_updates` docs for more. @@ -1222,6 +1238,7 @@ where ChannelMonitorUpdateStatus::UnrecoverableError => { // Take the monitors lock for writing so that we poison it and any future // operations going forward fail immediately. + core::mem::drop(deferred); core::mem::drop(pending_monitor_updates); core::mem::drop(monitors); let _poison = self.monitors.write().unwrap(); @@ -1250,7 +1267,31 @@ where } } - if update_res.is_err() { + debug_assert!( + update_res.is_ok() || monitor.no_further_updates_allowed(), + "update_monitor returned Err but channel is not post-close", + ); + + // We also check update_res.is_err() as a defensive measure: an + // error should only occur on a post-close monitor (validated by + // the debug_assert above), but we defer here regardless to avoid + // returning Completed for a failed update. + if (update_res.is_err() || monitor.no_further_updates_allowed()) + && persist_res == ChannelMonitorUpdateStatus::Completed + { + // The channel is post-close (funding spend seen, lockdown, or holder + // tx signed). Return InProgress so ChannelManager freezes the + // channel until the force-close MonitorEvents are processed. We + // track this update_id in deferred_monitor_update_completions so it + // gets resolved during release_pending_monitor_events, together with + // those MonitorEvents. + pending_monitor_updates.push(update_id); + deferred.push(update_id); + log_debug!( + logger, + "Deferring completion of ChannelMonitorUpdate id {:?} (channel is post-close)", + update_id, + ); ChannelMonitorUpdateStatus::InProgress } else { persist_res @@ -1614,8 +1655,13 @@ where for (channel_id, update_id) in self.persister.get_and_clear_completed_updates() { let _ = self.channel_monitor_updated(channel_id, update_id); } + let monitors = self.monitors.read().unwrap(); let mut pending_monitor_events = self.pending_monitor_events.lock().unwrap().split_off(0); - for monitor_state in self.monitors.read().unwrap().values() { + for monitor_state in monitors.values() { + // Hold the deferred lock across both `get_and_clear_pending_monitor_events` + // and the deferred resolution so that `update_monitor` cannot insert a + // `MonitorEvent` and deferral entry between the two steps. + let mut deferred = monitor_state.deferred_monitor_update_completions.lock().unwrap(); let monitor_events = monitor_state.monitor.get_and_clear_pending_monitor_events(); if monitor_events.len() > 0 { let monitor_funding_txo = monitor_state.monitor.get_funding_txo(); @@ -1628,6 +1674,48 @@ where counterparty_node_id, )); } + // Resolve any deferred monitor update completions after collecting regular monitor + // events. The regular events include the force-close (CommitmentTxConfirmed), which + // ChannelManager processes first. The deferred completions come after, so that + // completion actions resolve once the ChannelForceClosed update (generated during + // force-close processing) also gets deferred and resolved in the next event cycle. + let mut pending = monitor_state.pending_monitor_updates.lock().unwrap(); + for update_id in deferred.drain(..) { + let Some(pos) = pending.iter().position(|id| *id == update_id) else { + // Already resolved by channel_monitor_updated; skip to avoid + // duplicate MonitorEvent::Completed. + let channel_id = monitor_state.monitor.channel_id(); + debug_assert!( + false, + "Deferred update {} already resolved for channel {}", + update_id, channel_id + ); + let logger = WithContext::from( + &self.logger, + Some(monitor_state.monitor.get_counterparty_node_id()), + Some(channel_id), + None, + ); + log_error!(logger, "Deferred update {} already resolved", update_id); + continue; + }; + pending.remove(pos); + let monitor_is_pending_updates = monitor_state.has_pending_updates(&pending); + if !monitor_is_pending_updates { + let funding_txo = monitor_state.monitor.get_funding_txo(); + let channel_id = monitor_state.monitor.channel_id(); + pending_monitor_events.push(( + funding_txo, + channel_id, + vec![MonitorEvent::Completed { + funding_txo, + channel_id, + monitor_update_id: monitor_state.monitor.get_latest_update_id(), + }], + monitor_state.monitor.get_counterparty_node_id(), + )); + } + } } pending_monitor_events } diff --git a/lightning/src/ln/chanmon_update_fail_tests.rs b/lightning/src/ln/chanmon_update_fail_tests.rs index 87856c950d1..0d8a4a020f0 100644 --- a/lightning/src/ln/chanmon_update_fail_tests.rs +++ b/lightning/src/ln/chanmon_update_fail_tests.rs @@ -3905,11 +3905,28 @@ fn do_test_durable_preimages_on_closed_channel( } if !close_chans_before_reload { check_closed_broadcast(&nodes[1], 1, false); - let reason = ClosureReason::CommitmentTxConfirmed; - check_closed_event(&nodes[1], 1, reason, &[node_a_id], 100000); + // When hold=false, get_and_clear_pending_events also triggers + // process_background_events (replaying the preimage and force-close updates) + // and resolves the deferred completions, firing PaymentForwarded alongside + // ChannelClosed. When hold=true, only ChannelClosed fires. + let evs = nodes[1].node.get_and_clear_pending_events(); + let expected = if hold_post_reload_mon_update { 1 } else { 2 }; + assert_eq!(evs.len(), expected, "{:?}", evs); + assert!(evs.iter().any(|e| matches!( + e, + Event::ChannelClosed { reason: ClosureReason::CommitmentTxConfirmed, .. } + ))); + if !hold_post_reload_mon_update { + assert!(evs.iter().any(|e| matches!(e, Event::PaymentForwarded { .. }))); + check_added_monitors(&nodes[1], mons_added); + } } nodes[1].node.timer_tick_occurred(); - check_added_monitors(&nodes[1], mons_added); + // For !close_chans_before_reload && !hold, background events were already replayed + // during get_and_clear_pending_events above, so timer_tick adds no monitors. + let expected_mons = + if !close_chans_before_reload && !hold_post_reload_mon_update { 0 } else { mons_added }; + check_added_monitors(&nodes[1], expected_mons); // Finally, check that B created a payment preimage transaction and close out the payment. let bs_txn = nodes[1].tx_broadcaster.txn_broadcasted.lock().unwrap().split_off(0); @@ -3924,44 +3941,61 @@ fn do_test_durable_preimages_on_closed_channel( check_closed_broadcast(&nodes[0], 1, false); expect_payment_sent(&nodes[0], payment_preimage, None, true, true); + if close_chans_before_reload && !hold_post_reload_mon_update { + // For close_chans_before_reload with hold=false, the deferred completions + // haven't been processed yet. Trigger process_pending_monitor_events now. + let _ = nodes[1].node.get_and_clear_pending_msg_events(); + check_added_monitors(&nodes[1], 0); + } + if !close_chans_before_reload || close_only_a { // Make sure the B<->C channel is still alive and well by sending a payment over it. let mut reconnect_args = ReconnectArgs::new(&nodes[1], &nodes[2]); reconnect_args.pending_responding_commitment_signed.1 = true; - // The B<->C `ChannelMonitorUpdate` shouldn't be allowed to complete, which is the - // equivalent to the responding `commitment_signed` being a duplicate for node B, thus we - // need to set the `pending_responding_commitment_signed_dup` flag. - reconnect_args.pending_responding_commitment_signed_dup_monitor.1 = true; + if hold_post_reload_mon_update { + // When the A-B update is still InProgress, B-C monitor updates are blocked, + // so the responding commitment_signed is a duplicate that generates no update. + reconnect_args.pending_responding_commitment_signed_dup_monitor.1 = true; + } reconnect_args.pending_raa.1 = true; reconnect_nodes(reconnect_args); } - // Once the blocked `ChannelMonitorUpdate` *finally* completes, the pending - // `PaymentForwarded` event will finally be released. - let (_, ab_update_id) = nodes[1].chain_monitor.get_latest_mon_update_id(chan_id_ab); - nodes[1].chain_monitor.chain_monitor.force_channel_monitor_updated(chan_id_ab, ab_update_id); + if hold_post_reload_mon_update { + // When the persister returned InProgress, we need to manually complete the + // A-B monitor update to unblock the PaymentForwarded completion action. + let (_, ab_update_id) = nodes[1].chain_monitor.get_latest_mon_update_id(chan_id_ab); + nodes[1] + .chain_monitor + .chain_monitor + .force_channel_monitor_updated(chan_id_ab, ab_update_id); + } // If the A<->B channel was closed before we reload, we'll replay the claim against it on // reload, causing the `PaymentForwarded` event to get replayed. let evs = nodes[1].node.get_and_clear_pending_events(); - assert_eq!(evs.len(), if close_chans_before_reload { 2 } else { 1 }); - for ev in evs { - if let Event::PaymentForwarded { claim_from_onchain_tx, next_htlcs, .. } = ev { - if !claim_from_onchain_tx { - // If the outbound channel is still open, the `next_user_channel_id` should be available. - // This was previously broken. - assert!(next_htlcs[0].user_channel_id.is_some()) + if !close_chans_before_reload && !hold_post_reload_mon_update { + // PaymentForwarded already fired during get_and_clear_pending_events above. + assert!(evs.is_empty(), "{:?}", evs); + } else { + assert_eq!(evs.len(), if close_chans_before_reload { 2 } else { 1 }, "{:?}", evs); + for ev in evs { + if let Event::PaymentForwarded { claim_from_onchain_tx, next_htlcs, .. } = ev { + if !claim_from_onchain_tx { + assert!(next_htlcs[0].user_channel_id.is_some()) + } + } else { + panic!("Unexpected event: {:?}", ev); } - } else { - panic!(); } } if !close_chans_before_reload || close_only_a { - // Once we call `process_pending_events` the final `ChannelMonitor` for the B<->C channel - // will fly, removing the payment preimage from it. - check_added_monitors(&nodes[1], 1); + if hold_post_reload_mon_update { + // The B-C monitor update from the completion action fires now. + check_added_monitors(&nodes[1], 1); + } assert!(nodes[1].node.get_and_clear_pending_events().is_empty()); send_payment(&nodes[1], &[&nodes[2]], 100_000); } @@ -5423,17 +5457,16 @@ fn test_late_counterparty_commitment_update_after_holder_commitment_spend_dust() } #[test] -#[should_panic( - expected = "Watch::update_channel returned Completed while prior updates are still InProgress" -)] -fn test_monitor_update_fail_after_funding_spend() { - // When a counterparty commitment transaction confirms (funding spend), the - // ChannelMonitor sets funding_spend_seen. If a commitment_signed from the - // counterparty is then processed (a race between chain events and message - // processing), update_monitor returns Err because no_further_updates_allowed() - // is true. ChainMonitor overrides the result to InProgress, permanently - // freezing the channel. A subsequent preimage claim returning Completed then - // triggers the per-channel assertion. +fn test_monitor_update_after_funding_spend() { + // Test that monitor updates still work after a funding spend is detected by the + // ChainMonitor but before ChannelManager has processed the corresponding block. + // + // When the counterparty commitment transaction confirms (funding spend), the + // ChannelMonitor sets funding_spend_seen and no_further_updates_allowed() returns + // true. ChainMonitor overrides all subsequent update_channel results to InProgress + // to freeze the channel. These overridden updates complete via deferred completions + // in release_pending_monitor_events, so that MonitorUpdateCompletionActions (like + // PaymentClaimed) can still fire. let chanmon_cfgs = create_chanmon_cfgs(2); let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); @@ -5444,7 +5477,7 @@ fn test_monitor_update_fail_after_funding_spend() { let (_, _, chan_id, _) = create_announced_chan_between_nodes(&nodes, 0, 1); // Route payment 1 fully so B can claim it later. - let (payment_preimage_1, _payment_hash_1, ..) = + let (payment_preimage_1, payment_hash_1, ..) = route_payment(&nodes[0], &[&nodes[1]], 1_000_000); // Get A's commitment tx (this is the "counterparty" commitment from B's perspective). @@ -5453,10 +5486,14 @@ fn test_monitor_update_fail_after_funding_spend() { // Confirm A's commitment tx on B's chain_monitor ONLY (not on B's ChannelManager). // This sets funding_spend_seen in the monitor, making no_further_updates_allowed() true. + // We also update the best block on the chain_monitor so the broadcaster height is + // consistent when claiming HTLCs. let (block_hash, height) = nodes[1].best_block_info(); let block = create_dummy_block(block_hash, height + 1, vec![as_commitment_tx[0].clone()]); let txdata: Vec<_> = block.txdata.iter().enumerate().collect(); nodes[1].chain_monitor.chain_monitor.transactions_confirmed(&block.header, &txdata, height + 1); + nodes[1].chain_monitor.chain_monitor.best_block_updated(&block.header, height + 1); + nodes[1].blocks.lock().unwrap().push((block, height + 1)); // Send payment 2 from A to B. let (route, payment_hash_2, _, payment_secret_2) = @@ -5478,15 +5515,38 @@ fn test_monitor_update_fail_after_funding_spend() { nodes[1].node.handle_update_add_htlc(node_a_id, &payment_event.msgs[0]); - // B processes commitment_signed. The monitor's update_monitor succeeds on the - // update steps, but returns Err at the end because no_further_updates_allowed() - // is true (funding_spend_seen). ChainMonitor overrides the result to InProgress. + // B processes commitment_signed. The monitor applies the update but returns Err + // because no_further_updates_allowed() is true. ChainMonitor overrides to InProgress, + // freezing the channel. nodes[1].node.handle_commitment_signed(node_a_id, &payment_event.commitment_msg[0]); check_added_monitors(&nodes[1], 1); - assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty()); - // B claims payment 1. The PaymentPreimage monitor update returns Completed - // (update_monitor succeeds for preimage, and persister returns Completed), - // but the prior InProgress from the commitment_signed is still pending. + // B claims payment 1. The preimage monitor update also returns InProgress (deferred), + // so no Completed-while-InProgress assertion fires. nodes[1].node.claim_funds(payment_preimage_1); + check_added_monitors(&nodes[1], 1); + + // First event cycle: the force-close MonitorEvent (CommitmentTxConfirmed) fires first, + // then the deferred completions resolve. The force-close generates a ChannelForceClosed + // update (also deferred), which blocks completion actions. So we only get ChannelClosed. + let events = nodes[1].node.get_and_clear_pending_events(); + assert_eq!(events.len(), 1); + match &events[0] { + Event::ChannelClosed { reason: ClosureReason::CommitmentTxConfirmed, .. } => {}, + _ => panic!("Unexpected event: {:?}", events[0]), + } + check_added_monitors(&nodes[1], 1); + nodes[1].node.get_and_clear_pending_msg_events(); + + // Second event cycle: the ChannelForceClosed deferred completion resolves, unblocking + // the PaymentClaimed completion action. + let events = nodes[1].node.get_and_clear_pending_events(); + assert_eq!(events.len(), 1); + match &events[0] { + Event::PaymentClaimed { payment_hash, amount_msat, .. } => { + assert_eq!(payment_hash_1, *payment_hash); + assert_eq!(1_000_000, *amount_msat); + }, + _ => panic!("Unexpected event: {:?}", events[0]), + } } diff --git a/lightning/src/ln/channelmanager.rs b/lightning/src/ln/channelmanager.rs index 3ec174c58e1..3a6b8e28b07 100644 --- a/lightning/src/ln/channelmanager.rs +++ b/lightning/src/ln/channelmanager.rs @@ -10387,7 +10387,10 @@ This indicates a bug inside LDK. Please report this error at https://github.com/ #[cfg(not(test))] let skip_check = false; if !skip_check && update_completed && !in_flight_updates.is_empty() { - panic!("Watch::update_channel returned Completed while prior updates are still InProgress"); + panic!( + "Watch::update_channel returned Completed for channel {} while {} prior updates are still InProgress", + channel_id, in_flight_updates.len(), + ); } (update_completed, update_completed && in_flight_updates.is_empty()) } else {