From ff567e217434d9005b7a43d5c701e16da17a7f6c Mon Sep 17 00:00:00 2001 From: VarunChandola Date: Wed, 4 Mar 2026 14:39:01 +1100 Subject: [PATCH] Prevent permanent worker deadlock when cutover times out waiting for binlog sentinel Buffer allEventsUpToLockProcessed with capacity MaxRetries() so the applier's send always completes immediately even after waitForEventsUpToLock has timed out and exited. --- go/logic/migrator.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 6c915ea62..5267ed8f3 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -111,7 +111,15 @@ func NewMigrator(context *base.MigrationContext, appVersion string) *Migrator { ghostTableMigrated: make(chan bool), firstThrottlingCollected: make(chan bool, 3), rowCopyComplete: make(chan error), - allEventsUpToLockProcessed: make(chan *lockProcessedStruct), + // Buffered with capacity MaxRetries() to prevent a deadlock when waitForEventsUpToLock times + // out. The sentinel applyEventFunc may still be queued in applyEventsQueue when the + // timeout fires; when the worker eventually executes it, it sends on this channel + // with no active receiver. An unbuffered channel would block the worker permanently: + // the queue fills, the listener goroutine stalls, heartbeat lag grows without bound, + // and no further cutover attempts are made. With a buffer sized to the retry limit + // the send always completes immediately. Stale sentinels accumulate in the buffer + // and are discarded by the stale-skip loop in waitForEventsUpToLock. + allEventsUpToLockProcessed: make(chan *lockProcessedStruct, context.MaxRetries()), copyRowsQueue: make(chan tableWriteFunc), applyEventsQueue: make(chan *applyEventStruct, base.MaxEventsBatchSize),