From 55641e798a3d0556b39e30ca63906b81dbb18024 Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Thu, 9 Apr 2026 18:35:07 +0200 Subject: [PATCH] Fix race condition in ProcessesSync causing SSH key mismatch The sync could update an LRP with stale SSH route data from a previous version when the process version changed between the initial scan and the batch execution. Timeline of the race condition: Sync User Request ---- ------------ | | | fetch diego LRPs | | (gets {guid}-{vA} with SSH key A) | | | | scan CC processes | | process has version A, matches | | to_update[id] = diego_lrp_A | | | | | update process | | version changes to B | | | | desire_app called | | creates {guid}-{vB} | | with new SSH key B | | | re-fetch process by id | | (now has version B) | | | | update_app(process_vB, diego_lrp_A) | | process_guid = {guid}-{vB} | | but uses SSH route from diego_lrp_A! | | | v v Result: LRP {guid}-{vB} has SSH key A in routes but SSH key B in run_info (sshd arguments), breaking cf ssh. Fix: Before executing desire/update, verify the process version still matches what was recorded during the initial scan. Skip if mismatched - the next sync cycle will handle it correctly. Co-authored-by: Jochen Ehret --- lib/cloud_controller/diego/processes_sync.rb | 14 ++++++--- spec/isolated_specs/processes_sync_spec.rb | 32 ++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/lib/cloud_controller/diego/processes_sync.rb b/lib/cloud_controller/diego/processes_sync.rb index a3b00f34cef..eadc8d20b00 100644 --- a/lib/cloud_controller/diego/processes_sync.rb +++ b/lib/cloud_controller/diego/processes_sync.rb @@ -21,7 +21,7 @@ def sync @bump_freshness = true diego_lrps = bbs_apps_client.fetch_scheduling_infos.index_by { |d| d.desired_lrp_key.process_guid } logger.info('fetched-scheduling-infos') - to_desire = [] + to_desire = {} to_update = {} batched_processes_for_sync do |processes| processes.each do |process| @@ -29,7 +29,7 @@ def sync diego_lrp = diego_lrps.delete(process_guid) if diego_lrp.nil? - to_desire.append(process.id) + to_desire[process.id] = process_guid elsif process.updated_at.to_f.to_s != diego_lrp.annotation to_update[process.id] = diego_lrp end @@ -87,7 +87,10 @@ def process_workpool_exceptions(exceptions) def update_lrps(to_update) batched_processes(to_update.keys) do |processes| processes.each do |process| - workpool.submit(process, to_update[process.id]) do |p, l| + existing_lrp = to_update[process.id] + next if ProcessGuid.from_process(process) != existing_lrp.desired_lrp_key.process_guid + + workpool.submit(process, existing_lrp) do |p, l| logger.info('updating-lrp', process_guid: p.guid, app_guid: p.app_guid) bbs_apps_client.update_app(p, l) logger.info('update-lrp', process_guid: p.guid) @@ -97,8 +100,11 @@ def update_lrps(to_update) end def desire_lrps(to_desire) - batched_processes(to_desire) do |processes| + batched_processes(to_desire.keys) do |processes| processes.each do |process| + desired_process_guid = to_desire[process.id] + next if ProcessGuid.from_process(process) != desired_process_guid + workpool.submit(process) do |p| logger.info('desiring-lrp', process_guid: p.guid, app_guid: p.app_guid) bbs_apps_client.desire_app(p) diff --git a/spec/isolated_specs/processes_sync_spec.rb b/spec/isolated_specs/processes_sync_spec.rb index b47f5ed78e8..87c2e16fd50 100644 --- a/spec/isolated_specs/processes_sync_spec.rb +++ b/spec/isolated_specs/processes_sync_spec.rb @@ -121,6 +121,22 @@ module Diego expect(bbs_apps_client).to have_received(:bump_freshness).once end + context 'when the stale process gets a new version between fetching for sync (only guid + version + created_at) and fetching for actual update (full model)' do + before do + allow(subject).to receive(:update_lrps).and_wrap_original do |method, *args| + stale_process.set_new_version + stale_process.save + method.call(*args) + end + end + + it 'skips the (obsolete) update' do + allow(bbs_apps_client).to receive(:update_app) + subject.sync + expect(bbs_apps_client).not_to have_received(:update_app) + end + end + context 'when the process is a cnb app' do let(:app) { AppModel.make(:cnb, droplet: DropletModel.make(:cnb)) } let!(:stale_process) { ProcessModel.make(:cnb, state: 'STARTED', app: app) } @@ -253,6 +269,22 @@ module Diego expect(bbs_apps_client).to have_received(:bump_freshness).once end + context 'when the missing process gets a new version between fetching for sync (only guid + version + created_at) and fetching for actual creation (full model)' do + before do + allow(subject).to receive(:desire_lrps).and_wrap_original do |method, *args| + missing_process.set_new_version + missing_process.save + method.call(*args) + end + end + + it 'skips the (obsolete) creation' do + allow(bbs_apps_client).to receive(:desire_app) + subject.sync + expect(bbs_apps_client).not_to have_received(:desire_app) + end + end + context 'when desiring app fails' do # bbs_apps_client will raise ApiErrors as of right now, we should think about factoring that out so that # the background job doesn't have to deal with API concerns