From 60e07abac346b28a2495a363f65c0c6706c99ed7 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 10 Jun 2026 11:53:42 -0400 Subject: [PATCH] Fix flaky TestUpstreamManagerDetectsChromiumAndRestart (raise startup timeout) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestUpstreamManagerDetectsChromiumAndRestart launches a real headless Chromium and waits for UpstreamManager to scrape the "DevTools listening on ws://..." line. The wait was capped at 20s per browser launch. Root cause: Chromium's cold-start latency has a long tail on GitHub's shared ubuntu-latest runners. Across recent main runs this exact launch printed the DevTools line in ~6s on a warm runner but took 10-17s on a contended one, and occasionally exceeded 20s — failing at exactly ~20.15s, i.e. the wait timed out, not because the line was missing. The dbus errors in the failure log ("Failed to connect to the bus") are benign noise unrelated to CDP startup (reproduced: Chromium prints the DevTools line even with a broken DBUS_SESSION_BUS_ADDRESS). It is a pre-existing flake on main, independent of any product change. Evidence (recent main "Test for the server/ directory" runs): PASS 6.4s / 6.6s / 6.7s / 6.8s (warm) ... PASS 10.1s / 14.3s / 15.4s / 17.0s (slow) ... FAIL 20.14s / 20.15s / 20.16s (timeout). Reproduced locally with the runner's exact Chromium 148.0.7778.0 snapshot build: the line is printed reliably (~0.5s warm); the failures are latency, not a missing line. Fix: - Raise the per-launch detection wait to 60s (named browserDetectTimeout with rationale) so the slow cold-start tail clears while still failing fast if the browser truly never comes up. This is the actual flake fix. - findBrowserBinary now honors CHROMIUM_BIN/CHROME_BIN and prefers the deterministic chrome/google-chrome binaries; server-test.yaml pins CHROMIUM_BIN to setup-chrome's chrome-path. This is hygiene (deterministic browser across runner-image updates), not the root cause. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/server-test.yaml | 6 ++++ server/lib/devtoolsproxy/proxy_test.go | 41 ++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/.github/workflows/server-test.yaml b/.github/workflows/server-test.yaml index f65ae214..374b9f42 100644 --- a/.github/workflows/server-test.yaml +++ b/.github/workflows/server-test.yaml @@ -27,6 +27,7 @@ jobs: uses: actions/checkout@v4 - name: Setup Chrome + id: setup-chrome uses: browser-actions/setup-chrome@v2 - name: Set up Node.js @@ -62,3 +63,8 @@ jobs: env: E2E_CHROMIUM_HEADFUL_IMAGE: onkernel/chromium-headful:${{ steps.vars.outputs.short_sha }} E2E_CHROMIUM_HEADLESS_IMAGE: onkernel/chromium-headless:${{ steps.vars.outputs.short_sha }} + # Pin the browser used by devtoolsproxy's UpstreamManager test to the + # Chrome for Testing installed above (rather than the runner image's + # /usr/bin/chromium snapshot build) so the browser under test is + # deterministic across runner-image updates. + CHROMIUM_BIN: ${{ steps.setup-chrome.outputs.chrome-path }} diff --git a/server/lib/devtoolsproxy/proxy_test.go b/server/lib/devtoolsproxy/proxy_test.go index 8d2d7130..5e9dabb0 100644 --- a/server/lib/devtoolsproxy/proxy_test.go +++ b/server/lib/devtoolsproxy/proxy_test.go @@ -55,7 +55,32 @@ func newTestLogWriter(t *testing.T) *testLogWriter { } func findBrowserBinary() (string, error) { - candidates := []string{"chromium", "chromium-browser", "google-chrome", "google-chrome-stable"} + // Allow CI/developers to pin an exact browser binary. This is important + // because the UpstreamManager detects the CDP endpoint solely by scraping + // the "DevTools listening on ws://..." line from the browser's stderr, and + // not every Chromium build emits that line reliably (see candidate ordering + // below). + for _, env := range []string{"CHROMIUM_BIN", "CHROME_BIN"} { + if v := os.Getenv(env); v != "" { + if p, err := exec.LookPath(v); err == nil { + return p, nil + } + // Treat a non-PATH value as an explicit path. + if _, err := os.Stat(v); err == nil { + return v, nil + } + } + } + + // Prefer the deterministic Chrome/Chrome-for-Testing binaries over a bare + // "chromium". On GitHub's ubuntu-latest runner /usr/bin/chromium is a + // symlink to an open-source Chromium *snapshot* build (installed by + // runner-images from chromium-browser-snapshots); the CI workflow separately + // installs a pinned Chrome for Testing via browser-actions/setup-chrome, + // exposed on PATH as "chrome". Preferring "chrome"/"google-chrome" keeps the + // browser under test deterministic across runner-image updates. (This is + // hygiene, not the flake fix — see browserDetectTimeout below.) + candidates := []string{"chrome", "google-chrome", "google-chrome-stable", "chromium", "chromium-browser"} for _, name := range candidates { if p, err := exec.LookPath(name); err == nil { return p, nil @@ -234,6 +259,16 @@ func TestDialUpstreamWithRetry_RechecksCurrentAfterMissedUpdate(t *testing.T) { } } +// browserDetectTimeout bounds how long the test waits for the UpstreamManager +// to scrape the "DevTools listening on ws://..." line from a freshly launched +// browser. Chromium's cold-start time has a long tail on shared CI runners: +// across recent CI runs this same launch printed the line in ~6s on a warm +// runner but took 15-17s on a slow/contended one, and occasionally exceeded the +// previous 20s budget (failing at exactly ~20.15s — the timeout, not a missing +// line). 60s gives ample headroom for the slow tail while still failing fast if +// the browser truly never comes up. +const browserDetectTimeout = 60 * time.Second + func TestUpstreamManagerDetectsChromiumAndRestart(t *testing.T) { browser, err := findBrowserBinary() if err != nil { @@ -317,7 +352,7 @@ func TestUpstreamManagerDetectsChromiumAndRestart(t *testing.T) { }() // Wait for initial upstream containing port1 - ok := waitForCondition(20*time.Second, func() bool { + ok := waitForCondition(browserDetectTimeout, func() bool { u := mgr.Current() return strings.Contains(u, fmt.Sprintf(":%d/", port1)) }) @@ -351,7 +386,7 @@ func TestUpstreamManagerDetectsChromiumAndRestart(t *testing.T) { }() // Expect manager to update to new port - ok = waitForCondition(20*time.Second, func() bool { + ok = waitForCondition(browserDetectTimeout, func() bool { u := mgr.Current() return strings.Contains(u, fmt.Sprintf(":%d/", port2)) })