From 715e6ee0dd447eb3f13e54a5894fbd6a46cd41a1 Mon Sep 17 00:00:00 2001 From: hiroTamada <88675973+hiroTamada@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:12:00 +0000 Subject: [PATCH 1/2] test: de-flake TestUpstreamManagerDetectsChromiumAndRestart Two flake modes: chromium's renderer/crashpad children outlive Process.Kill() on the parent and keep writing the user-data-dir while t.TempDir() cleanup deletes it (TempDir RemoveAll: directory not empty), and the 20s devtools-detection wait is too tight on loaded CI runners. Start chromium in its own process group and kill the whole group on cleanup, and raise the detection wait to 60s. Co-Authored-By: Claude Opus 4.7 --- server/lib/devtoolsproxy/proxy_test.go | 38 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/server/lib/devtoolsproxy/proxy_test.go b/server/lib/devtoolsproxy/proxy_test.go index 8d2d7130..3ed455c1 100644 --- a/server/lib/devtoolsproxy/proxy_test.go +++ b/server/lib/devtoolsproxy/proxy_test.go @@ -16,6 +16,7 @@ import ( "strings" "sync" "sync/atomic" + "syscall" "testing" "time" @@ -84,6 +85,17 @@ func waitForCondition(timeout time.Duration, cond func() bool) bool { return cond() } +// killProcessGroup kills cmd's entire process group and reaps the parent. +// Requires the command to have been started with Setpgid so the group ID +// equals the child's PID. +func killProcessGroup(cmd *exec.Cmd) { + if cmd == nil || cmd.Process == nil { + return + } + _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + _, _ = cmd.Process.Wait() +} + func TestWaitForInitialTimeoutWhenLogMissing(t *testing.T) { mgr := NewUpstreamManager("/tmp/not-a-real-file-hopefully", silentLogger()) ctx, cancel := context.WithCancel(context.Background()) @@ -294,6 +306,10 @@ func TestUpstreamManagerDetectsChromiumAndRestart(t *testing.T) { cmd = exec.Command(browser, chromiumArgs...) } + // Own process group so cleanup can kill the whole tree — killing + // only the parent leaves renderer/crashpad children writing the + // user-data-dir while t.TempDir() cleanup deletes it. + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} cmd.Stdout = logFile cmd.Stderr = logFile if err := cmd.Start(); err != nil { @@ -311,13 +327,13 @@ func TestUpstreamManagerDetectsChromiumAndRestart(t *testing.T) { if err != nil { t.Fatalf("start chromium 1: %v", err) } - defer func() { - _ = cmd1.Process.Kill() - _, _ = cmd1.Process.Wait() - }() + defer killProcessGroup(cmd1) - // Wait for initial upstream containing port1 - ok := waitForCondition(20*time.Second, func() bool { + // Wait for initial upstream containing port1. Generous timeout: CI + // runners can take tens of seconds to bring up chromium's devtools + // endpoint under load. + const upstreamDetectTimeout = 60 * time.Second + ok := waitForCondition(upstreamDetectTimeout, func() bool { u := mgr.Current() return strings.Contains(u, fmt.Sprintf(":%d/", port1)) }) @@ -338,20 +354,16 @@ func TestUpstreamManagerDetectsChromiumAndRestart(t *testing.T) { t.Fatalf("get free port 2: %v", err) } t.Logf("killing first chromium instance to restart on port %d", port2) - _ = cmd1.Process.Kill() - _, _ = cmd1.Process.Wait() + killProcessGroup(cmd1) cmd2, err := startChromium(port2) if err != nil { t.Fatalf("start chromium 2: %v", err) } - defer func() { - _ = cmd2.Process.Kill() - _, _ = cmd2.Process.Wait() - }() + defer killProcessGroup(cmd2) // Expect manager to update to new port - ok = waitForCondition(20*time.Second, func() bool { + ok = waitForCondition(upstreamDetectTimeout, func() bool { u := mgr.Current() return strings.Contains(u, fmt.Sprintf(":%d/", port2)) }) From 4566e1ce05ede939a6fcc4b1beea7190ba3dd51e Mon Sep 17 00:00:00 2001 From: hiroTamada <88675973+hiroTamada@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:50:01 +0000 Subject: [PATCH 2/2] test: make killProcessGroup idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deferred kill after an explicit kill+reap re-signalled the freed PGID, which could hit a recycled process group. Guard on ProcessState (set by Wait) so a reaped cmd is never signalled again — restoring the no-op-after-Wait semantics the previous os.Process.Kill provided. Co-Authored-By: Claude Opus 4.7 --- server/lib/devtoolsproxy/proxy_test.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/server/lib/devtoolsproxy/proxy_test.go b/server/lib/devtoolsproxy/proxy_test.go index 3ed455c1..1d8b2ad6 100644 --- a/server/lib/devtoolsproxy/proxy_test.go +++ b/server/lib/devtoolsproxy/proxy_test.go @@ -87,13 +87,15 @@ func waitForCondition(timeout time.Duration, cond func() bool) bool { // killProcessGroup kills cmd's entire process group and reaps the parent. // Requires the command to have been started with Setpgid so the group ID -// equals the child's PID. +// equals the child's PID. Idempotent: a reaped cmd (ProcessState set by +// Wait below) is never re-signalled, so a recycled PGID can't be hit by +// a second call — e.g. a defer following an explicit kill. func killProcessGroup(cmd *exec.Cmd) { - if cmd == nil || cmd.Process == nil { + if cmd == nil || cmd.Process == nil || cmd.ProcessState != nil { return } _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) - _, _ = cmd.Process.Wait() + _ = cmd.Wait() } func TestWaitForInitialTimeoutWhenLogMissing(t *testing.T) {