From 8e60a592075eefe415a6e5e9f36c672d99ce2adb Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Fri, 29 May 2026 17:47:53 +0200 Subject: [PATCH 1/6] debug process exit hang on MSVC --- .github/workflows/cpp_extra.yml | 51 +++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 73b06f9deec5..53878d3b6f11 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -441,7 +441,7 @@ jobs: - name: Install Dependencies run: | brew bundle --file=cpp/Brewfile - + # We want to use bundled RE2 for static linking. If # Homebrew's RE2 is installed, its header file may be used. # We uninstall Homebrew's RE2 to ensure using bundled RE2. @@ -547,7 +547,7 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: ON ARROW_BUILD_TYPE: release - # Turn Arrow CSV off to disable `find_package(Arrow)` check on MSVC CI. + # Turn Arrow CSV off to disable `find_package(Arrow)` check on MSVC CI. # GH-49050 TODO: enable `find_package(Arrow)` check on MSVC CI. ARROW_CSV: OFF ARROW_DEPENDENCY_SOURCE: VCPKG @@ -634,6 +634,51 @@ jobs: shell: cmd run: | call "cpp\src\arrow\flight\sql\odbc\tests\install_odbc.cmd" ${{ github.workspace }}\build\cpp\%ARROW_BUILD_TYPE%\arrow_flight_sql_odbc.dll + # Debug https://github.com/apache/arrow/issues/49465: + # arrow-flight-test passes all 94 tests in ~4s, then + # intermittently hangs on exit and ctest kills it at the 300s timeout. + # Run it in a loop and minidump the hung process to inspect its threads. + - name: Capture Flight test exit-hang dump (diagnostic) + shell: pwsh + continue-on-error: true + env: + ARROW_TEST_DATA: ${{ github.workspace }}\testing\data + run: | + $ErrorActionPreference = 'Continue' + $binDir = "${{ github.workspace }}\build\cpp\$env:ARROW_BUILD_TYPE" + $exe = Join-Path $binDir 'arrow-flight-test.exe' + $dumpDir = "${{ github.workspace }}\flight-hang-dumps" + New-Item -ItemType Directory -Force -Path $dumpDir | Out-Null + + # install ProcDump (Sysinternals) + Invoke-WebRequest -Uri https://download.sysinternals.com/files/Procdump.zip ` + -OutFile Procdump.zip + Expand-Archive -Path Procdump.zip -DestinationPath procdump -Force + $procdump = (Resolve-Path procdump\procdump64.exe).Path + + # intermittent across runs, so retry several times to repro + for ($i = 1; $i -le 10; $i++) { + Write-Host "=== arrow-flight-test attempt $i/10 ===" + $proc = Start-Process -FilePath $exe -PassThru -NoNewWindow ` + -WorkingDirectory $binDir ` + -RedirectStandardOutput (Join-Path $dumpDir "run$i.out.txt") ` + -RedirectStandardError (Join-Path $dumpDir "run$i.err.txt") + # normal run is ~5s. If still alive at 60s means the exit hang + if ($proc.WaitForExit(60000)) { continue } + & $procdump -accepteula -ma $proc.Id (Join-Path $dumpDir "flight-hang.dmp") + try { Stop-Process -Id $proc.Id -Force } catch {} + break + } + # include PDBs so Arrow frames in the dump symbolize + Get-ChildItem -Path $binDir -Filter '*flight*.pdb' -ErrorAction SilentlyContinue | + Copy-Item -Destination $dumpDir -ErrorAction SilentlyContinue + - name: Upload Flight test exit-hang dump (diagnostic) + if: always() + uses: actions/upload-artifact@v7 + with: + name: flight-hang-dumps + path: flight-hang-dumps + if-no-files-found: warn - name: Test shell: cmd run: | @@ -721,7 +766,7 @@ jobs: dev_msi_name=$(echo ${msi_name} | sed -e "s/win64\.msi$/dev-$(date +%Y-%m-%d)-win64.msi/") mv "${msi_name}" "${dev_msi_name}" cd .. - + tree odbc-installer - name: Checkout Arrow uses: actions/checkout@v6 From eeb2299822fa37127bb7bbb9bf65a59fb6ca4d11 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Fri, 29 May 2026 23:21:51 +0200 Subject: [PATCH 2/6] debug hung process during ctest --- .github/workflows/cpp_extra.yml | 61 +++++++++++------------------ ci/scripts/flight_hang_watchdog.ps1 | 51 ++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 38 deletions(-) create mode 100644 ci/scripts/flight_hang_watchdog.ps1 diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 53878d3b6f11..cc3ddfc89715 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -635,50 +635,28 @@ jobs: run: | call "cpp\src\arrow\flight\sql\odbc\tests\install_odbc.cmd" ${{ github.workspace }}\build\cpp\%ARROW_BUILD_TYPE%\arrow_flight_sql_odbc.dll # Debug https://github.com/apache/arrow/issues/49465: - # arrow-flight-test passes all 94 tests in ~4s, then - # intermittently hangs on exit and ctest kills it at the 300s timeout. - # Run it in a loop and minidump the hung process to inspect its threads. - - name: Capture Flight test exit-hang dump (diagnostic) + # arrow-flight-test passes all 94 tests in ~4s, then intermittently hangs + # on exit and ctest kills it at the 300s timeout. The hang only reproduces + # under the parallel ctest run (not standalone). So try a background + # watchdog to minidump the hung process during the Test step below. + - name: Start Flight hang watchdog (diagnostic) shell: pwsh continue-on-error: true - env: - ARROW_TEST_DATA: ${{ github.workspace }}\testing\data run: | - $ErrorActionPreference = 'Continue' - $binDir = "${{ github.workspace }}\build\cpp\$env:ARROW_BUILD_TYPE" - $exe = Join-Path $binDir 'arrow-flight-test.exe' $dumpDir = "${{ github.workspace }}\flight-hang-dumps" New-Item -ItemType Directory -Force -Path $dumpDir | Out-Null - - # install ProcDump (Sysinternals) - Invoke-WebRequest -Uri https://download.sysinternals.com/files/Procdump.zip ` - -OutFile Procdump.zip - Expand-Archive -Path Procdump.zip -DestinationPath procdump -Force - $procdump = (Resolve-Path procdump\procdump64.exe).Path - - # intermittent across runs, so retry several times to repro - for ($i = 1; $i -le 10; $i++) { - Write-Host "=== arrow-flight-test attempt $i/10 ===" - $proc = Start-Process -FilePath $exe -PassThru -NoNewWindow ` - -WorkingDirectory $binDir ` - -RedirectStandardOutput (Join-Path $dumpDir "run$i.out.txt") ` - -RedirectStandardError (Join-Path $dumpDir "run$i.err.txt") - # normal run is ~5s. If still alive at 60s means the exit hang - if ($proc.WaitForExit(60000)) { continue } - & $procdump -accepteula -ma $proc.Id (Join-Path $dumpDir "flight-hang.dmp") - try { Stop-Process -Id $proc.Id -Force } catch {} - break - } - # include PDBs so Arrow frames in the dump symbolize - Get-ChildItem -Path $binDir -Filter '*flight*.pdb' -ErrorAction SilentlyContinue | + Invoke-WebRequest -Uri https://download.sysinternals.com/files/Procdump.zip -OutFile Procdump.zip + Expand-Archive -Path Procdump.zip -DestinationPath "${{ github.workspace }}\procdump" -Force + $procdump = "${{ github.workspace }}\procdump\procdump64.exe" + # collect Arrow PDBs if present (release builds usually have none) + Get-ChildItem -Path "${{ github.workspace }}\build\cpp\$env:ARROW_BUILD_TYPE" -Filter '*.pdb' -ErrorAction SilentlyContinue | + Where-Object { $_.Name -match 'arrow|flight' } | Copy-Item -Destination $dumpDir -ErrorAction SilentlyContinue - - name: Upload Flight test exit-hang dump (diagnostic) - if: always() - uses: actions/upload-artifact@v7 - with: - name: flight-hang-dumps - path: flight-hang-dumps - if-no-files-found: warn + # launch detached so it keeps polling while the Test step runs + $wd = Start-Process pwsh -WindowStyle Hidden -PassThru -ArgumentList ` + '-NoProfile', '-File', "${{ github.workspace }}\ci\scripts\flight_hang_watchdog.ps1", ` + $dumpDir, $procdump + Write-Host "watchdog started, PID $($wd.Id)" - name: Test shell: cmd run: | @@ -688,6 +666,13 @@ jobs: # Convert VCPKG Windows path to MSYS path for /f "usebackq delims=" %%I in (`bash -c "cygpath -u \"$VCPKG_ROOT_KEEP\""` ) do set VCPKG_ROOT=%%I bash -c "ci/scripts/cpp_test.sh $(pwd) $(pwd)/build" + - name: Upload Flight hang dump (diagnostic) + if: always() + uses: actions/upload-artifact@v7 + with: + name: flight-hang-dumps + path: flight-hang-dumps + if-no-files-found: warn - name: Install WiX Toolset shell: pwsh run: | diff --git a/ci/scripts/flight_hang_watchdog.ps1 b/ci/scripts/flight_hang_watchdog.ps1 new file mode 100644 index 000000000000..966ddbcb3177 --- /dev/null +++ b/ci/scripts/flight_hang_watchdog.ps1 @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Diagnostic for GH-49465. arrow-flight-test passes all 94 tests in ~4s, then +# intermittently hangs on exit and ctest kills it at the 300s timeout. The hang +# only reproduces under the parallel ctest run, not when the binary is run +# standalone, so this watchdog runs in the background during the ctest step and +# minidumps any arrow-flight-test process that outlives the normal run (then +# kills it so ctest moves on), letting us inspect the stuck threads. + +param( + [Parameter(Mandatory = $true)][string]$DumpDir, + [Parameter(Mandatory = $true)][string]$ProcDump, + [int]$AgeSeconds = 90, + [int]$DeadlineMinutes = 30 +) + +$ErrorActionPreference = 'Continue' +$log = Join-Path $DumpDir 'watchdog.log' +$seen = @{} +$dumped = @{} +$deadline = (Get-Date).AddMinutes($DeadlineMinutes) +"$(Get-Date -Format o) watchdog started (dump arrow-flight-test alive > ${AgeSeconds}s)" | Add-Content $log + +while ((Get-Date) -lt $deadline) { + foreach ($p in @(Get-Process arrow-flight-test -ErrorAction SilentlyContinue)) { + if (-not $seen.ContainsKey($p.Id)) { $seen[$p.Id] = Get-Date } + $age = ((Get-Date) - $seen[$p.Id]).TotalSeconds + if ($age -gt $AgeSeconds -and -not $dumped.ContainsKey($p.Id)) { + "$(Get-Date -Format o) dumping PID $($p.Id) (age $([int]$age)s)" | Add-Content $log + & $ProcDump -accepteula -ma $p.Id (Join-Path $DumpDir "flight-hang-$($p.Id).dmp") *>> $log + $dumped[$p.Id] = $true + try { Stop-Process -Id $p.Id -Force } catch {} + } + } + Start-Sleep -Seconds 5 +} From 76abb0a9c1a58c6f7364ae20ec579ed41dceb922 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Sat, 30 May 2026 22:17:20 +0200 Subject: [PATCH 3/6] test GPR_DISABLE_ABSEIL_SYNC for flight exit hang --- .github/workflows/cpp_extra.yml | 4 ++++ .../x64-windows.cmake | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index cc3ddfc89715..58f20b5dab88 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -554,10 +554,14 @@ jobs: ARROW_FLIGHT_SQL_ODBC: ON ARROW_FLIGHT_SQL_ODBC_INSTALLER: ON ARROW_HOME: /usr + # GH-49465 diagnostic: match the overlay triplet's macro so grpcpp's Mutex ABI agrees + ARROW_CXXFLAGS: /DGPR_DISABLE_ABSEIL_SYNC CMAKE_GENERATOR: Ninja CMAKE_INSTALL_PREFIX: /usr VCPKG_BINARY_SOURCES: 'clear;nugettimeout,600;nuget,GitHub,readwrite' VCPKG_DEFAULT_TRIPLET: x64-windows + # GH-49465 diagnostic: shadow x64-windows triplet to rebuild gRPC without absl sync + VCPKG_OVERLAY_TRIPLETS: ${{ github.workspace }}/ci/vcpkg/overlay-triplets-no-absl-sync steps: - name: Disable Crash Dialogs run: | diff --git a/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake b/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake new file mode 100644 index 000000000000..13e85a048e0e --- /dev/null +++ b/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# GH-49465 diagnostic: built-in x64-windows triplet plus GPR_DISABLE_ABSEIL_SYNC +# to rebuild gRPC without absl::Mutex and test if the Windows exit hang goes away. +set(VCPKG_TARGET_ARCHITECTURE x64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE dynamic) +set(VCPKG_C_FLAGS "/DGPR_DISABLE_ABSEIL_SYNC") +set(VCPKG_CXX_FLAGS "/DGPR_DISABLE_ABSEIL_SYNC") From f2587ae6d58fff0ad9fa5fce9e138d5f4977337b Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Sun, 31 May 2026 08:31:52 +0200 Subject: [PATCH 4/6] fix ARROW_CXXFLAGS slash for bash --- .github/workflows/cpp_extra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 58f20b5dab88..0d8e51734d6d 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -555,7 +555,7 @@ jobs: ARROW_FLIGHT_SQL_ODBC_INSTALLER: ON ARROW_HOME: /usr # GH-49465 diagnostic: match the overlay triplet's macro so grpcpp's Mutex ABI agrees - ARROW_CXXFLAGS: /DGPR_DISABLE_ABSEIL_SYNC + ARROW_CXXFLAGS: -DGPR_DISABLE_ABSEIL_SYNC CMAKE_GENERATOR: Ninja CMAKE_INSTALL_PREFIX: /usr VCPKG_BINARY_SOURCES: 'clear;nugettimeout,600;nuget,GitHub,readwrite' From 7aecc6eb83aacefd49e7d5aa4590498bf69d5c10 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Wed, 3 Jun 2026 00:17:10 +0200 Subject: [PATCH 5/6] confirm fix by ctest repeats --- .github/workflows/cpp_extra.yml | 13 +++++++++---- ci/scripts/cpp_test.sh | 2 +- ...windows.cmake => x64-windows-no-absl-sync.cmake} | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) rename ci/vcpkg/overlay-triplets-no-absl-sync/{x64-windows.cmake => x64-windows-no-absl-sync.cmake} (85%) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 0d8e51734d6d..8631151ca795 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -554,13 +554,17 @@ jobs: ARROW_FLIGHT_SQL_ODBC: ON ARROW_FLIGHT_SQL_ODBC_INSTALLER: ON ARROW_HOME: /usr + # GH-49465 diagnostic: re-run the whole suite, failing if any run hangs - + # as the hang only reproduces under parallel load, not with the flight test alone. + ARROW_CTEST_REPEAT: until-fail:10 # GH-49465 diagnostic: match the overlay triplet's macro so grpcpp's Mutex ABI agrees ARROW_CXXFLAGS: -DGPR_DISABLE_ABSEIL_SYNC CMAKE_GENERATOR: Ninja CMAKE_INSTALL_PREFIX: /usr VCPKG_BINARY_SOURCES: 'clear;nugettimeout,600;nuget,GitHub,readwrite' - VCPKG_DEFAULT_TRIPLET: x64-windows - # GH-49465 diagnostic: shadow x64-windows triplet to rebuild gRPC without absl sync + # GH-49465 diagnostic: custom triplet (x64-windows + GPR_DISABLE_ABSEIL_SYNC) so gRPC + # is rebuilt without absl sync. See ci/vcpkg/overlay-triplets-no-absl-sync + VCPKG_DEFAULT_TRIPLET: x64-windows-no-absl-sync VCPKG_OVERLAY_TRIPLETS: ${{ github.workspace }}/ci/vcpkg/overlay-triplets-no-absl-sync steps: - name: Disable Crash Dialogs @@ -656,10 +660,11 @@ jobs: Get-ChildItem -Path "${{ github.workspace }}\build\cpp\$env:ARROW_BUILD_TYPE" -Filter '*.pdb' -ErrorAction SilentlyContinue | Where-Object { $_.Name -match 'arrow|flight' } | Copy-Item -Destination $dumpDir -ErrorAction SilentlyContinue - # launch detached so it keeps polling while the Test step runs + # launch detached so it keeps polling while the Test step runs. Deadline is + # raised to 200 min because ARROW_CTEST_REPEAT re-runs the suite many times. $wd = Start-Process pwsh -WindowStyle Hidden -PassThru -ArgumentList ` '-NoProfile', '-File', "${{ github.workspace }}\ci\scripts\flight_hang_watchdog.ps1", ` - $dumpDir, $procdump + $dumpDir, $procdump, '90', '200' Write-Host "watchdog started, PID $($wd.Id)" - name: Test shell: cmd diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 2f88cdc819b2..f089a2eafe76 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -117,7 +117,7 @@ else --label-regex unittest \ --output-on-failure \ --parallel "${n_jobs}" \ - --repeat until-pass:3 \ + --repeat "${ARROW_CTEST_REPEAT:-until-pass:3}" \ --timeout "${ARROW_CTEST_TIMEOUT:-300}" \ "${ctest_options[@]}" \ "$@" diff --git a/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake b/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake similarity index 85% rename from ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake rename to ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake index 13e85a048e0e..87cb12185566 100644 --- a/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows.cmake +++ b/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -# GH-49465 diagnostic: built-in x64-windows triplet plus GPR_DISABLE_ABSEIL_SYNC -# to rebuild gRPC without absl::Mutex and test if the Windows exit hang goes away. +# GH-49465 diagnostic: x64-windows settings plus GPR_DISABLE_ABSEIL_SYNC, so gRPC +# is rebuilt without absl::Mutex to test if the Windows exit hang goes away. set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE dynamic) From c20daf741839bced71cd9e44525bee885347a4c2 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Wed, 3 Jun 2026 10:01:33 +0200 Subject: [PATCH 6/6] remove diagnostics and keep fix --- .github/workflows/cpp_extra.yml | 50 ++++-------------- ci/scripts/cpp_test.sh | 2 +- ci/scripts/flight_hang_watchdog.ps1 | 51 ------------------- .../x64-windows-no-absl-sync.cmake | 4 +- 4 files changed, 13 insertions(+), 94 deletions(-) delete mode 100644 ci/scripts/flight_hang_watchdog.ps1 diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 8631151ca795..5647f70b6d39 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -441,7 +441,7 @@ jobs: - name: Install Dependencies run: | brew bundle --file=cpp/Brewfile - + # We want to use bundled RE2 for static linking. If # Homebrew's RE2 is installed, its header file may be used. # We uninstall Homebrew's RE2 to ensure using bundled RE2. @@ -547,23 +547,24 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: ON ARROW_BUILD_TYPE: release - # Turn Arrow CSV off to disable `find_package(Arrow)` check on MSVC CI. + # Turn Arrow CSV off to disable `find_package(Arrow)` check on MSVC CI. # GH-49050 TODO: enable `find_package(Arrow)` check on MSVC CI. ARROW_CSV: OFF ARROW_DEPENDENCY_SOURCE: VCPKG ARROW_FLIGHT_SQL_ODBC: ON ARROW_FLIGHT_SQL_ODBC_INSTALLER: ON ARROW_HOME: /usr - # GH-49465 diagnostic: re-run the whole suite, failing if any run hangs - - # as the hang only reproduces under parallel load, not with the flight test alone. - ARROW_CTEST_REPEAT: until-fail:10 - # GH-49465 diagnostic: match the overlay triplet's macro so grpcpp's Mutex ABI agrees + # GH-49465: work around the gRPC/Abseil exit hang on Windows + # https://github.com/grpc/grpc/issues/39321 + # https://github.com/abseil/abseil-cpp/issues/1877 + # Build Arrow with GPR_DISABLE_ABSEIL_SYNC so grpcpp's Mutex ABI + # matches the gRPC rebuilt by the overlay triplet. Remove once fixed upstream. ARROW_CXXFLAGS: -DGPR_DISABLE_ABSEIL_SYNC CMAKE_GENERATOR: Ninja CMAKE_INSTALL_PREFIX: /usr VCPKG_BINARY_SOURCES: 'clear;nugettimeout,600;nuget,GitHub,readwrite' - # GH-49465 diagnostic: custom triplet (x64-windows + GPR_DISABLE_ABSEIL_SYNC) so gRPC - # is rebuilt without absl sync. See ci/vcpkg/overlay-triplets-no-absl-sync + # GH-49465: custom triplet that rebuilds gRPC with GPR_DISABLE_ABSEIL_SYNC + # (native sync instead of absl::Mutex). See ci/vcpkg/overlay-triplets-no-absl-sync. VCPKG_DEFAULT_TRIPLET: x64-windows-no-absl-sync VCPKG_OVERLAY_TRIPLETS: ${{ github.workspace }}/ci/vcpkg/overlay-triplets-no-absl-sync steps: @@ -642,30 +643,6 @@ jobs: shell: cmd run: | call "cpp\src\arrow\flight\sql\odbc\tests\install_odbc.cmd" ${{ github.workspace }}\build\cpp\%ARROW_BUILD_TYPE%\arrow_flight_sql_odbc.dll - # Debug https://github.com/apache/arrow/issues/49465: - # arrow-flight-test passes all 94 tests in ~4s, then intermittently hangs - # on exit and ctest kills it at the 300s timeout. The hang only reproduces - # under the parallel ctest run (not standalone). So try a background - # watchdog to minidump the hung process during the Test step below. - - name: Start Flight hang watchdog (diagnostic) - shell: pwsh - continue-on-error: true - run: | - $dumpDir = "${{ github.workspace }}\flight-hang-dumps" - New-Item -ItemType Directory -Force -Path $dumpDir | Out-Null - Invoke-WebRequest -Uri https://download.sysinternals.com/files/Procdump.zip -OutFile Procdump.zip - Expand-Archive -Path Procdump.zip -DestinationPath "${{ github.workspace }}\procdump" -Force - $procdump = "${{ github.workspace }}\procdump\procdump64.exe" - # collect Arrow PDBs if present (release builds usually have none) - Get-ChildItem -Path "${{ github.workspace }}\build\cpp\$env:ARROW_BUILD_TYPE" -Filter '*.pdb' -ErrorAction SilentlyContinue | - Where-Object { $_.Name -match 'arrow|flight' } | - Copy-Item -Destination $dumpDir -ErrorAction SilentlyContinue - # launch detached so it keeps polling while the Test step runs. Deadline is - # raised to 200 min because ARROW_CTEST_REPEAT re-runs the suite many times. - $wd = Start-Process pwsh -WindowStyle Hidden -PassThru -ArgumentList ` - '-NoProfile', '-File', "${{ github.workspace }}\ci\scripts\flight_hang_watchdog.ps1", ` - $dumpDir, $procdump, '90', '200' - Write-Host "watchdog started, PID $($wd.Id)" - name: Test shell: cmd run: | @@ -675,13 +652,6 @@ jobs: # Convert VCPKG Windows path to MSYS path for /f "usebackq delims=" %%I in (`bash -c "cygpath -u \"$VCPKG_ROOT_KEEP\""` ) do set VCPKG_ROOT=%%I bash -c "ci/scripts/cpp_test.sh $(pwd) $(pwd)/build" - - name: Upload Flight hang dump (diagnostic) - if: always() - uses: actions/upload-artifact@v7 - with: - name: flight-hang-dumps - path: flight-hang-dumps - if-no-files-found: warn - name: Install WiX Toolset shell: pwsh run: | @@ -760,7 +730,7 @@ jobs: dev_msi_name=$(echo ${msi_name} | sed -e "s/win64\.msi$/dev-$(date +%Y-%m-%d)-win64.msi/") mv "${msi_name}" "${dev_msi_name}" cd .. - + tree odbc-installer - name: Checkout Arrow uses: actions/checkout@v6 diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index f089a2eafe76..2f88cdc819b2 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -117,7 +117,7 @@ else --label-regex unittest \ --output-on-failure \ --parallel "${n_jobs}" \ - --repeat "${ARROW_CTEST_REPEAT:-until-pass:3}" \ + --repeat until-pass:3 \ --timeout "${ARROW_CTEST_TIMEOUT:-300}" \ "${ctest_options[@]}" \ "$@" diff --git a/ci/scripts/flight_hang_watchdog.ps1 b/ci/scripts/flight_hang_watchdog.ps1 deleted file mode 100644 index 966ddbcb3177..000000000000 --- a/ci/scripts/flight_hang_watchdog.ps1 +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Diagnostic for GH-49465. arrow-flight-test passes all 94 tests in ~4s, then -# intermittently hangs on exit and ctest kills it at the 300s timeout. The hang -# only reproduces under the parallel ctest run, not when the binary is run -# standalone, so this watchdog runs in the background during the ctest step and -# minidumps any arrow-flight-test process that outlives the normal run (then -# kills it so ctest moves on), letting us inspect the stuck threads. - -param( - [Parameter(Mandatory = $true)][string]$DumpDir, - [Parameter(Mandatory = $true)][string]$ProcDump, - [int]$AgeSeconds = 90, - [int]$DeadlineMinutes = 30 -) - -$ErrorActionPreference = 'Continue' -$log = Join-Path $DumpDir 'watchdog.log' -$seen = @{} -$dumped = @{} -$deadline = (Get-Date).AddMinutes($DeadlineMinutes) -"$(Get-Date -Format o) watchdog started (dump arrow-flight-test alive > ${AgeSeconds}s)" | Add-Content $log - -while ((Get-Date) -lt $deadline) { - foreach ($p in @(Get-Process arrow-flight-test -ErrorAction SilentlyContinue)) { - if (-not $seen.ContainsKey($p.Id)) { $seen[$p.Id] = Get-Date } - $age = ((Get-Date) - $seen[$p.Id]).TotalSeconds - if ($age -gt $AgeSeconds -and -not $dumped.ContainsKey($p.Id)) { - "$(Get-Date -Format o) dumping PID $($p.Id) (age $([int]$age)s)" | Add-Content $log - & $ProcDump -accepteula -ma $p.Id (Join-Path $DumpDir "flight-hang-$($p.Id).dmp") *>> $log - $dumped[$p.Id] = $true - try { Stop-Process -Id $p.Id -Force } catch {} - } - } - Start-Sleep -Seconds 5 -} diff --git a/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake b/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake index 87cb12185566..129977b05d2a 100644 --- a/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake +++ b/ci/vcpkg/overlay-triplets-no-absl-sync/x64-windows-no-absl-sync.cmake @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -# GH-49465 diagnostic: x64-windows settings plus GPR_DISABLE_ABSEIL_SYNC, so gRPC -# is rebuilt without absl::Mutex to test if the Windows exit hang goes away. +# GH-49465: rebuild gRPC with native sync instead of absl::Mutex to avoid the +# Windows exit hang. See the ODBC Windows job in cpp_extra.yml set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE dynamic)