From c21d8822f5ecbee350433c08e6b4f8deb099afc5 Mon Sep 17 00:00:00 2001 From: Max042004 Date: Sat, 6 Jun 2026 12:11:34 +0800 Subject: [PATCH 1/2] Fix cross-process signal delivery to EL0-preempted guests test-fork's phase-2 signal child spins in `while (!got_usr1) usleep()` waiting for a SIGUSR1 sent cross-process by its parent. The signal was delivered only ~35% of the time and lost the rest, so `make check` hung at test-fork until the 60s per-test timeout -- often longer, as leaked `elfuse --fork-child` orphans kept the driver's stdout pipe open. Two complementary defects: 1. vcpu_run_loop treated HV_EXIT_REASON_UNKNOWN as fatal. A host SIGUSR2 (the cross-process guest-signal transport) that interrupts hv_vcpu_run mid-execution aborts the run with UNKNOWN rather than the clean CANCELED that hv_vcpus_exit() produces for a vCPU caught between runs. Route UNKNOWN through the same cancellation handling so the already-queued guest signal is delivered instead of crashing the child. 2. signal_deliver redirected to the handler only via ELR_EL1, which takes effect solely on an ERET from EL1 (the syscall-return path, gated by the shim's X8==2 exec_drop_frame marker). When the signal is delivered from the cancellation branch -- i.e. the vCPU was preempted while running EL0 code (cross-process SIGUSR2, or SIGALRM in a tight loop) -- there is no pending ERET, the resume uses HV_REG_PC, and the ELR_EL1 write is a no-op: the handler never runs and only the X0=signum clobber lands, re-running the interrupted nanosleep with a bogus arg and spinning forever. Detect EL0 preemption from the live PSTATE (CPSR M[3:0]==0), save the interrupted PC from HV_REG_PC instead of the stale ELR_EL1, and redirect HV_REG_PC/CPSR directly; skip the X8==2 marker since there is no shim frame to drop. test-fork now passes 20/20 (was ~7/20); `make check` is green with no hang. --- src/syscall/proc.c | 13 ++++++++++++- src/syscall/signal.c | 42 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/src/syscall/proc.c b/src/syscall/proc.c index 647bab8..f79720f 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -1941,11 +1941,22 @@ int vcpu_run_loop(hv_vcpu_t vcpu, exit_code = 128; running = false; } - } else if (vexit->reason == HV_EXIT_REASON_CANCELED) { + } else if (vexit->reason == HV_EXIT_REASON_CANCELED || + vexit->reason == HV_EXIT_REASON_UNKNOWN) { /* Canceled by hv_vcpus_exit(). Can be: alarm timeout, * exit_group from another thread, or signal preemption * (signal_queue called hv_vcpus_exit to deliver a signal * while the guest was in a tight loop). + * + * HV_EXIT_REASON_UNKNOWN is the same event seen from the other + * side of a race: when a host signal (e.g. the SIGUSR2 used by the + * cross-process guest-signal transport) is delivered to this thread + * while it is actively executing guest code inside hv_vcpu_run, the + * run aborts with UNKNOWN instead of the clean CANCELED that + * hv_vcpus_exit() produces for a vCPU caught between runs. The + * pending guest signal has already been drained and queued, so it + * is fully deliverable -- fall through to the same handling and + * resume rather than treating it as a fatal unexpected exit. */ if (is_main && g_timed_out) { /* Timeout already handled above the exception switch -- diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 5e4c820..5a25194 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -1386,14 +1386,35 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) /* Deliver to user handler: build rt_sigframe on guest stack */ - /* 1. Save current vCPU state */ + /* 1. Save current vCPU state. + * + * ELR_EL1/SPSR_EL1 hold the interrupted EL0 return state only while the + * guest is unwinding a syscall (it is at EL1 in the shim, about to ERET). + * When the vCPU was preempted while executing EL0 code -- a tight compute + * loop interrupted by SIGALRM, or the cross-process guest-signal transport + * (SIGUSR2) firing mid-execution -- the live interrupted state is in + * HV_REG_PC / HV_REG_CPSR and ELR_EL1 is stale from the previous syscall. + * Redirecting via ELR_EL1 alone is then a no-op because the resume uses + * HV_REG_PC, so the handler never runs and the X0..X2 writes below clobber + * the interrupted registers instead. Detect the EL0-preemption case from + * the live PSTATE (M[3:0]==0 => EL0t) and use PC for both save and + * redirect. + */ uint64_t saved_regs[31]; uint64_t saved_sp, saved_pc, saved_pstate; + uint64_t cur_cpsr = 0; + hv_vcpu_get_reg(vcpu, HV_REG_CPSR, &cur_cpsr); + bool el0_preempt = (cur_cpsr & 0xfULL) == 0; vcpu_snapshot_gprs(vcpu, saved_regs); saved_sp = vcpu_get_sysreg(vcpu, HV_SYS_REG_SP_EL0); - saved_pc = vcpu_get_sysreg(vcpu, HV_SYS_REG_ELR_EL1); - saved_pstate = vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1); + if (el0_preempt) { + hv_vcpu_get_reg(vcpu, HV_REG_PC, &saved_pc); + saved_pstate = cur_cpsr; + } else { + saved_pc = vcpu_get_sysreg(vcpu, HV_SYS_REG_ELR_EL1); + saved_pstate = vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1); + } /* 1b. rseq abort: if the thread is in a restartable sequence critical * section, abort it. Linux does this on every signal delivery. @@ -1549,6 +1570,16 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) /* SPSR_EL1: EL0t (user mode) */ hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, 0); + /* EL0-preemption delivery: the resume runs from HV_REG_PC, not via an + * ERET that consumes ELR_EL1, so redirect the live PC/PSTATE directly. + * The ELR_EL1/SPSR_EL1 writes above still cover the rt_sigreturn path, + * which unwinds back to EL0 through the shim ERET. + */ + if (el0_preempt) { + hv_vcpu_set_reg(vcpu, HV_REG_PC, act->sa_handler); + hv_vcpu_set_reg(vcpu, HV_REG_CPSR, 0); /* EL0t */ + } + /* X0 = signal number */ hv_vcpu_set_reg(vcpu, HV_REG_X0, (uint64_t) signum); @@ -1590,8 +1621,11 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) * shim still has the interrupted syscall frame on its EL1 stack. Tell it * to drop that frame so the handler PC/SP/LR/args installed above are not * overwritten before ERET. Fault/BRK delivery paths ignore this marker. + * The EL0-preemption path resumes straight into the handler at EL0 with + * no shim frame to drop, so the marker is neither needed nor consulted. */ - hv_vcpu_set_reg(vcpu, HV_REG_X8, 2); + if (!el0_preempt) + hv_vcpu_set_reg(vcpu, HV_REG_X8, 2); pthread_mutex_unlock(&sig_lock); return 1; From 23b23073bb2fa03a023dbe655629b8c7aa4f5b39 Mon Sep 17 00:00:00 2001 From: Max042004 Date: Sat, 6 Jun 2026 21:09:41 +0800 Subject: [PATCH 2/2] Deliver synchronous fault signals to the faulting thread A guest exception (SIGSEGV/SIGBUS/SIGILL/SIGFPE/SIGTRAP) was delivered by calling signal_set_fault_info() (which records si_code/si_addr in a _Thread_local slot) followed by signal_queue() + signal_deliver(). But signal_queue() sets a bit in the process-wide pending mask, and every vCPU thread runs signal_deliver() at its own poll points. So a fault raised by thread A could be dequeued and delivered by thread B, whose thread-local fault info is empty -- the siginfo then degrades to si_code=SI_USER with no si_addr instead of SEGV_MAPERR/si_addr. A JVM treats a SIGSEGV with si_code=SI_USER as a fatal external signal rather than a recoverable implicit null check, so javac (heavily multi-threaded, many null-check faults) crashed with a fatal error report. Two threads faulting on the same signal also collapsed into a single bitmask bit, dropping one fault entirely. Add signal_deliver_fault(), which delivers a given signal directly to the current thread using its thread-local fault info and never touches the process-wide pending set. signal_deliver()'s frame-building tail is factored into deliver_signal_locked() and shared. Route all four synchronous-fault sites in the vCPU loop (SEGV_ACCERR, BRK->SIGTRAP, SIGILL, data-abort SIGSEGV) through it. Add test-fault-signal-mt: N threads each take recoverable SIGSEGVs in a loop and assert every delivery is SEGV_MAPERR with the right si_addr on the faulting thread. It fails deterministically (rc=2, fault delivered to a non-faulting thread) on the old queue-based path and passes on the fix. (cherry picked from commit 201b4fa0d2ffb2218e252be71930e1cfb9e360a5) --- Makefile | 6 ++ src/syscall/proc.c | 19 +++--- src/syscall/signal.c | 89 +++++++++++++++++-------- src/syscall/signal.h | 10 +++ tests/manifest.txt | 1 + tests/test-fault-signal-mt.c | 124 +++++++++++++++++++++++++++++++++++ 6 files changed, 215 insertions(+), 34 deletions(-) create mode 100644 tests/test-fault-signal-mt.c diff --git a/Makefile b/Makefile index e382cc4..671be28 100644 --- a/Makefile +++ b/Makefile @@ -177,6 +177,12 @@ $(BUILD_DIR)/test-scm-creds: tests/test-scm-creds.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-fault-signal-mt spawns pthreads that each take recoverable SIGSEGVs to +# stress synchronous-fault delivery routing in a multi-threaded guest. +$(BUILD_DIR)/test-fault-signal-mt: tests/test-fault-signal-mt.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + # test-shim-cred-race spawns a pthread reader while the main thread # toggles setresuid; the reader spins on the identity fast path. $(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR) diff --git a/src/syscall/proc.c b/src/syscall/proc.c index f79720f..f4defae 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -1454,8 +1454,9 @@ int vcpu_run_loop(hv_vcpu_t vcpu, uint64_t esr; hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &esr); signal_set_fault_info(LINUX_SEGV_ACCERR, far, esr); - signal_queue(LINUX_SIGSEGV); - int sig_ret = signal_deliver(vcpu, g, &exit_code); + int sig_ret = + signal_deliver_fault(vcpu, g, LINUX_SIGSEGV, + &exit_code); if (sig_ret < 0) running = false; break; @@ -1538,7 +1539,6 @@ int vcpu_run_loop(hv_vcpu_t vcpu, hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &brk_esr); signal_set_fault_info(LINUX_TRAP_BRKPT, brk_pc, brk_esr); - signal_queue(LINUX_SIGTRAP); if (verbose) { uint64_t thread_blocked = current_thread ? current_thread->blocked @@ -1550,7 +1550,9 @@ int vcpu_run_loop(hv_vcpu_t vcpu, (unsigned long long) signal_get_state() ->pending); } - int sig_ret = signal_deliver(vcpu, g, &exit_code); + int sig_ret = + signal_deliver_fault(vcpu, g, LINUX_SIGTRAP, + &exit_code); if (verbose) log_debug("%s: signal_deliver returned %d", prefix, sig_ret); @@ -1612,8 +1614,9 @@ int vcpu_run_loop(hv_vcpu_t vcpu, prefix, (unsigned long long) elr_addr, (unsigned long long) esr, fault_ec); signal_set_fault_info(LINUX_ILL_ILLOPC, elr_addr, esr); - signal_queue(LINUX_SIGILL); - int sig_ret = signal_deliver(vcpu, g, &exit_code); + int sig_ret = + signal_deliver_fault(vcpu, g, LINUX_SIGILL, + &exit_code); /* HVC #11 consumes X8 as the post-fault TLBI opcode. * signal_deliver() may leave it unchanged when no * handler is materialized, or set the syscall-path @@ -1681,8 +1684,8 @@ int vcpu_run_loop(hv_vcpu_t vcpu, (unsigned long long) esr, fsc, code_name); } signal_set_fault_info(si_code, far_addr, esr); - signal_queue(LINUX_SIGSEGV); - int sig_ret = signal_deliver(vcpu, g, &exit_code); + int sig_ret = + signal_deliver_fault(vcpu, g, LINUX_SIGSEGV, &exit_code); /* HVC #11 consumes X8 as the post-fault TLBI opcode. * signal_deliver() may leave it unchanged when no * handler is materialized, or set the syscall-path diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 5a25194..7c917c4 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -1317,37 +1317,27 @@ static void build_sigcontext_reserved(uint8_t *reserved, memset(reserved + off, 0, 8); } -int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) +/* Build and install the rt_sigframe for `signum` on the current thread, with + * sig_lock held on entry and released on every return path. Shared by + * signal_deliver() (signal selected from the process-wide pending set) and + * signal_deliver_fault() (synchronous fault forced onto the faulting thread). + * rt_info supplies si_code/si_pid/sigval when no thread-local pending_fault is + * set; the pending_fault is consumed (one-shot) when valid. Returns 1 if a + * handler frame was installed, 0 if the signal was ignored, and -1 (with + * *exit_code set) when the default disposition terminates the guest. + */ +static int deliver_signal_locked(hv_vcpu_t vcpu, + guest_t *g, + int signum, + signal_rt_info_t rt_info, + int *exit_code) { - pthread_mutex_lock(&sig_lock); uint64_t *blocked = thread_blocked_ptr(); uint64_t *saved_ptr = thread_saved_blocked_ptr(); bool *valid_ptr = thread_saved_valid_ptr(); - uint64_t deliverable = sig_state.pending & ~*blocked; - if (deliverable == 0) { - pthread_mutex_unlock(&sig_lock); - return 0; - } - /* Find lowest pending unblocked signal */ - int signum = bit_ctz64(deliverable) + 1; - signal_rt_info_t rt_info = signal_default_info(signum); - - /* Dequeue: for RT signals, decrement count and only clear the - * pending bit when the queue is empty. Standard signals are - * always cleared (single instance, bitmask semantics). - */ - if (signum >= LINUX_SIGRTMIN) { - signal_rt_dequeue_locked(signum, &rt_info); - } else { - rt_info = signal_standard_peek_locked(signum); - sig_state.std_info_valid[signum - 1] = false; - sig_state.pending &= ~sig_bit(signum); - } - - /* signum is bit_ctz64(deliverable) + 1, bounded 1..64 by the 64-bit - * pending mask. The static analyzer cannot see the bound, so gate the - * array access defensively. + /* signum is 1..64 from the caller; the static analyzer cannot see the + * bound, so gate the array access defensively. */ int idx = signum - 1; if (!RANGE_CHECK(idx, 0, LINUX_NSIG)) { @@ -1631,6 +1621,53 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) return 1; } +int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) +{ + pthread_mutex_lock(&sig_lock); + uint64_t *blocked = thread_blocked_ptr(); + uint64_t deliverable = sig_state.pending & ~*blocked; + if (deliverable == 0) { + pthread_mutex_unlock(&sig_lock); + return 0; + } + + /* Find lowest pending unblocked signal */ + int signum = bit_ctz64(deliverable) + 1; + signal_rt_info_t rt_info = signal_default_info(signum); + + /* Dequeue: for RT signals, decrement count and only clear the + * pending bit when the queue is empty. Standard signals are + * always cleared (single instance, bitmask semantics). + */ + if (signum >= LINUX_SIGRTMIN) { + signal_rt_dequeue_locked(signum, &rt_info); + } else { + rt_info = signal_standard_peek_locked(signum); + sig_state.std_info_valid[signum - 1] = false; + sig_state.pending &= ~sig_bit(signum); + } + + return deliver_signal_locked(vcpu, g, signum, rt_info, exit_code); +} + +int signal_deliver_fault(hv_vcpu_t vcpu, guest_t *g, int signum, int *exit_code) +{ + /* Synchronous faults (SIGSEGV/SIGBUS/SIGILL/SIGFPE/SIGTRAP) are specific to + * the thread that triggered them and must be delivered to that thread with + * the thread-local fault info set by signal_set_fault_info(). Routing them + * through the process-wide pending bitmask (signal_queue + signal_deliver) + * is racy: another vCPU thread can dequeue the bit and deliver it with no + * fault info (si_code becomes SI_USER, which makes a JVM treat a recoverable + * implicit null-check as a fatal external signal), and two threads faulting + * on the same signal collapse into one bit so one fault is lost. Deliver + * directly here, never touching sig_state.pending. The blocked mask is + * intentionally ignored: a synchronous fault cannot be postponed. + */ + pthread_mutex_lock(&sig_lock); + signal_rt_info_t rt_info = signal_default_info(signum); + return deliver_signal_locked(vcpu, g, signum, rt_info, exit_code); +} + /* rt_sigreturn. */ int signal_rt_sigreturn(hv_vcpu_t vcpu, guest_t *g) diff --git a/src/syscall/signal.h b/src/syscall/signal.h index f5792f5..927e1aa 100644 --- a/src/syscall/signal.h +++ b/src/syscall/signal.h @@ -274,6 +274,16 @@ void signal_set_shim_globals_guest(guest_t *g); */ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code); +/* Deliver a synchronous fault signal directly to the faulting (current) thread, + * bypassing the process-wide pending set. The caller must have set the fault + * info via signal_set_fault_info() immediately before. Same return convention + * as signal_deliver(). Use this for SIGSEGV/SIGBUS/SIGILL/SIGFPE/SIGTRAP raised + * from a guest exception, never signal_queue()+signal_deliver(): a queued fault + * can be stolen by another vCPU thread (delivered as SI_USER, no si_addr) or + * coalesced with another thread's fault into one bitmask bit. + */ +int signal_deliver_fault(hv_vcpu_t vcpu, guest_t *g, int signum, int *exit_code); + /* Handle rt_sigreturn (SYS 139): restore registers from rt_sigframe on * the guest stack. Returns SYSCALL_EXEC_HAPPENED to skip X0 writeback. */ diff --git a/tests/manifest.txt b/tests/manifest.txt index 7273505..7255ad6 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -87,6 +87,7 @@ test-negative # diff=skip [section] Signal + thread tests test-signal-thread +test-fault-signal-mt # diff=skip [section] Fork edge cases test-clone3 # diff=skip diff --git a/tests/test-fault-signal-mt.c b/tests/test-fault-signal-mt.c new file mode 100644 index 0000000..49a857c --- /dev/null +++ b/tests/test-fault-signal-mt.c @@ -0,0 +1,124 @@ +/* Multi-threaded synchronous-fault delivery test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression lock-in for synchronous fault (SIGSEGV) delivery in a + * multi-threaded guest. Faults were routed through the process-wide pending + * bitmask (signal_queue + signal_deliver), so a fault raised by one vCPU + * thread could be dequeued and delivered by another: the other thread had no + * thread-local fault info, so the kernel-visible siginfo became si_code=SI_USER + * with no si_addr instead of SEGV_MAPERR. A JVM treats such a SIGSEGV as a + * fatal external signal rather than a recoverable implicit null check, so javac + * crashed. Two threads faulting at once also collapsed into a single bitmask + * bit, dropping one fault entirely. + * + * Each thread repeatedly takes a recoverable SIGSEGV (read from a known bad + * address) and the handler asserts the siginfo is the correct synchronous-fault + * shape. Under the bug, a fault delivered to the wrong thread is seen either as + * SI_USER or on a thread with no armed recovery point, which this test flags. + * + * Syscalls exercised: rt_sigaction, clone (pthreads), the fault delivery path. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#define NTHREADS 4 +#define NFAULTS 4000 + +/* The bad address every thread reads. Page zero is never mapped, so the read + * faults with a translation fault -> SEGV_MAPERR and si_addr == FAULT_ADDR. */ +#define FAULT_ADDR ((volatile int *) (uintptr_t) 0x8) + +static _Thread_local sigjmp_buf recover; +static _Thread_local volatile int armed; + +/* Set by the handler when it observes a malformed delivery. async-signal-safe + * stores to sig_atomic_t/int flags only. */ +static volatile sig_atomic_t bad_si_code = 0; +static volatile sig_atomic_t bad_si_addr = 0; +static volatile sig_atomic_t unarmed_delivery = 0; + +static void segv_handler(int sig, siginfo_t *info, void *uc) +{ + (void) sig; + (void) uc; + /* A synchronous fault must report a fault si_code (SEGV_MAPERR for an + * unmapped address), never SI_USER (0), which is what a stolen/cross-thread + * delivery produced. */ + if (info->si_code != SEGV_MAPERR && info->si_code != SEGV_ACCERR) + bad_si_code = 1; + if (info->si_addr != (void *) FAULT_ADDR) + bad_si_addr = 1; + if (!armed) { + /* Delivered to a thread that is not currently faulting: the fault was + * misrouted. Cannot recover (no jmp target), so fail and exit. */ + unarmed_delivery = 1; + _exit(2); + } + siglongjmp(recover, 1); +} + +static void *fault_loop(void *arg) +{ + (void) arg; + for (int i = 0; i < NFAULTS; i++) { + armed = 1; + if (sigsetjmp(recover, 1) == 0) { + /* Trigger the fault. volatile so the read is not optimized away. */ + volatile int v = *FAULT_ADDR; + (void) v; + } + armed = 0; + } + return NULL; +} + +int main(void) +{ + printf("test-fault-signal-mt: multi-threaded SIGSEGV delivery\n\n"); + + struct sigaction sa; + sa.sa_sigaction = segv_handler; + sa.sa_flags = SA_SIGINFO | SA_NODEFER; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) != 0) { + printf(" %-34s FAIL: sigaction (errno=%d)\n", "setup", errno); + return 1; + } + + TEST("concurrent recoverable SIGSEGV faults"); + pthread_t th[NTHREADS]; + int spawned = 0; + for (int i = 0; i < NTHREADS; i++) { + if (pthread_create(&th[i], NULL, fault_loop, NULL) != 0) + break; + spawned++; + } + for (int i = 0; i < spawned; i++) + pthread_join(th[i], NULL); + + if (spawned != NTHREADS) + FAIL("could not spawn all threads"); + else if (bad_si_code) + FAIL("fault delivered with si_code != SEGV_MAPERR/ACCERR (SI_USER?)"); + else if (bad_si_addr) + FAIL("fault delivered with wrong si_addr"); + else if (unarmed_delivery) + FAIL("fault delivered to a non-faulting thread"); + else + PASS(); + + SUMMARY("test-fault-signal-mt"); + return fails ? 1 : 0; +}