Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions mk/tests.mk
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
test-glibc-coreutils test-perf \
test-rosetta-cli test-rosetta-statics test-rosetta-failure-modes \
test-rosetta-alpine test-rosetta-audit test-rosetta-jit \
test-rosetta-glibc test-rosetta-all bench-rosetta \
test-rosetta-glibc test-rosetta-madvise test-rosetta-all bench-rosetta \
test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \
test-full test-multi-vcpu test-rwx test-sysroot-rename \
test-case-collision test-case-collision-fallback test-sysroot-create-paths \
Expand Down Expand Up @@ -187,10 +187,14 @@ test-rosetta-jit: $(ELFUSE_BIN)
test-rosetta-glibc: $(ELFUSE_BIN)
$(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-glibc.sh $(ELFUSE_BIN),test-rosetta-glibc)

test-rosetta-madvise: $(ELFUSE_BIN)
$(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-madvise.sh $(ELFUSE_BIN),test-rosetta-madvise)

## Run every Rosetta-specific test target in sequence.
test-rosetta-all: test-rosetta-cli test-rosetta-failure-modes \
test-rosetta-statics test-rosetta-alpine \
test-rosetta-audit test-rosetta-jit test-rosetta-glibc
test-rosetta-audit test-rosetta-jit test-rosetta-glibc \
test-rosetta-madvise

## Wall-clock bench harness for x86_64-via-Rosetta workloads. Prints
## best-of-N samples plus the aarch64 reference where available. Set
Expand Down
21 changes: 17 additions & 4 deletions src/syscall/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -2997,7 +2997,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)
* is updated.
*/
uint64_t off = addr - g->ipa_base;
if (off > g->guest_size || length > g->guest_size - off)
/* Accept ranges in the primary IPA window, and also high-VA mmap regions
* (gpa_base != start) that the tracker records as mapped. Rosetta's own
* slab/JIT and guest JITs (e.g. V8) decommit pages in the high-VA window
* via mprotect(PROT_NONE)+madvise(MADV_DONTNEED); rejecting those with
* ENOMEM trips the guest's CHECK_EQ(0, ret) on the madvise return. */
bool in_primary = (off <= g->guest_size && length <= g->guest_size - off);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Missing off + length overflow guard can make invalid high-VA madvise ranges pass mapping checks and return success.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/syscall/mem.c, line 3005:

<comment>Missing `off + length` overflow guard can make invalid high-VA madvise ranges pass mapping checks and return success.</comment>

<file context>
@@ -2997,7 +2997,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)
+     * slab/JIT and guest JITs (e.g. V8) decommit pages in the high-VA window
+     * via mprotect(PROT_NONE)+madvise(MADV_DONTNEED); rejecting those with
+     * ENOMEM trips the guest's CHECK_EQ(0, ret) on the madvise return. */
+    bool in_primary = (off <= g->guest_size && length <= g->guest_size - off);
+    if (!in_primary && !madvise_range_mapped(g, off, length))
         return -LINUX_ENOMEM;
</file context>
Suggested change
bool in_primary = (off <= g->guest_size && length <= g->guest_size - off);
if (off > UINT64_MAX - length)
return -LINUX_ENOMEM;
bool in_primary = (off <= g->guest_size && length <= g->guest_size - off);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: The high-VA admission check is too loose: it can approve ranges based only on region coverage even though the rest of sys_madvise() still uses the low-IPA off layout.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/syscall/mem.c, line 3005:

<comment>The high-VA admission check is too loose: it can approve ranges based only on region coverage even though the rest of `sys_madvise()` still uses the low-IPA `off` layout.</comment>

<file context>
@@ -2997,7 +2997,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)
+     * slab/JIT and guest JITs (e.g. V8) decommit pages in the high-VA window
+     * via mprotect(PROT_NONE)+madvise(MADV_DONTNEED); rejecting those with
+     * ENOMEM trips the guest's CHECK_EQ(0, ret) on the madvise return. */
+    bool in_primary = (off <= g->guest_size && length <= g->guest_size - off);
+    if (!in_primary && !madvise_range_mapped(g, off, length))
         return -LINUX_ENOMEM;
</file context>

if (!in_primary && !madvise_range_mapped(g, off, length))
return -LINUX_ENOMEM;

/* Defensive guard against destructive advice on infrastructure
Expand Down Expand Up @@ -3050,11 +3056,18 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)

uint64_t zstart = (r->start > off) ? r->start : off;
uint64_t zend = (r->end < end) ? r->end : end;
memset((uint8_t *) g->host_base + zstart, 0, zend - zstart);
if (!(r->flags & LINUX_MAP_ANONYMOUS)) {
/* High-VA regions back their pages at gpa_base, not at the VA;
* resolve the host pointer through the GPA so the reset hits the
* real backing (host_ptr_for_gpa also follows live overlays). For
* identity regions gpa_base == start, so this is unchanged. */
uint64_t rgpa = r->gpa_base + (zstart - r->start);
memset(host_ptr_for_gpa(g, rgpa), 0, zend - zstart);
if (!(r->flags & LINUX_MAP_ANONYMOUS) && r->gpa_base == r->start) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: MADV_DONTNEED on high-VA file-backed regions now zero-fills without restoring file contents, producing incorrect data after a successful call.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/syscall/mem.c, line 3065:

<comment>`MADV_DONTNEED` on high-VA file-backed regions now zero-fills without restoring file contents, producing incorrect data after a successful call.</comment>

<file context>
@@ -3050,11 +3056,18 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)
+             * identity regions gpa_base == start, so this is unchanged. */
+            uint64_t rgpa = r->gpa_base + (zstart - r->start);
+            memset(host_ptr_for_gpa(g, rgpa), 0, zend - zstart);
+            if (!(r->flags & LINUX_MAP_ANONYMOUS) && r->gpa_base == r->start) {
                 /* EOF leaves the tail zero per mmap rules; the helper
                  * already returns 0 in that case after stopping the
</file context>

/* EOF leaves the tail zero per mmap rules; the helper
* already returns 0 in that case after stopping the
* read loop.
* read loop. File-backed restore via host_base+off is only
* correct for identity regions; high-VA file mappings keep
* the zero-fill above (not exercised by current JIT guests).
*/
int err = read_file_range_to_guest(
g, zstart, r->backing_fd, r->offset + (zstart - r->start),
Expand Down
8 changes: 6 additions & 2 deletions tests/fixtures/rosetta/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Rosetta x86_64 test fixtures vendored for self-contained matrix coverage.
- static x86_64 Linux ELF built from `tests/x86_64-rosetta-audit.c`
- `x86_64-rosetta-tls0`
- static x86_64 Linux ELF built from `tests/x86_64-rosetta-tls0.c`
- `x86_64-rosetta-madvise`
- static x86_64 Linux ELF built from `tests/x86_64-rosetta-madvise.c`
- used by `tests/test-rosetta-madvise.sh`
- `x86_64-glibc-rootfs.tar.gz`
- minimal x86_64 glibc rootfs used by `tests/test-rosetta-glibc.sh`
- contains `hello-dynamic`, `dlopen-probe`, `tls-probe`,
Expand Down Expand Up @@ -34,8 +37,9 @@ gcc -O2 -o tls-probe tests/x86_64-glibc-tls.c
gcc -O2 -fPIC -shared -o libgdtls.so tests/x86_64-glibc-gdtls-lib.c
gcc -O2 -ldl -o gdtls-probe tests/x86_64-glibc-gdtls.c
gcc -O2 -pthread -o pthread-tls-probe tests/x86_64-glibc-pthread-tls.c
gcc -O2 -static -o x86_64-rosetta-audit tests/x86_64-rosetta-audit.c
gcc -O2 -static -o x86_64-rosetta-tls0 tests/x86_64-rosetta-tls0.c
gcc -O2 -static -o x86_64-rosetta-audit tests/x86_64-rosetta-audit.c
gcc -O2 -static -o x86_64-rosetta-tls0 tests/x86_64-rosetta-tls0.c
gcc -O2 -static -o x86_64-rosetta-madvise tests/x86_64-rosetta-madvise.c
# Stage the matching ld.so / libc.so.6 / libm.so.6 from the same host
# into a rootfs/ tree alongside libgdtls.so under lib/x86_64-linux-gnu/,
# then tar -czf x86_64-glibc-rootfs.tar.gz rootfs/.
Expand Down
Binary file added tests/fixtures/rosetta/x86_64-rosetta-madvise
Binary file not shown.
6 changes: 5 additions & 1 deletion tests/test-matrix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -612,10 +612,14 @@ run_rosetta_x86_64_suites()
printf "\nRosetta glibc dynamic\n"
run_summary_suite "rosetta-glibc" \
bash "${REPO_ROOT}/tests/test-rosetta-glibc.sh" "$ELFUSE" || rc=1

printf "\nRosetta high-VA madvise\n"
run_summary_suite "rosetta-madvise" \
bash "${REPO_ROOT}/tests/test-rosetta-madvise.sh" "$ELFUSE" || rc=1
else
local suite
for suite in rosetta-statics rosetta-alpine rosetta-audit rosetta-jit \
rosetta-glibc; do
rosetta-glibc rosetta-madvise; do
skip_suite "$suite" "Rosetta translator not installed"
done
fi
Expand Down
66 changes: 66 additions & 0 deletions tests/test-rosetta-madvise.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# test-rosetta-madvise.sh - madvise(MADV_DONTNEED) on high-VA regions via Rosetta
#
# Copyright 2026 elfuse contributors
# SPDX-License-Identifier: Apache-2.0
#
# Regression for elfuse sys_madvise rejecting high-VA mmap regions with ENOMEM.
# Under Rosetta, anonymous mmap(NULL) lands in the high-VA window where
# sys_madvise was primary-window-only and returned ENOMEM for every
# MADV_DONTNEED. V8's page allocator decommits guard/code pages with
# mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the result,
# so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT initialized.
#
# Fixture: tests/fixtures/rosetta/x86_64-rosetta-madvise (vendored x86_64 ELF).
#
# Usage: tests/test-rosetta-madvise.sh [path/to/elfuse]

set -euo pipefail

# Resolve the elfuse binary to an absolute path so later directory changes
# (if any) cannot invalidate it.
ELFUSE_INPUT="${1:-build/elfuse}"
case "$ELFUSE_INPUT" in
    /*) ELFUSE="$ELFUSE_INPUT" ;;
    *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;;
esac

ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}"
MADV_BIN="$(pwd)/tests/fixtures/rosetta/x86_64-rosetta-madvise"

# shellcheck source=tests/lib/rosetta-test.sh
. "$(dirname "$0")/lib/rosetta-test.sh"

# Counters shared with the report_* helpers sourced above.
pass=0
fail=0
skip=0
total=0

# Exit 77 marks the suite as skipped (no translator / no fixture); exit 1 is
# a hard error (the binary under test itself is missing).
if [ ! -x "$ROSETTA_PATH" ]; then
    printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2
    exit 77
fi
if [ ! -x "$ELFUSE" ]; then
    printf 'elfuse binary not found: %s\n' "$ELFUSE" >&2
    exit 1
fi

require_timeout

if [ ! -x "$MADV_BIN" ]; then
    printf 'vendored Rosetta madvise fixture missing under tests/fixtures/rosetta/\n' >&2
    exit 77
fi

total=$((total + 1))
# Drop errexit around the guest run so a failing fixture is reported through
# report_fail instead of aborting the script.
set +e
madv_out="$("$TIMEOUT" 30 "$ELFUSE" "$MADV_BIN" 2>&1)"
madv_rc=$?
set -e
# Require both a clean exit code and the fixture's final success line.
if [ "$madv_rc" -eq 0 ] &&
    printf '%s\n' "$madv_out" | grep -q 'madvise high-VA: all subtests passed'; then
    report_pass "madvise-high-va-dontneed"
else
    report_fail "madvise-high-va-dontneed: rc=$madv_rc"
    printf '%s\n' "$madv_out" >&2
fi

report_summary "$total"

# Propagate failures through the exit status so standalone runs and the
# matrix runner's `|| rc=1` both see them. This is a no-op if report_summary
# already exited non-zero.
if [ "$fail" -gt 0 ]; then
    exit 1
fi
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Script exits 0 even when tests fail — missing if [ "$fail" -gt 0 ]; then exit 1; fi after report_summary. This masks test failures in standalone runs and weakens the matrix runner's belt-and-suspenders || rc=1 check.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At tests/test-rosetta-madvise.sh, line 66:

<comment>Script exits 0 even when tests fail — missing `if [ "$fail" -gt 0 ]; then exit 1; fi` after `report_summary`. This masks test failures in standalone runs and weakens the matrix runner's belt-and-suspenders `|| rc=1` check.</comment>

<file context>
@@ -0,0 +1,66 @@
+    printf '%s\n' "$madv_out" >&2
+fi
+
+report_summary "$total"
</file context>

185 changes: 185 additions & 0 deletions tests/x86_64-rosetta-madvise.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
/* x86_64-rosetta-madvise.c - madvise(MADV_DONTNEED) on high-VA regions
*
* Copyright 2026 elfuse contributors
* SPDX-License-Identifier: Apache-2.0
*
* Regression for elfuse sys_madvise rejecting high-VA mmap regions with
* ENOMEM. Under Rosetta, anonymous mmap(NULL) lands in the high-VA window
* (the region's gpa_base diverges from its VA start), where sys_madvise was
* primary-window-only: it computed off = addr - ipa_base and rejected any
* range past guest_size with ENOMEM, even though sys_mprotect already handles
* the same high-VA range. V8's page allocator decommits guard/code pages with
* mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the madvise
* return, so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT
* initialized.
*
* Each subtest prints "PASS <name>" / "FAIL <name>"; main() exits non-zero on
* any failure so the shell harness can gate on the exit code.
*
* This is an x86_64 Linux static ELF, run through elfuse + Rosetta. It is not
* built in-tree (the Makefile builds aarch64 hosts); rebuild out of tree and
* re-vendor per tests/fixtures/rosetta/README.md.
*/

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Fallback for toolchains whose <sys/mman.h> does not expose MADV_DONTNEED.
 * NOTE(review): 4 is presumably the Linux ABI value - confirm against the
 * target headers. */
#ifndef MADV_DONTNEED
#define MADV_DONTNEED 4
#endif

/* Size in bytes of one test page; every mapping below is a multiple of it. */
#define PAGE ((size_t) 4096)

/* Number of failed subtests. Each subtest increments it on failure and main()
 * turns a non-zero count into a non-zero exit status. */
static int fails;

/* Classify a pointer as belonging to the high-VA window.
 *
 * The primary IPA window spans only a few GiB, while Rosetta maps guest memory
 * at native x86_64 addresses well above that. Any address strictly greater
 * than 4 GiB is therefore treated as high-VA, which is the range the
 * regression needs to exercise.
 *
 * Returns 1 when p is above the 4 GiB mark, 0 otherwise. */
static int is_high_va(const void *p)
{
    const uint64_t four_gib = 0x100000000ULL;
    const uint64_t va = (uint64_t) (uintptr_t) p;

    return four_gib < va;
}

/* Subtest "dontneed-rw": a single writable page in the high-VA window must
 * accept MADV_DONTNEED (return 0) and read back as all zeroes afterwards.
 * Prints one PASS/FAIL line and bumps the failure counter on any failure. */
static void test_dontneed_rw(void)
{
    void *region = mmap(NULL, PAGE, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (region == MAP_FAILED) {
        /* Nothing was mapped, so there is nothing to unmap. */
        printf("FAIL dontneed-rw: mmap errno=%d\n", errno);
        fails++;
        return;
    }

    const unsigned char *bytes = region;
    int ok = 0;

    if (!is_high_va(region)) {
        printf("FAIL dontneed-rw: mapping not in high-VA window (%p)\n",
               region);
        goto done;
    }

    /* Dirty the page so a successful DONTNEED is observable as zero-fill. */
    memset(region, 0xAA, PAGE);
    errno = 0;
    if (madvise(region, PAGE, MADV_DONTNEED) != 0) {
        printf("FAIL dontneed-rw: madvise rc=-1 errno=%d\n", errno);
        goto done;
    }

    for (unsigned idx = 0; idx < PAGE; idx++) {
        if (bytes[idx] != 0) {
            printf("FAIL dontneed-rw: byte %u not zeroed\n", idx);
            goto done;
        }
    }

    printf("PASS dontneed-rw\n");
    ok = 1;

done:
    if (!ok)
        fails++;
    munmap(region, PAGE);
}

/* Subtest "dontneed-protnone": the exact V8 decommit pattern.
 *
 * A two-page high-VA mapping is filled with 0xBB; the second page (the
 * "guard") is set PROT_NONE and then MADV_DONTNEED'd. Linux returns 0 for a
 * mapped-but-PROT_NONE page; after re-granting RW the guard page must read
 * back as zero.
 *
 * The first page lies outside the advised range, so it must still hold the
 * 0xBB pattern afterwards. Checking it catches an implementation that zeroes
 * more than the requested range.
 *
 * Prints one PASS/FAIL line and bumps the failure counter on any failure. */
static void test_dontneed_protnone(void)
{
    size_t sz = 2u * PAGE;
    void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        printf("FAIL dontneed-protnone: mmap errno=%d\n", errno);
        fails++;
        return;
    }
    if (!is_high_va(p)) {
        printf("FAIL dontneed-protnone: mapping not in high-VA window (%p)\n",
               p);
        fails++;
        munmap(p, sz);
        return;
    }
    void *guard = (char *) p + PAGE;
    /* Dirty both pages: the guard to observe the zero-fill, the first page
     * to observe that it is left alone. */
    memset(p, 0xBB, sz);
    if (mprotect(guard, PAGE, PROT_NONE) != 0) {
        printf("FAIL dontneed-protnone: mprotect PROT_NONE errno=%d\n", errno);
        fails++;
        munmap(p, sz);
        return;
    }
    errno = 0;
    if (madvise(guard, PAGE, MADV_DONTNEED) != 0) {
        printf("FAIL dontneed-protnone: madvise rc=-1 errno=%d\n", errno);
        fails++;
        munmap(p, sz);
        return;
    }
    /* The guard is unreadable while PROT_NONE; restore access to inspect it. */
    if (mprotect(guard, PAGE, PROT_READ | PROT_WRITE) != 0) {
        printf("FAIL dontneed-protnone: re-grant RW errno=%d\n", errno);
        fails++;
        munmap(p, sz);
        return;
    }
    for (unsigned i = 0; i < PAGE; i++) {
        if (((unsigned char *) guard)[i] != 0) {
            printf("FAIL dontneed-protnone: guard byte %u not zeroed\n", i);
            fails++;
            munmap(p, sz);
            return;
        }
    }
    /* The page before the guard was never advised and must be untouched. */
    for (unsigned i = 0; i < PAGE; i++) {
        if (((unsigned char *) p)[i] != 0xBB) {
            printf("FAIL dontneed-protnone: neighbour byte %u clobbered\n", i);
            fails++;
            munmap(p, sz);
            return;
        }
    }
    printf("PASS dontneed-protnone\n");
    munmap(p, sz);
}

/* Subtest "dontneed-multi": a single MADV_DONTNEED call covering a 16-page
 * high-VA mapping must return 0 and leave every byte of the span zeroed.
 * Prints one PASS/FAIL line and bumps the failure counter on any failure. */
static void test_dontneed_multi(void)
{
    const size_t span = 16u * PAGE;
    void *base = mmap(NULL, span, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) {
        /* Nothing was mapped, so there is nothing to unmap. */
        printf("FAIL dontneed-multi: mmap errno=%d\n", errno);
        fails++;
        return;
    }

    if (!is_high_va(base)) {
        printf("FAIL dontneed-multi: mapping not in high-VA window (%p)\n",
               base);
        fails++;
    } else {
        /* Dirty the whole span so the zero-fill is observable. */
        memset(base, 0xCC, span);
        errno = 0;
        if (madvise(base, span, MADV_DONTNEED) != 0) {
            printf("FAIL dontneed-multi: madvise rc=-1 errno=%d\n", errno);
            fails++;
        } else {
            /* Scan for the first byte that survived the discard. */
            const unsigned char *bytes = base;
            size_t pos = 0;
            while (pos < span && bytes[pos] == 0)
                pos++;

            if (pos < span) {
                printf("FAIL dontneed-multi: byte %zu not zeroed\n", pos);
                fails++;
            } else {
                printf("PASS dontneed-multi\n");
            }
        }
    }

    munmap(base, span);
}

/* Run every subtest, then print a one-line verdict. The exit status is 0 only
 * when no subtest recorded a failure, so the shell harness can gate on it. */
int main(void)
{
    test_dontneed_rw();
    test_dontneed_protnone();
    test_dontneed_multi();

    if (!fails) {
        printf("madvise high-VA: all subtests passed\n");
        return 0;
    }

    printf("madvise high-VA: %d subtest(s) failed\n", fails);
    return 1;
}
Loading