From 41d43fd79d967c01118ef561550be68214b39642 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 5 Jun 2026 15:11:07 +0800 Subject: [PATCH] Short-circuit mprotect on matching tracker prot Repeat mprotect with the same prot on RELRO, JIT, and GC-style ranges no longer walks page tables: a pre-check confirms every overlapping region already records the requested prot and is not MAP_NORESERVE, in which case the call returns 0 without touching the tracker or PTEs. The skip is gated on a per-request safety helper that forces the slow path for pure PROT_READ, because sys_mmap installs MEM_PERM_RW PTEs for non-exec mappings (including PROT_READ requests) and only the explicit guest_update_perms call inside mprotect tightens them to MEM_PERM_R. tests/test-negative pins this behavior with a regression test that mprotects PROT_READ onto a PROT_READ mmap and confirms a subsequent write traps SIGSEGV. The fast path runs before any mutation. When PTE work is required, it runs BEFORE guest_region_set_prot so a -ENOMEM from guest_{update_perms, extend_page_tables,invalidate_ptes} leaves the tracker at the OLD prot; the next retry sees the mismatch and re-attempts instead of silently no-op'ing on stale tracker state. The low-IPA branch was also missing return-value checks on guest_update_perms and guest_invalidate_ptes; both now propagate -LINUX_ENOMEM. Three table-full split paths now arm a sticky guest_t.regions_tracker_ stale flag: guest_region_set_prot's two split sites and guest_region_ remove's interior-split fallback. The fast path checks this flag and falls back to unconditional PTE work for the lifetime of the process once the tracker has lied even once. Without arming the flag at the remove site, an orphaned-but-still-mapped tail would present vacuously uniform prot to the fast path and skip required PTE updates. The flag is propagated across fork via the process-state IPC, which is bumped to v11 to carry a uint8 stale snapshot after num_guest_regions. The child also arms the flag locally if its receiving region table is smaller than the parent's GUEST_MAX_REGIONS would have allowed, so a child built with a tighter cap inherits the same conservative fast-path behavior. guest_region_remove is restructured as a single in/out compaction pass. The previous code walked the region array with one cursor and handled each overlap kind in its own branch, mutating g->regions in place and rescanning after a full-containment removal. The new pass keeps a write cursor 'out' alongside the input cursor 'in' (out <= in by the non-overlap invariant), so trim survivors are emitted into the compaction front and the untouched suffix is shifted once at the end. The two trim-only branches collapse into a single survivor-build block. The interior split remains the only growth path: it snapshots the source region, shifts the suffix to make room, writes both halves, and returns. guest_region_add_ex_owned_gpa replaces the O(n) bubble-insert with a binary search plus single memmove. guest_region_range_{prot_uniform, has_noreserve} use the same binary-search prefix-skip. find_free_gap_inner stops iterating once a region's start crosses max_addr; later regions cannot affect a candidate gap inside the search window. --- src/core/guest.c | 269 ++++++++++++++++++++++++++------------- src/core/guest.h | 34 +++++ src/runtime/fork-state.c | 15 ++- src/runtime/fork-state.h | 8 +- src/runtime/forkipc.c | 8 +- src/syscall/mem.c | 78 ++++++++---- tests/test-negative.c | 46 +++++++ 7 files changed, 335 insertions(+), 123 deletions(-) diff --git a/src/core/guest.c b/src/core/guest.c index ec8f7e6..fca75b3 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -1737,6 +1737,40 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b) return a->offset + (a->end - a->start) == b->offset; } +/* First region whose start is >= start. regions[] is sorted by start. */ +static int region_lower_bound_start(const guest_t *g, uint64_t start) +{ + int lo = 0; + int hi = g->nregions; + + while (lo < hi) { + int mid = lo + (hi - lo) / 2; + if (g->regions[mid].start < start) + lo = mid + 1; + else + hi = mid; + } + return lo; +} + +/* First region whose end is > addr. See guest.h for the contract; also used + * inside this file to skip the untouched prefix for remove and set_prot. + */ +int guest_region_first_end_above(const guest_t *g, uint64_t addr) +{ + int lo = 0; + int hi = g->nregions; + + while (lo < hi) { + int mid = lo + (hi - lo) / 2; + if (g->regions[mid].end <= addr) + lo = mid + 1; + else + hi = mid; + } + return lo; +} + /* Merge region at index i with its right neighbor (i+1) when their layouts * agree. No-op if i is the last region or layouts differ. */ @@ -1857,12 +1891,10 @@ int guest_region_add_ex_owned_gpa(guest_t *g, return -1; } - /* Find insertion point (keep sorted by start address) */ - int i = g->nregions; - while (i > 0 && g->regions[i - 1].start > start) { - g->regions[i] = g->regions[i - 1]; - i--; - } + /* Find insertion point (keep sorted by start address). */ + int i = region_lower_bound_start(g, start); + memmove(&g->regions[i + 1], &g->regions[i], + (g->nregions - i) * sizeof(guest_region_t)); guest_region_t *r = &g->regions[i]; r->start = start; @@ -1926,104 +1958,117 @@ int guest_preannounce(guest_t *g, void guest_region_remove(guest_t *g, uint64_t start, uint64_t end) { - int i = 0; - while (i < g->nregions) { - guest_region_t *r = &g->regions[i]; + if (end <= start) + return; - /* No overlap: region is entirely before the removal range */ - if (r->end <= start) { - i++; - continue; - } + /* In-place compaction: 'out' is the next output slot, 'in' is the next + * input slot. Since the prefix [0, first) is untouched (it sorts strictly + * before [start, end) by guest_region_first_end_above), both cursors begin + * at 'first'. The non-overlap invariant guarantees out <= in throughout the + * loop, so writes at g->regions[out] never clobber slots not yet read. + */ + int first = guest_region_first_end_above(g, start); + int out = first; + int in = first; - /* No overlap: region is entirely after the removal range */ + while (in < g->nregions) { + guest_region_t *r = &g->regions[in]; if (r->start >= end) - break; /* sorted, so done */ - - /* Full containment: remove the entire region */ - if (r->start >= start && r->end <= end) { - if (r->backing_fd >= 0) - close(r->backing_fd); - memmove(&g->regions[i], &g->regions[i + 1], - (g->nregions - i - 1) * sizeof(guest_region_t)); - g->nregions--; - continue; /* do not increment i */ - } - - /* Partial overlap: removal range cuts the beginning */ - if (r->start >= start && r->end > end) { - uint64_t trimmed = end - r->start; - r->offset += trimmed; - r->gpa_base += trimmed; - r->start = end; - guest_region_clip_overlay(r); - i++; - continue; - } + break; - /* Partial overlap: removal range cuts the end */ - if (r->start < start && r->end > start && r->end <= end) { - r->end = start; - guest_region_clip_overlay(r); - i++; - continue; - } + bool keep_left = r->start < start; + bool keep_right = r->end > end; - /* Split: removal range is entirely inside the region */ - if (r->start < start && r->end > end) { - /* Need to split into two regions: [r->start, start) and [end, - * r->end) - */ + /* Interior split: removal range lies strictly inside *r, producing + * two output entries from one input slot. This is the only growth + * path; handle it explicitly so the untouched suffix is shifted out + * of the way before either half is written. After this branch no + * further input regions can overlap [start, end), so the loop is + * done. + */ + if (keep_left && keep_right) { if (g->nregions >= GUEST_MAX_REGIONS) { - /* Region table is full; trim to [r->start, start) and drop - * the tail. The tail [end, r->end) becomes untracked in - * /proc/self/maps but remains mapped in page tables. + /* Table full: drop the tail [end, r->end) and fall through to + * the simple "trim end" treatment of *r. The tail stays mapped + * in page tables but is now untracked, so a later mprotect over + * that range would otherwise see vacuously uniform prot in the + * tracker and skip PTE work. Mark the tracker permanently stale + * to disarm the mprotect fast path for the lifetime of the + * process. */ log_error( "guest: region table full, " "munmap split drops tail [0x%llx-0x%llx)", (unsigned long long) end, (unsigned long long) r->end); - r->end = start; - i++; - continue; - } - /* Make room for the new region after i */ - memmove(&g->regions[i + 2], &g->regions[i + 1], - (g->nregions - i - 1) * sizeof(guest_region_t)); - - /* Right half: [end, old_end) */ - guest_region_t *right = &g->regions[i + 1]; - *right = *r; /* Copy attributes */ - right->offset += (end - r->start); - right->gpa_base += (end - r->start); - right->start = end; - if (r->backing_fd >= 0) { - /* A dup failure leaves backing_fd=-1, silently converting this - * half to anonymous semantics (msync and MADV_DONTNEED skip - * regions with backing_fd<0). Propagating the error would - * require making all region split callers (mprotect, munmap) - * fallible. - */ - right->backing_fd = dup(r->backing_fd); - if (right->backing_fd < 0) - log_error( - "guest: dup() failed for region split " - "backing fd %d: %s", - r->backing_fd, strerror(errno)); - } + g->regions_tracker_stale = true; + keep_right = false; + } else { + guest_region_t orig = *r; + int suffix_count = g->nregions - in - 1; + if (suffix_count > 0) + memmove(&g->regions[out + 2], &g->regions[in + 1], + suffix_count * sizeof(guest_region_t)); + + guest_region_t left = orig; + left.end = start; + guest_region_clip_overlay(&left); + g->regions[out] = left; + + guest_region_t right = orig; + uint64_t trimmed = end - orig.start; + right.offset += trimmed; + right.gpa_base += trimmed; + right.start = end; + if (orig.backing_fd >= 0) { + right.backing_fd = dup(orig.backing_fd); + if (right.backing_fd < 0) + log_error( + "guest: dup() failed for region split " + "backing fd %d: %s", + orig.backing_fd, strerror(errno)); + } + guest_region_clip_overlay(&right); + g->regions[out + 1] = right; - /* Left half keeps the original entry and shortens its end. */ - r->end = start; - guest_region_clip_overlay(r); - guest_region_clip_overlay(right); + g->nregions = out + 2 + suffix_count; + return; + } + } - g->nregions++; - i += 2; /* skip both halves */ + if (!keep_left && !keep_right) { + if (r->backing_fd >= 0) + close(r->backing_fd); + in++; continue; } - i++; + /* Trim-only paths: either keep_left xor keep_right is true. Build the + * surviving half from the source slot, then publish it to g->regions + * [out]. The original backing_fd transfers to whichever half survives; + * no dup is needed because only one half remains. + */ + guest_region_t survivor = *r; + if (keep_left) { + survivor.end = start; + } else { + uint64_t trimmed = end - r->start; + survivor.offset += trimmed; + survivor.gpa_base += trimmed; + survivor.start = end; + } + guest_region_clip_overlay(&survivor); + g->regions[out++] = survivor; + in++; } + + /* Append the unread suffix (regions whose start >= end) after the + * compacted overlap area, shifting only if compaction left a hole. + */ + int tail = g->nregions - in; + if (tail > 0 && out != in) + memmove(&g->regions[out], &g->regions[in], + tail * sizeof(guest_region_t)); + g->nregions = out + tail; } const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr) @@ -2043,6 +2088,35 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr) return NULL; } +bool guest_region_range_prot_uniform(const guest_t *g, + uint64_t start, + uint64_t end, + int prot) +{ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->prot != prot) + return false; + } + return true; +} + +bool guest_region_range_has_noreserve(const guest_t *g, + uint64_t start, + uint64_t end) +{ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->noreserve) + return true; + } + return false; +} + void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) { /* Walk regions overlapping [start, end), split at boundaries, update prot. @@ -2051,20 +2125,28 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) */ int first_modified = -1, last_modified = -1; - for (int i = 0; i < g->nregions; i++) { + /* The prefix skip ensures regions[i].end > start for i >= first; the + * non-overlap invariant carries it through all later iterations. + */ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { guest_region_t *r = &g->regions[i]; - if (r->end <= start) - continue; if (r->start >= end) break; /* If region extends before start, split at start */ if (r->start < start) { if (g->nregions >= GUEST_MAX_REGIONS) { + /* The region keeps its old prot in the tracker, but PTEs for + * [start, r->end) have already been updated. Mark the tracker + * permanently stale so the mprotect fast path falls back to + * unconditional PTE work and cannot be fooled by a tracker + * that lags actual PTE state. + */ log_error( "guest: region table full, " "mprotect split skipped at 0x%llx", (unsigned long long) start); + g->regions_tracker_stale = true; continue; } memmove(&g->regions[i + 1], &g->regions[i], @@ -2094,8 +2176,10 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) /* If region extends past end, split at end */ if (r->end > end) { if (g->nregions >= GUEST_MAX_REGIONS) { - /* Split failure applies prot to the whole region. - * The tail [end, r->end) gets new prot too. + /* Over-apply prot to the whole region: the tail [end, r->end) + * now claims new prot in the tracker even though PTE work + * did not cover it. Mark the tracker stale so the mprotect + * fast path stops trusting prot uniformity. */ log_error( "guest: region table full, " @@ -2103,6 +2187,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) "(region [0x%llx-0x%llx) gets prot %d entirely)", (unsigned long long) end, (unsigned long long) r->start, (unsigned long long) r->end, prot); + g->regions_tracker_stale = true; r->prot = prot; if (first_modified < 0) first_modified = i; diff --git a/src/core/guest.h b/src/core/guest.h index 7545ffb..4e393a0 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -494,6 +494,14 @@ typedef struct { /* Semantic region tracking for munmap/mprotect/proc-self-maps */ guest_region_t regions[GUEST_MAX_REGIONS]; int nregions; /* Number of active regions */ + /* Sticky flag set when guest_region_set_prot could not honor a request + * because the region table was full. After this point the tracker no + * longer faithfully reflects PTE state, so the mprotect fast path must + * fall back to unconditional PTE work. Propagated across fork IPC with + * the semantic region snapshot so children inherit the same fast-path + * guard as the parent. + */ + bool regions_tracker_stale; guest_region_t preannounced[GUEST_MAX_PREANNOUNCED]; int npreannounced; /* /proc/self/maps-only shadow regions */ @@ -1174,6 +1182,32 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr); */ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot); +/* Index of the first region whose end is strictly above addr. Earlier + * regions sort entirely below addr (regions[] is start-sorted and + * non-overlapping, so ends are monotonic). Callers use this to skip the + * untouched prefix in O(log n) before a linear walk over the overlap. + */ +int guest_region_first_end_above(const guest_t *g, uint64_t addr); + +/* True if every tracked region overlapping [start, end) already has prot. + * Returns true vacuously when no region overlaps the range, since callers + * use this to decide whether page-table maintenance can be skipped and an + * untracked sub-range has no PTEs of its own to update. + */ +bool guest_region_range_prot_uniform(const guest_t *g, + uint64_t start, + uint64_t end, + int prot); + +/* True if any tracked region overlapping [start, end) is MAP_NORESERVE. + * Callers must run page-table maintenance for such ranges even when prot + * already matches, because lazy materialization may have produced PTEs + * that need re-permissioning. + */ +bool guest_region_range_has_noreserve(const guest_t *g, + uint64_t start, + uint64_t end); + /* Try to materialize a lazy (MAP_NORESERVE) page at the given offset. * Called from the data/instruction abort handler when the faulting address * falls within a noreserve region. Creates page table entries for one 2MiB diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index a36a673..1c31bc0 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -482,6 +482,7 @@ static int fork_ipc_send_backing_fds(int ipc_sock, int fork_ipc_send_process_state(int ipc_sock, const guest_region_t *regions_snapshot, uint32_t num_guest_regions, + bool regions_tracker_stale_snapshot, const guest_region_t *preannounced_snapshot, uint32_t num_preannounced) { @@ -523,8 +524,12 @@ int fork_ipc_send_process_state(int ipc_sock, fork_ipc_write_all(ipc_sock, cmdline, cmdline_len_u32) < 0) return -1; + uint8_t regions_tracker_stale = + regions_tracker_stale_snapshot ? UINT8_C(1) : UINT8_C(0); if (fork_ipc_write_all(ipc_sock, &num_guest_regions, - sizeof(num_guest_regions)) < 0) + sizeof(num_guest_regions)) < 0 || + fork_ipc_write_all(ipc_sock, ®ions_tracker_stale, + sizeof(regions_tracker_stale)) < 0) return -1; if (num_guest_regions > 0 && fork_ipc_write_all(ipc_sock, regions_snapshot, @@ -709,6 +714,12 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) log_error("fork-child: failed to read region count"); return -1; } + uint8_t regions_tracker_stale = 0; + if (fork_ipc_read_all(ipc_fd, ®ions_tracker_stale, + sizeof(regions_tracker_stale)) < 0) { + log_error("fork-child: failed to read region tracker state"); + return -1; + } uint32_t recv_regions = num_guest_regions; if (recv_regions > GUEST_MAX_REGIONS) recv_regions = GUEST_MAX_REGIONS; @@ -728,6 +739,8 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) sizeof(guest_region_t)) < 0) return -1; g->nregions = (int) recv_regions; + g->regions_tracker_stale = + (regions_tracker_stale != 0) || (num_guest_regions > recv_regions); uint32_t num_preannounced = 0; if (fork_ipc_read_all(ipc_fd, &num_preannounced, sizeof(num_preannounced)) < diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index 53233d1..e9de0bc 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -18,11 +18,14 @@ /* Magic values for IPC frame delimiters */ #define IPC_MAGIC_HEADER 0x454C464BU /* "ELFK" */ #define IPC_MAGIC_SENTINEL 0x454C4F4BU /* "ELOK" */ -/* Bumped to 10 when the rosetta placement / kbuf / ttbr1 tuple was added so +/* Bumped to 11 when regions_tracker_stale was added to process state so + * forked children preserve mprotect fast-path correctness. + * + * Bumped to 10 when the rosetta placement / kbuf / ttbr1 tuple was added so * a rosetta-aware child rejects an older parent's header instead of trying * to interpret unknown trailing fields. */ -#define IPC_VERSION 10 +#define IPC_VERSION 11 typedef struct { uint32_t magic; @@ -99,6 +102,7 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g); int fork_ipc_send_process_state(int ipc_sock, const guest_region_t *regions_snapshot, uint32_t num_guest_regions, + bool regions_tracker_stale_snapshot, const guest_region_t *preannounced_snapshot, uint32_t num_preannounced); int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig); diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index c54c837..aef1d24 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -1578,6 +1578,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, * stack limits on worker threads (512KiB default). */ int nregions_snapshot = g->nregions; + bool regions_tracker_stale_snapshot = g->regions_tracker_stale; size_t snap_sz = (size_t) nregions_snapshot * sizeof(guest_region_t); if (nregions_snapshot > 0) { regions_snapshot = malloc(snap_sz); @@ -1599,9 +1600,10 @@ int64_t sys_clone(hv_vcpu_t vcpu, uint32_t num_guest_regions = (uint32_t) nregions_snapshot; uint32_t num_preannounced = (uint32_t) npreannounced_snapshot; - if (fork_ipc_send_process_state(ipc_sock, regions_snapshot, - num_guest_regions, preannounced_snapshot, - num_preannounced) < 0) { + if (fork_ipc_send_process_state( + ipc_sock, regions_snapshot, num_guest_regions, + regions_tracker_stale_snapshot, preannounced_snapshot, + num_preannounced) < 0) { log_error("clone: failed to send process state"); goto fail_snapshot; } diff --git a/src/syscall/mem.c b/src/syscall/mem.c index bcdd48d..415fa2e 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -263,26 +263,6 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary) } } -/* Find the smallest i such that g->regions[i].end > gap_start. All earlier - * regions are entirely below gap_start and would be skipped by the loop body - * with no other effect. Regions are kept sorted by start and non-overlapping - * (sys_mmap MAP_FIXED removes overlaps before insertion), so ends are - * monotonic across the array and binary-searchable. - */ -static int first_region_end_above(const guest_t *g, uint64_t gap_start) -{ - int lo = 0; - int hi = g->nregions; - while (lo < hi) { - int mid = lo + (hi - lo) / 2; - if (g->regions[mid].end <= gap_start) - lo = mid + 1; - else - hi = mid; - } - return lo; -} - static uint64_t find_free_gap_inner(const guest_t *g, uint64_t length, uint64_t min_addr, @@ -303,13 +283,21 @@ static uint64_t find_free_gap_inner(const guest_t *g, * region tail, so the linear walk would otherwise re-scan that whole * prefix on every mmap, addr-hint probe, or hint-miss full scan. */ - for (int i = first_region_end_above(g, gap_start); i < g->nregions; i++) { + for (int i = guest_region_first_end_above(g, gap_start); i < g->nregions; + i++) { /* A region can still slip below gap_start after the ALIGN_UP advance * below skips past a smaller adjacent region; keep the cheap guard. */ if (g->regions[i].end <= gap_start) continue; + /* The search is bounded to [min_addr, max_addr). Once the sorted + * region stream reaches max_addr, later regions cannot affect any + * candidate gap inside the window. + */ + if (g->regions[i].start >= max_addr) + break; + /* If this region starts far enough after gap_start, the allocator found * a gap. Must also verify the gap is within max_addr; regions[] may * contain entries beyond max_addr that could push gap_start past the @@ -3272,6 +3260,16 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length) /* sys_mprotect. */ +static bool mprotect_same_prot_fast_path_safe(int prot) +{ + /* Non-fixed main-arena mmap initially installs RW PTEs for PROT_READ + * mappings, relying on mprotect to tighten them later. Do not trust the + * region tracker alone for read-only same-prot requests. + */ + return prot == LINUX_PROT_NONE || (prot & LINUX_PROT_WRITE) || + (prot & LINUX_PROT_EXEC); +} + int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) { if (addr & 4095) @@ -3291,7 +3289,26 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) (prot & LINUX_PROT_EXEC)) return -LINUX_EINVAL; - guest_region_set_prot(g, addr, mprot_end, prot); + /* Fast path: if the tracker already records this prot for every + * overlapping region and none are MAP_NORESERVE, page tables are + * already in sync and no PTE work is required. The tracker + * update is also a no-op, so skip it. Read-only same-prot + * requests still need PTE work because some mmap paths install + * RW PTEs while recording PROT_READ in the tracker. Disabled once + * regions_tracker_stale is set: prior set_prot calls hit + * GUEST_MAX_REGIONS and left the tracker out of sync with PTEs. + */ + if (mprotect_same_prot_fast_path_safe(prot) && + !g->regions_tracker_stale && + guest_region_range_prot_uniform(g, addr, mprot_end, prot) && + !guest_region_range_has_noreserve(g, addr, mprot_end)) + return 0; + + /* Do PTE work BEFORE updating the tracker. If page-table + * maintenance fails, regions[] still reflects the old prot, so + * a retry will see the mismatch and re-attempt the update + * instead of short-circuiting on stale tracker state. + */ if (prot != LINUX_PROT_NONE) { if (guest_update_perms(g, addr, mprot_end, prot_to_perms(prot)) < 0) @@ -3300,6 +3317,7 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) if (guest_invalidate_ptes(g, addr, mprot_end) < 0) return -LINUX_ENOMEM; } + guest_region_set_prot(g, addr, mprot_end, prot); return 0; } uint64_t mprot_off = addr - g->ipa_base; @@ -3312,16 +3330,26 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) if (guest_range_hits_infra(g, mprot_off, mprot_end)) return -LINUX_EINVAL; - guest_region_set_prot(g, mprot_off, mprot_end, prot); + /* Same fast path / ordering / staleness gate as above. */ + if (mprotect_same_prot_fast_path_safe(prot) && + !g->regions_tracker_stale && + guest_region_range_prot_uniform(g, mprot_off, mprot_end, + prot) && + !guest_region_range_has_noreserve(g, mprot_off, mprot_end)) + return 0; + if (prot != LINUX_PROT_NONE) { int page_perms = prot_to_perms(prot); if (guest_extend_page_tables(g, mprot_off, mprot_end, page_perms) < 0) return -LINUX_ENOMEM; - guest_update_perms(g, mprot_off, mprot_end, page_perms); + if (guest_update_perms(g, mprot_off, mprot_end, page_perms) < 0) + return -LINUX_ENOMEM; } else { - guest_invalidate_ptes(g, mprot_off, mprot_end); + if (guest_invalidate_ptes(g, mprot_off, mprot_end) < 0) + return -LINUX_ENOMEM; } + guest_region_set_prot(g, mprot_off, mprot_end, prot); } } return 0; diff --git a/tests/test-negative.c b/tests/test-negative.c index 2eeeacf..8703bfc 100644 --- a/tests/test-negative.c +++ b/tests/test-negative.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +37,16 @@ int passes = 0, fails = 0; +static sigjmp_buf segv_jmp; +static volatile sig_atomic_t saw_segv; + +static void on_sigsegv(int sig) +{ + (void) sig; + saw_segv = 1; + siglongjmp(segv_jmp, 1); +} + /* Test 1: Invalid FD operations */ static void test_invalid_fd(void) @@ -162,6 +174,40 @@ static void test_mmap_prot(void) EXPECT_TRUE(val == 42, "data not readable after RO mprotect"); munmap(p, 4096); } + + TEST("same-prot mprotect keeps RO mmap unwritable"); + { + void *p = + mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + if (mprotect(p, 4096, PROT_READ) != 0) { + munmap(p, 4096); + FAIL("mprotect failed"); + return; + } + + struct sigaction old_sa; + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = on_sigsegv; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, &old_sa) != 0) { + munmap(p, 4096); + FAIL("sigaction failed"); + return; + } + + saw_segv = 0; + if (sigsetjmp(segv_jmp, 1) == 0) + *(volatile int *) p = 7; + + sigaction(SIGSEGV, &old_sa, NULL); + munmap(p, 4096); + EXPECT_TRUE(saw_segv, "write to read-only mapping succeeded"); + } } /* Test 7: fcntl on invalid FD */