diff --git a/src/core/guest.c b/src/core/guest.c index 21407d5..d0b939e 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -1737,6 +1737,40 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b) return a->offset + (a->end - a->start) == b->offset; } +/* First region whose start is >= start. regions[] is sorted by start. */ +static int region_lower_bound_start(const guest_t *g, uint64_t start) +{ + int lo = 0; + int hi = g->nregions; + + while (lo < hi) { + int mid = lo + (hi - lo) / 2; + if (g->regions[mid].start < start) + lo = mid + 1; + else + hi = mid; + } + return lo; +} + +/* First region whose end is > addr. See guest.h for the contract; also used + * inside this file to skip the untouched prefix for remove and set_prot. + */ +int guest_region_first_end_above(const guest_t *g, uint64_t addr) +{ + int lo = 0; + int hi = g->nregions; + + while (lo < hi) { + int mid = lo + (hi - lo) / 2; + if (g->regions[mid].end <= addr) + lo = mid + 1; + else + hi = mid; + } + return lo; +} + /* Merge region at index i with its right neighbor (i+1) when their layouts * agree. No-op if i is the last region or layouts differ. */ @@ -1857,12 +1891,10 @@ int guest_region_add_ex_owned_gpa(guest_t *g, return -1; } - /* Find insertion point (keep sorted by start address) */ - int i = g->nregions; - while (i > 0 && g->regions[i - 1].start > start) { - g->regions[i] = g->regions[i - 1]; - i--; - } + /* Find insertion point (keep sorted by start address). */ + int i = region_lower_bound_start(g, start); + memmove(&g->regions[i + 1], &g->regions[i], + (g->nregions - i) * sizeof(guest_region_t)); guest_region_t *r = &g->regions[i]; r->start = start; @@ -1926,104 +1958,111 @@ int guest_preannounce(guest_t *g, void guest_region_remove(guest_t *g, uint64_t start, uint64_t end) { - int i = 0; - while (i < g->nregions) { - guest_region_t *r = &g->regions[i]; + if (end <= start) + return; - /* No overlap: region is entirely before the removal range */ - if (r->end <= start) { - i++; - continue; - } + /* In-place compaction: 'out' is the next output slot, 'in' is the next + * input slot. Since the prefix [0, first) is untouched (it sorts strictly + * before [start, end) by guest_region_first_end_above), both cursors begin + * at 'first'. The non-overlap invariant guarantees out <= in throughout the + * loop, so writes at g->regions[out] never clobber slots not yet read. + */ + int first = guest_region_first_end_above(g, start); + int out = first; + int in = first; - /* No overlap: region is entirely after the removal range */ + while (in < g->nregions) { + guest_region_t *r = &g->regions[in]; if (r->start >= end) - break; /* sorted, so done */ - - /* Full containment: remove the entire region */ - if (r->start >= start && r->end <= end) { - if (r->backing_fd >= 0) - close(r->backing_fd); - memmove(&g->regions[i], &g->regions[i + 1], - (g->nregions - i - 1) * sizeof(guest_region_t)); - g->nregions--; - continue; /* do not increment i */ - } - - /* Partial overlap: removal range cuts the beginning */ - if (r->start >= start && r->end > end) { - uint64_t trimmed = end - r->start; - r->offset += trimmed; - r->gpa_base += trimmed; - r->start = end; - guest_region_clip_overlay(r); - i++; - continue; - } + break; - /* Partial overlap: removal range cuts the end */ - if (r->start < start && r->end > start && r->end <= end) { - r->end = start; - guest_region_clip_overlay(r); - i++; - continue; - } + bool keep_left = r->start < start; + bool keep_right = r->end > end; - /* Split: removal range is entirely inside the region */ - if (r->start < start && r->end > end) { - /* Need to split into two regions: [r->start, start) and [end, - * r->end) - */ + /* Interior split: removal range lies strictly inside *r, producing + * two output entries from one input slot. This is the only growth + * path; handle it explicitly so the untouched suffix is shifted out + * of the way before either half is written. After this branch no + * further input regions can overlap [start, end), so the loop is + * done. + */ + if (keep_left && keep_right) { if (g->nregions >= GUEST_MAX_REGIONS) { - /* Region table is full; trim to [r->start, start) and drop - * the tail. The tail [end, r->end) becomes untracked in - * /proc/self/maps but remains mapped in page tables. + /* Table full: drop the tail [end, r->end) and fall through + * to the simple "trim end" treatment of *r. */ log_error( "guest: region table full, " "munmap split drops tail [0x%llx-0x%llx)", (unsigned long long) end, (unsigned long long) r->end); - r->end = start; - i++; - continue; - } - /* Make room for the new region after i */ - memmove(&g->regions[i + 2], &g->regions[i + 1], - (g->nregions - i - 1) * sizeof(guest_region_t)); - - /* Right half: [end, old_end) */ - guest_region_t *right = &g->regions[i + 1]; - *right = *r; /* Copy attributes */ - right->offset += (end - r->start); - right->gpa_base += (end - r->start); - right->start = end; - if (r->backing_fd >= 0) { - /* A dup failure leaves backing_fd=-1, silently converting this - * half to anonymous semantics (msync and MADV_DONTNEED skip - * regions with backing_fd<0). Propagating the error would - * require making all region split callers (mprotect, munmap) - * fallible. - */ - right->backing_fd = dup(r->backing_fd); - if (right->backing_fd < 0) - log_error( - "guest: dup() failed for region split " - "backing fd %d: %s", - r->backing_fd, strerror(errno)); - } + keep_right = false; + } else { + guest_region_t orig = *r; + int suffix_count = g->nregions - in - 1; + if (suffix_count > 0) + memmove(&g->regions[out + 2], &g->regions[in + 1], + suffix_count * sizeof(guest_region_t)); + + guest_region_t left = orig; + left.end = start; + guest_region_clip_overlay(&left); + g->regions[out] = left; + + guest_region_t right = orig; + uint64_t trimmed = end - orig.start; + right.offset += trimmed; + right.gpa_base += trimmed; + right.start = end; + if (orig.backing_fd >= 0) { + right.backing_fd = dup(orig.backing_fd); + if (right.backing_fd < 0) + log_error( + "guest: dup() failed for region split " + "backing fd %d: %s", + orig.backing_fd, strerror(errno)); + } + guest_region_clip_overlay(&right); + g->regions[out + 1] = right; - /* Left half keeps the original entry and shortens its end. */ - r->end = start; - guest_region_clip_overlay(r); - guest_region_clip_overlay(right); + g->nregions = out + 2 + suffix_count; + return; + } + } - g->nregions++; - i += 2; /* skip both halves */ + if (!keep_left && !keep_right) { + if (r->backing_fd >= 0) + close(r->backing_fd); + in++; continue; } - i++; + /* Trim-only paths: either keep_left xor keep_right is true. Build the + * surviving half from the source slot, then publish it to g->regions + * [out]. The original backing_fd transfers to whichever half survives; + * no dup is needed because only one half remains. + */ + guest_region_t survivor = *r; + if (keep_left) { + survivor.end = start; + } else { + uint64_t trimmed = end - r->start; + survivor.offset += trimmed; + survivor.gpa_base += trimmed; + survivor.start = end; + } + guest_region_clip_overlay(&survivor); + g->regions[out++] = survivor; + in++; } + + /* Append the unread suffix (regions whose start >= end) after the + * compacted overlap area, shifting only if compaction left a hole. + */ + int tail = g->nregions - in; + if (tail > 0 && out != in) + memmove(&g->regions[out], &g->regions[in], + tail * sizeof(guest_region_t)); + g->nregions = out + tail; } const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr) @@ -2043,6 +2082,35 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr) return NULL; } +bool guest_region_range_prot_uniform(const guest_t *g, + uint64_t start, + uint64_t end, + int prot) +{ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->prot != prot) + return false; + } + return true; +} + +bool guest_region_range_has_noreserve(const guest_t *g, + uint64_t start, + uint64_t end) +{ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->noreserve) + return true; + } + return false; +} + void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) { /* Walk regions overlapping [start, end), split at boundaries, update prot. @@ -2051,20 +2119,28 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) */ int first_modified = -1, last_modified = -1; - for (int i = 0; i < g->nregions; i++) { + /* The prefix skip ensures regions[i].end > start for i >= first; the + * non-overlap invariant carries it through all later iterations. + */ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { guest_region_t *r = &g->regions[i]; - if (r->end <= start) - continue; if (r->start >= end) break; /* If region extends before start, split at start */ if (r->start < start) { if (g->nregions >= GUEST_MAX_REGIONS) { + /* The region keeps its old prot in the tracker, but PTEs for + * [start, r->end) have already been updated. Mark the tracker + * permanently stale so the mprotect fast path falls back to + * unconditional PTE work and cannot be fooled by a tracker + * that lags actual PTE state. + */ log_error( "guest: region table full, " "mprotect split skipped at 0x%llx", (unsigned long long) start); + g->regions_tracker_stale = true; continue; } memmove(&g->regions[i + 1], &g->regions[i], @@ -2094,8 +2170,10 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) /* If region extends past end, split at end */ if (r->end > end) { if (g->nregions >= GUEST_MAX_REGIONS) { - /* Split failure applies prot to the whole region. - * The tail [end, r->end) gets new prot too. + /* Over-apply prot to the whole region: the tail [end, r->end) + * now claims new prot in the tracker even though PTE work + * did not cover it. Mark the tracker stale so the mprotect + * fast path stops trusting prot uniformity. */ log_error( "guest: region table full, " @@ -2103,6 +2181,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) "(region [0x%llx-0x%llx) gets prot %d entirely)", (unsigned long long) end, (unsigned long long) r->start, (unsigned long long) r->end, prot); + g->regions_tracker_stale = true; r->prot = prot; if (first_modified < 0) first_modified = i; diff --git a/src/core/guest.h b/src/core/guest.h index 9e40ae3..23cbe19 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -472,6 +472,15 @@ typedef struct { /* Semantic region tracking for munmap/mprotect/proc-self-maps */ guest_region_t regions[GUEST_MAX_REGIONS]; int nregions; /* Number of active regions */ + /* Sticky flag set when guest_region_set_prot could not honor a request + * because the region table was full. After this point the tracker no + * longer faithfully reflects PTE state, so the mprotect fast path must + * fall back to unconditional PTE work. Not propagated across fork IPC; + * a child that inherits a stale tracker re-arms the flag the next time + * it hits the same condition, so the only window of incorrect skipping + * is the very first matching mprotect after such a fork. + */ + bool regions_tracker_stale; guest_region_t preannounced[GUEST_MAX_PREANNOUNCED]; int npreannounced; /* /proc/self/maps-only shadow regions */ @@ -1152,6 +1161,32 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr); */ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot); +/* Index of the first region whose end is strictly above addr. Earlier + * regions sort entirely below addr (regions[] is start-sorted and + * non-overlapping, so ends are monotonic). Callers use this to skip the + * untouched prefix in O(log n) before a linear walk over the overlap. + */ +int guest_region_first_end_above(const guest_t *g, uint64_t addr); + +/* True if every tracked region overlapping [start, end) already has prot. + * Returns true vacuously when no region overlaps the range, since callers + * use this to decide whether page-table maintenance can be skipped and an + * untracked sub-range has no PTEs of its own to update. + */ +bool guest_region_range_prot_uniform(const guest_t *g, + uint64_t start, + uint64_t end, + int prot); + +/* True if any tracked region overlapping [start, end) is MAP_NORESERVE. + * Callers must run page-table maintenance for such ranges even when prot + * already matches, because lazy materialization may have produced PTEs + * that need re-permissioning. + */ +bool guest_region_range_has_noreserve(const guest_t *g, + uint64_t start, + uint64_t end); + /* Try to materialize a lazy (MAP_NORESERVE) page at the given offset. * Called from the data/instruction abort handler when the faulting address * falls within a noreserve region. Creates page table entries for one 2MiB diff --git a/src/syscall/mem.c b/src/syscall/mem.c index bcdd48d..415fa2e 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -263,26 +263,6 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary) } } -/* Find the smallest i such that g->regions[i].end > gap_start. All earlier - * regions are entirely below gap_start and would be skipped by the loop body - * with no other effect. Regions are kept sorted by start and non-overlapping - * (sys_mmap MAP_FIXED removes overlaps before insertion), so ends are - * monotonic across the array and binary-searchable. - */ -static int first_region_end_above(const guest_t *g, uint64_t gap_start) -{ - int lo = 0; - int hi = g->nregions; - while (lo < hi) { - int mid = lo + (hi - lo) / 2; - if (g->regions[mid].end <= gap_start) - lo = mid + 1; - else - hi = mid; - } - return lo; -} - static uint64_t find_free_gap_inner(const guest_t *g, uint64_t length, uint64_t min_addr, @@ -303,13 +283,21 @@ static uint64_t find_free_gap_inner(const guest_t *g, * region tail, so the linear walk would otherwise re-scan that whole * prefix on every mmap, addr-hint probe, or hint-miss full scan. */ - for (int i = first_region_end_above(g, gap_start); i < g->nregions; i++) { + for (int i = guest_region_first_end_above(g, gap_start); i < g->nregions; + i++) { /* A region can still slip below gap_start after the ALIGN_UP advance * below skips past a smaller adjacent region; keep the cheap guard. */ if (g->regions[i].end <= gap_start) continue; + /* The search is bounded to [min_addr, max_addr). Once the sorted + * region stream reaches max_addr, later regions cannot affect any + * candidate gap inside the window. + */ + if (g->regions[i].start >= max_addr) + break; + /* If this region starts far enough after gap_start, the allocator found * a gap. Must also verify the gap is within max_addr; regions[] may * contain entries beyond max_addr that could push gap_start past the @@ -3272,6 +3260,16 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length) /* sys_mprotect. */ +static bool mprotect_same_prot_fast_path_safe(int prot) +{ + /* Non-fixed main-arena mmap initially installs RW PTEs for PROT_READ + * mappings, relying on mprotect to tighten them later. Do not trust the + * region tracker alone for read-only same-prot requests. + */ + return prot == LINUX_PROT_NONE || (prot & LINUX_PROT_WRITE) || + (prot & LINUX_PROT_EXEC); +} + int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) { if (addr & 4095) @@ -3291,7 +3289,26 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) (prot & LINUX_PROT_EXEC)) return -LINUX_EINVAL; - guest_region_set_prot(g, addr, mprot_end, prot); + /* Fast path: if the tracker already records this prot for every + * overlapping region and none are MAP_NORESERVE, page tables are + * already in sync and no PTE work is required. The tracker + * update is also a no-op, so skip it. Read-only same-prot + * requests still need PTE work because some mmap paths install + * RW PTEs while recording PROT_READ in the tracker. Disabled once + * regions_tracker_stale is set: prior set_prot calls hit + * GUEST_MAX_REGIONS and left the tracker out of sync with PTEs. + */ + if (mprotect_same_prot_fast_path_safe(prot) && + !g->regions_tracker_stale && + guest_region_range_prot_uniform(g, addr, mprot_end, prot) && + !guest_region_range_has_noreserve(g, addr, mprot_end)) + return 0; + + /* Do PTE work BEFORE updating the tracker. If page-table + * maintenance fails, regions[] still reflects the old prot, so + * a retry will see the mismatch and re-attempt the update + * instead of short-circuiting on stale tracker state. + */ if (prot != LINUX_PROT_NONE) { if (guest_update_perms(g, addr, mprot_end, prot_to_perms(prot)) < 0) @@ -3300,6 +3317,7 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) if (guest_invalidate_ptes(g, addr, mprot_end) < 0) return -LINUX_ENOMEM; } + guest_region_set_prot(g, addr, mprot_end, prot); return 0; } uint64_t mprot_off = addr - g->ipa_base; @@ -3312,16 +3330,26 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) if (guest_range_hits_infra(g, mprot_off, mprot_end)) return -LINUX_EINVAL; - guest_region_set_prot(g, mprot_off, mprot_end, prot); + /* Same fast path / ordering / staleness gate as above. */ + if (mprotect_same_prot_fast_path_safe(prot) && + !g->regions_tracker_stale && + guest_region_range_prot_uniform(g, mprot_off, mprot_end, + prot) && + !guest_region_range_has_noreserve(g, mprot_off, mprot_end)) + return 0; + if (prot != LINUX_PROT_NONE) { int page_perms = prot_to_perms(prot); if (guest_extend_page_tables(g, mprot_off, mprot_end, page_perms) < 0) return -LINUX_ENOMEM; - guest_update_perms(g, mprot_off, mprot_end, page_perms); + if (guest_update_perms(g, mprot_off, mprot_end, page_perms) < 0) + return -LINUX_ENOMEM; } else { - guest_invalidate_ptes(g, mprot_off, mprot_end); + if (guest_invalidate_ptes(g, mprot_off, mprot_end) < 0) + return -LINUX_ENOMEM; } + guest_region_set_prot(g, mprot_off, mprot_end, prot); } } return 0; diff --git a/tests/test-negative.c b/tests/test-negative.c index 2eeeacf..8703bfc 100644 --- a/tests/test-negative.c +++ b/tests/test-negative.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +37,16 @@ int passes = 0, fails = 0; +static sigjmp_buf segv_jmp; +static volatile sig_atomic_t saw_segv; + +static void on_sigsegv(int sig) +{ + (void) sig; + saw_segv = 1; + siglongjmp(segv_jmp, 1); +} + /* Test 1: Invalid FD operations */ static void test_invalid_fd(void) @@ -162,6 +174,40 @@ static void test_mmap_prot(void) EXPECT_TRUE(val == 42, "data not readable after RO mprotect"); munmap(p, 4096); } + + TEST("same-prot mprotect keeps RO mmap unwritable"); + { + void *p = + mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + if (mprotect(p, 4096, PROT_READ) != 0) { + munmap(p, 4096); + FAIL("mprotect failed"); + return; + } + + struct sigaction old_sa; + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = on_sigsegv; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, &old_sa) != 0) { + munmap(p, 4096); + FAIL("sigaction failed"); + return; + } + + saw_segv = 0; + if (sigsetjmp(segv_jmp, 1) == 0) + *(volatile int *) p = 7; + + sigaction(SIGSEGV, &old_sa, NULL); + munmap(p, 4096); + EXPECT_TRUE(saw_segv, "write to read-only mapping succeeded"); + } } /* Test 7: fcntl on invalid FD */