Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 58 additions & 23 deletions src/syscall/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -968,12 +968,14 @@ static int hvf_apply_file_overlay(guest_t *g,
uint64_t ipa,
uint64_t len,
int fd,
off_t file_off);
off_t file_off,
int prot);
static int hvf_apply_file_overlay_quiesced(guest_t *g,
uint64_t ipa,
uint64_t len,
int fd,
off_t file_off);
off_t file_off,
int prot);
static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len);

static int read_file_range_to_guest(guest_t *g,
Expand Down Expand Up @@ -1008,10 +1010,12 @@ static int restore_file_overlay_range(guest_t *g,
uint64_t overlay_start,
uint64_t overlay_end,
int fd,
uint64_t file_off)
uint64_t file_off,
int prot)
{
int err = hvf_apply_file_overlay(
g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off);
g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off,
prot);
if (err < 0)
return err;
mark_overlay_metadata_range(g, start, end, overlay_start, overlay_end);
Expand Down Expand Up @@ -1133,7 +1137,7 @@ static int restore_snapshot_overlays_in_place(guest_t *g,
if (first) {
int err = restore_file_overlay_range(
g, snap->start, snap->end, snap->overlay_start,
snap->overlay_end, snap->backing_fd, snap_file_off);
snap->overlay_end, snap->backing_fd, snap_file_off, snap->prot);
if (err < 0)
return err;
continue;
Expand Down Expand Up @@ -1240,7 +1244,7 @@ static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n)
return -LINUX_EFAULT;
int err = restore_file_overlay_range(
g, snap->start, snap->end, snap->overlay_start,
snap->overlay_end, r->backing_fd, snap_file_off);
snap->overlay_end, r->backing_fd, snap_file_off, r->prot);
if (err < 0)
return err;
continue;
Expand Down Expand Up @@ -1429,18 +1433,19 @@ static int hvf_segment_split(guest_t *g,
return 0;
}

/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd,
* file_off). The IPA range may be sub-2 MiB; the containing 2 MiB
* segment is split out first if it is not already isolated. Caller
* holds mmap_lock and has already quiesced sibling vCPUs (or has none).
* The fork pre-snapshot path quiesces siblings before calling this so
* the overlay install does not trigger a nested quiesce.
/* Apply a real file overlay at [ipa, ipa+len) backed by [fd, file_off). The
* IPA range may be sub-2 MiB; the containing 2 MiB segment is split out first
* if it is not already isolated. Caller holds mmap_lock and has already
* quiesced sibling vCPUs (or has none). The fork pre-snapshot path quiesces
* siblings before calling this so the overlay install does not trigger a
* nested quiesce.
*/
static int hvf_apply_file_overlay_quiesced(guest_t *g,
uint64_t ipa,
uint64_t len,
int fd,
off_t file_off)
off_t file_off,
int prot)
{
uint64_t aligned_start = ALIGN_2MIB_DOWN(ipa);
uint64_t aligned_end = ALIGN_2MIB_UP(ipa + len);
Expand All @@ -1459,8 +1464,32 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g,
return -LINUX_EIO;

void *target = (uint8_t *) g->host_base + ipa;
void *p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
fd, file_off);
/* hv_vm_map can only grant stage-2 rights (RWX) that the host VA backing's
* max_protection permits. A MAP_SHARED mapping of an O_RDONLY fd has
* max_protection READ only, so the following hv_vm_map of the segment
* fails with HV_ERROR -- this is why the JVM's read-only mmap of
* lib/modules (an O_RDONLY fd) could never be installed.
*
* Pick the host backing from what the fd and the guest actually need:
* - guest wants PROT_WRITE: MAP_SHARED with PROT_READ|PROT_WRITE so the
* guest's writes reach the file. On an O_RDONLY fd the host mmap
* returns EACCES, matching Linux.
* - guest read-only on a writable fd: MAP_SHARED PROT_READ keeps
* cross-mapping coherence; max_protection is RWX because the fd is
* writable, so the segment maps.
* - guest read-only on an O_RDONLY fd: MAP_PRIVATE PROT_READ. Its
* max_protection is RWX so the segment maps; the pages still show file
* content, and the guest's stage-1 tables keep the region read-only so
* the private copy is never dirtied (no observable MAP_SHARED
* divergence for a read-only mapping).
*/
bool want_write = (prot & LINUX_PROT_WRITE) != 0;
int acc = fcntl(fd, F_GETFL);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Treating fcntl failure as "not writable" silently picks MAP_PRIVATE for a valid writable fd whose F_GETFL transiently failed. Vanishingly rare on a host fd elfuse already holds via host_fd_ref_open, but the failure mode (losing MAP_SHARED coherence) is silent rather than surfaced.

Two options: return -linux_errno() on acc < 0, or hoist fd-writability detection up to where host_backing_fd is resolved and thread a plain bool fd_writable through. The latter also eliminates the per-install fcntl on the hot mmap path.

bool fd_writable = acc >= 0 && ((acc & O_ACCMODE) == O_RDWR ||
(acc & O_ACCMODE) == O_WRONLY);
int host_prot = want_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
int share = (want_write || fd_writable) ? MAP_SHARED : MAP_PRIVATE;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The MAP_PRIVATE substitution is sound while the mapping stays read-only, but sys_mprotect at mem.c:3275 doesn't know about the backing decision -- it just calls guest_region_set_prot + guest_update_perms(prot_to_perms(prot)). A guest that does:

int fd = open(path, O_RDONLY);
char *p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
mprotect(p, len, PROT_READ | PROT_WRITE);   // Linux: EACCES; here: succeeds
*p = 0xff;                                   // Writes to COW copy, not the file

will silently upgrade stage-1 to RW and write into the COW copy. Linux returns EACCES because the mapping remembers max_prot=READ from the O_RDONLY fd. Before this PR the upgrade was unreachable (the initial mmap failed); the MAP_PRIVATE fallback exposes it.

The cleanest fix is to track a host_backing_kind (or max_prot) on guest_region_t, set it when this branch is taken, carry it through snapshots/splits/merges/mremap, and have sys_mprotect return -LINUX_EACCES when PROT_WRITE would exceed it. That also closes the downstream gap where sys_msync at mem.c:3560 skips its pwrite-refresh path for overlay_active=true regions on the assumption "the page cache already keeps them coherent with the file" -- false for a MAP_PRIVATE backing.

void *p = mmap(target, len, host_prot, share | MAP_FIXED, fd, file_off);
if (p == MAP_FAILED) {
int saved = linux_errno();
/* The overlay failed; restore the segment to slab backing so the
Expand Down Expand Up @@ -1501,10 +1530,11 @@ static int hvf_apply_file_overlay(guest_t *g,
uint64_t ipa,
uint64_t len,
int fd,
off_t file_off)
off_t file_off,
int prot)
{
thread_quiesce_siblings();
int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off);
int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off, prot);
thread_resume_siblings();
return err;
}
Expand Down Expand Up @@ -1976,7 +2006,7 @@ int64_t sys_mmap(guest_t *g,
ALIGN_UP(length, host_page_size_cached());
int oerr =
hvf_apply_file_overlay(g, result_off, fixed_overlay_len,
host_backing_fd, (off_t) offset);
host_backing_fd, (off_t) offset, prot);
if (oerr < 0) {
int restore_err = restore_region_snapshots(
g, replaced_snaps, replaced_nsnaps);
Expand Down Expand Up @@ -2285,7 +2315,8 @@ int64_t sys_mmap(guest_t *g,
if (overlay_aligned) {
uint64_t nf_overlay_len = ALIGN_UP(length, hps);
int oerr = hvf_apply_file_overlay(g, result_off, nf_overlay_len,
host_backing_fd, (off_t) offset);
host_backing_fd, (off_t) offset,
prot);
if (oerr < 0) {
int rollback_err = rollback_fresh_mmap_allocation(
g, result_off, length, false, 0, 0, saved_mmap_next,
Expand Down Expand Up @@ -2871,7 +2902,7 @@ int64_t sys_mremap(guest_t *g,
int restore_err = restore_file_overlay_range(
g, old_off, old_off + old_size, source_overlay_start,
source_overlay_end, track_backing_fd,
source_overlay_file_off);
source_overlay_file_off, prot);
if (restore_err < 0) {
if (track_backing_fd >= 0)
close(track_backing_fd);
Expand Down Expand Up @@ -2904,7 +2935,7 @@ int64_t sys_mremap(guest_t *g,
(void) restore_file_overlay_range(
g, old_off, old_off + old_size, source_overlay_start,
source_overlay_end, track_backing_fd,
source_overlay_file_off);
source_overlay_file_off, prot);
guest_invalidate_ptes(g, new_off, new_off + new_size);
if (track_backing_fd >= 0)
close(track_backing_fd);
Expand Down Expand Up @@ -3761,7 +3792,11 @@ int mmap_fork_prepare_anon_shared(guest_t *g,
return nsnaps;
}

int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0);
/* Anonymous-shared fork backing: the temp file is writable and the
* region is read-write, so request RW (preserves prior behavior). */
int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0,
LINUX_PROT_READ |
LINUX_PROT_WRITE);
if (err < 0) {
log_warn("fork-prep: overlay install [0x%llx, 0x%llx) failed: %d",
(unsigned long long) start,
Expand Down Expand Up @@ -3947,7 +3982,7 @@ int mmap_fork_restore_overlays(guest_t *g,
}

int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd,
(off_t) file_off);
(off_t) file_off, r->prot);
if (err < 0) {
log_warn(
"fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d",
Expand Down
3 changes: 3 additions & 0 deletions tests/manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ test-shim-cred-race
[section] msync MAP_SHARED tests
test-msync

[section] Read-only MAP_SHARED file overlay tests
test-mmap-shared-ro

[section] Cross-fork MAP_SHARED coherence tests
test-cross-fork-mapshared # diff=skip

Expand Down
Loading
Loading