-
Notifications
You must be signed in to change notification settings - Fork 9
Back read-only MAP_SHARED file mappings with MAP_PRIVATE #84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -968,12 +968,14 @@ static int hvf_apply_file_overlay(guest_t *g, | |
| uint64_t ipa, | ||
| uint64_t len, | ||
| int fd, | ||
| off_t file_off); | ||
| off_t file_off, | ||
| int prot); | ||
| static int hvf_apply_file_overlay_quiesced(guest_t *g, | ||
| uint64_t ipa, | ||
| uint64_t len, | ||
| int fd, | ||
| off_t file_off); | ||
| off_t file_off, | ||
| int prot); | ||
| static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len); | ||
|
|
||
| static int read_file_range_to_guest(guest_t *g, | ||
|
|
@@ -1008,10 +1010,12 @@ static int restore_file_overlay_range(guest_t *g, | |
| uint64_t overlay_start, | ||
| uint64_t overlay_end, | ||
| int fd, | ||
| uint64_t file_off) | ||
| uint64_t file_off, | ||
| int prot) | ||
| { | ||
| int err = hvf_apply_file_overlay( | ||
| g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off); | ||
| g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off, | ||
| prot); | ||
| if (err < 0) | ||
| return err; | ||
| mark_overlay_metadata_range(g, start, end, overlay_start, overlay_end); | ||
|
|
@@ -1133,7 +1137,7 @@ static int restore_snapshot_overlays_in_place(guest_t *g, | |
| if (first) { | ||
| int err = restore_file_overlay_range( | ||
| g, snap->start, snap->end, snap->overlay_start, | ||
| snap->overlay_end, snap->backing_fd, snap_file_off); | ||
| snap->overlay_end, snap->backing_fd, snap_file_off, snap->prot); | ||
| if (err < 0) | ||
| return err; | ||
| continue; | ||
|
|
@@ -1240,7 +1244,7 @@ static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n) | |
| return -LINUX_EFAULT; | ||
| int err = restore_file_overlay_range( | ||
| g, snap->start, snap->end, snap->overlay_start, | ||
| snap->overlay_end, r->backing_fd, snap_file_off); | ||
| snap->overlay_end, r->backing_fd, snap_file_off, r->prot); | ||
| if (err < 0) | ||
| return err; | ||
| continue; | ||
|
|
@@ -1429,18 +1433,19 @@ static int hvf_segment_split(guest_t *g, | |
| return 0; | ||
| } | ||
|
|
||
| /* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, | ||
| * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB | ||
| * segment is split out first if it is not already isolated. Caller | ||
| * holds mmap_lock and has already quiesced sibling vCPUs (or has none). | ||
| * The fork pre-snapshot path quiesces siblings before calling this so | ||
| * the overlay install does not trigger a nested quiesce. | ||
| /* Apply a real file overlay at [ipa, ipa+len) backed by [fd, file_off). The | ||
| * IPA range may be sub-2 MiB; the containing 2 MiB segment is split out first | ||
| * if it is not already isolated. Caller holds mmap_lock and has already | ||
| * quiesced sibling vCPUs (or has none). The fork pre-snapshot path quiesces | ||
| * siblings before calling this so the overlay install does not trigger a | ||
| * nested quiesce. | ||
| */ | ||
| static int hvf_apply_file_overlay_quiesced(guest_t *g, | ||
| uint64_t ipa, | ||
| uint64_t len, | ||
| int fd, | ||
| off_t file_off) | ||
| off_t file_off, | ||
| int prot) | ||
| { | ||
| uint64_t aligned_start = ALIGN_2MIB_DOWN(ipa); | ||
| uint64_t aligned_end = ALIGN_2MIB_UP(ipa + len); | ||
|
|
@@ -1459,8 +1464,32 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g, | |
| return -LINUX_EIO; | ||
|
|
||
| void *target = (uint8_t *) g->host_base + ipa; | ||
| void *p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, | ||
| fd, file_off); | ||
| /* hv_vm_map can only grant stage-2 rights (RWX) that the host VA backing's | ||
| * max_protection permits. A MAP_SHARED mapping of an O_RDONLY fd has | ||
| * max_protection READ only, so the following hv_vm_map of the segment | ||
| * fails with HV_ERROR -- this is why the JVM's read-only mmap of | ||
| * lib/modules (an O_RDONLY fd) could never be installed. | ||
| * | ||
| * Pick the host backing from what the fd and the guest actually need: | ||
| * - guest wants PROT_WRITE: MAP_SHARED with PROT_READ|PROT_WRITE so the | ||
| * guest's writes reach the file. On an O_RDONLY fd the host mmap | ||
| * returns EACCES, matching Linux. | ||
| * - guest read-only on a writable fd: MAP_SHARED PROT_READ keeps | ||
| * cross-mapping coherence; max_protection is RWX because the fd is | ||
| * writable, so the segment maps. | ||
| * - guest read-only on an O_RDONLY fd: MAP_PRIVATE PROT_READ. Its | ||
| * max_protection is RWX so the segment maps; the pages still show file | ||
| * content, and the guest's stage-1 tables keep the region read-only so | ||
| * the private copy is never dirtied (no observable MAP_SHARED | ||
| * divergence for a read-only mapping). | ||
| */ | ||
| bool want_write = (prot & LINUX_PROT_WRITE) != 0; | ||
| int acc = fcntl(fd, F_GETFL); | ||
| bool fd_writable = acc >= 0 && ((acc & O_ACCMODE) == O_RDWR || | ||
| (acc & O_ACCMODE) == O_WRONLY); | ||
| int host_prot = want_write ? (PROT_READ | PROT_WRITE) : PROT_READ; | ||
| int share = (want_write || fd_writable) ? MAP_SHARED : MAP_PRIVATE; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The MAP_PRIVATE substitution is sound while the mapping stays read-only, but int fd = open(path, O_RDONLY);
char *p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
mprotect(p, len, PROT_READ | PROT_WRITE); // Linux: EACCES; here: succeeds
*p = 0xff; // Writes to COW copy, not the filewill silently upgrade stage-1 to RW and write into the COW copy. Linux returns The cleanest fix is to track a |
||
| void *p = mmap(target, len, host_prot, share | MAP_FIXED, fd, file_off); | ||
| if (p == MAP_FAILED) { | ||
| int saved = linux_errno(); | ||
| /* The overlay failed; restore the segment to slab backing so the | ||
|
|
@@ -1501,10 +1530,11 @@ static int hvf_apply_file_overlay(guest_t *g, | |
| uint64_t ipa, | ||
| uint64_t len, | ||
| int fd, | ||
| off_t file_off) | ||
| off_t file_off, | ||
| int prot) | ||
| { | ||
| thread_quiesce_siblings(); | ||
| int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off); | ||
| int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off, prot); | ||
| thread_resume_siblings(); | ||
| return err; | ||
| } | ||
|
|
@@ -1976,7 +2006,7 @@ int64_t sys_mmap(guest_t *g, | |
| ALIGN_UP(length, host_page_size_cached()); | ||
| int oerr = | ||
| hvf_apply_file_overlay(g, result_off, fixed_overlay_len, | ||
| host_backing_fd, (off_t) offset); | ||
| host_backing_fd, (off_t) offset, prot); | ||
| if (oerr < 0) { | ||
| int restore_err = restore_region_snapshots( | ||
| g, replaced_snaps, replaced_nsnaps); | ||
|
|
@@ -2285,7 +2315,8 @@ int64_t sys_mmap(guest_t *g, | |
| if (overlay_aligned) { | ||
| uint64_t nf_overlay_len = ALIGN_UP(length, hps); | ||
| int oerr = hvf_apply_file_overlay(g, result_off, nf_overlay_len, | ||
| host_backing_fd, (off_t) offset); | ||
| host_backing_fd, (off_t) offset, | ||
| prot); | ||
| if (oerr < 0) { | ||
| int rollback_err = rollback_fresh_mmap_allocation( | ||
| g, result_off, length, false, 0, 0, saved_mmap_next, | ||
|
|
@@ -2871,7 +2902,7 @@ int64_t sys_mremap(guest_t *g, | |
| int restore_err = restore_file_overlay_range( | ||
| g, old_off, old_off + old_size, source_overlay_start, | ||
| source_overlay_end, track_backing_fd, | ||
| source_overlay_file_off); | ||
| source_overlay_file_off, prot); | ||
| if (restore_err < 0) { | ||
| if (track_backing_fd >= 0) | ||
| close(track_backing_fd); | ||
|
|
@@ -2904,7 +2935,7 @@ int64_t sys_mremap(guest_t *g, | |
| (void) restore_file_overlay_range( | ||
| g, old_off, old_off + old_size, source_overlay_start, | ||
| source_overlay_end, track_backing_fd, | ||
| source_overlay_file_off); | ||
| source_overlay_file_off, prot); | ||
| guest_invalidate_ptes(g, new_off, new_off + new_size); | ||
| if (track_backing_fd >= 0) | ||
| close(track_backing_fd); | ||
|
|
@@ -3761,7 +3792,11 @@ int mmap_fork_prepare_anon_shared(guest_t *g, | |
| return nsnaps; | ||
| } | ||
|
|
||
| int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0); | ||
| /* Anonymous-shared fork backing: the temp file is writable and the | ||
| * region is read-write, so request RW (preserves prior behavior). */ | ||
| int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0, | ||
| LINUX_PROT_READ | | ||
| LINUX_PROT_WRITE); | ||
| if (err < 0) { | ||
| log_warn("fork-prep: overlay install [0x%llx, 0x%llx) failed: %d", | ||
| (unsigned long long) start, | ||
|
|
@@ -3947,7 +3982,7 @@ int mmap_fork_restore_overlays(guest_t *g, | |
| } | ||
|
|
||
| int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd, | ||
| (off_t) file_off); | ||
| (off_t) file_off, r->prot); | ||
| if (err < 0) { | ||
| log_warn( | ||
| "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Treating
fcntlfailure as "not writable" silently picksMAP_PRIVATEfor a valid writable fd whoseF_GETFLtransiently failed. Vanishingly rare on a host fd elfuse already holds viahost_fd_ref_open, but the failure mode (losing MAP_SHARED coherence) is silent rather than surfaced.Two options: return
-linux_errno()onacc < 0, or hoist fd-writability detection up to wherehost_backing_fdis resolved and thread a plainbool fd_writablethrough. The latter also eliminates the per-installfcntlon the hot mmap path.