diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 54f286f..f141330 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -122,7 +122,7 @@ jobs: GNU_OBJCOPY: /opt/homebrew/opt/binutils/bin/objcopy HOMEBREW_NO_INSTALL_CLEANUP: 1 HOMEBREW_NO_AUTO_UPDATE: 1 - BREW_PKGS: binutils + BREW_PKGS: binutils zstd cjson steps: - name: Checkout uses: actions/checkout@v6 @@ -181,7 +181,7 @@ jobs: HOMEBREW_NO_AUTO_UPDATE: 1 # binutils is needed because make lint depends on the shim_blob.h # generated by the assembly + objcopy pipeline. - BREW_PKGS: binutils llvm + BREW_PKGS: binutils llvm zstd cjson CLANG_TIDY: /opt/homebrew/opt/llvm/bin/clang-tidy steps: - name: Checkout @@ -220,7 +220,7 @@ jobs: GNU_OBJCOPY: /opt/homebrew/opt/binutils/bin/objcopy HOMEBREW_NO_INSTALL_CLEANUP: 1 HOMEBREW_NO_AUTO_UPDATE: 1 - BREW_PKGS: binutils llvm + BREW_PKGS: binutils llvm zstd cjson LLVM_BIN: /opt/homebrew/opt/llvm/bin steps: - name: Checkout diff --git a/.gitignore b/.gitignore index a01f895..88f02e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ build/ archive/ -externals/ +# externals/ holds downloaded fixtures (kernel, rootfs, packages) that are +# fetched on demand; tracking them in git would balloon the repo. Nothing +# under externals/ is vendored now -- cJSON and zstd are both consumed as +# system libraries via pkg-config. 
+externals/* lib/modules/ *.o *.bin diff --git a/Makefile b/Makefile index 3262c84..da92684 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,7 @@ SRCS := \ core/shim-globals.c \ core/bootstrap.c \ core/rosetta.c \ + core/launch.c \ core/sysroot.c \ runtime/thread.c \ runtime/futex.c \ @@ -67,15 +68,66 @@ SRCS := \ debug/gdbstub-reg.c \ debug/gdbstub-rsp.c \ debug/log.c \ - debug/syscall-hist.c + debug/syscall-hist.c \ + oci/ref.c \ + oci/cli.c \ + oci/digest.c \ + oci/digest-set.c \ + oci/blob-store.c \ + oci/media-type.c \ + oci/manifest.c \ + oci/fetch.c \ + oci/store.c \ + oci/pull.c \ + oci/inspect.c \ + oci/dedup-metrics.c \ + oci/status.c \ + oci/policy.c \ + oci/tar.c \ + oci/decompress.c \ + oci/layer-meta.c \ + oci/layer-apply.c \ + oci/origin-meta.c \ + oci/volume.c \ + oci/volume-list.c \ + oci/clone-rootfs.c \ + oci/unpack.c \ + oci/rebuild-cache.c \ + oci/runspec.c \ + oci/user-lookup.c \ + oci/path-resolve.c \ + oci/runtime-files.c \ + oci/run.c SRCS := $(addprefix src/,$(SRCS)) OBJS := $(patsubst src/%.c,$(BUILD_DIR)/%.o,$(SRCS)) +# cJSON (JSON parser for OCI manifests / config / policy) is consumed as a +# system shared library via pkg-config, mirroring zstd / zlib / libcurl. +# Install with `brew install cjson` (macOS) or `apt-get install libcjson-dev` +# (Linux). It is used across the OCI subsystem, so the include path goes into +# the global CFLAGS rather than a per-translation-unit override. +CJSON_CFLAGS := $(shell pkg-config --cflags libcjson) +CJSON_LIBS := $(shell pkg-config --libs libcjson) +CFLAGS += $(CJSON_CFLAGS) + +# zstd (decode path for OCI layers) is consumed as a system shared library, +# mirroring the existing -lz / -lcurl dependencies. Phase 2 OCI layer unpack +# decompresses zstd-compressed layer media types; only src/oci/decompress.c +# includes <zstd.h>, and it touches only the stable public API.
Install with +# `brew install zstd` (macOS) or `apt-get install libzstd-dev` (Linux); the +# include/library paths are resolved through pkg-config. +ZSTD_CFLAGS := $(shell pkg-config --cflags libzstd) +ZSTD_LIBS := $(shell pkg-config --libs libzstd) + DISPATCH_MANIFEST := src/syscall/dispatch.tbl DISPATCH_GENERATOR := scripts/gen-syscall-dispatch.py DISPATCH_HEADER := $(BUILD_DIR)/dispatch.h -HVF_LDFLAGS := -framework Hypervisor -arch arm64 +# -lz: gzip-compressed OCI layers route through zlib (system library). +# -lcurl: HTTPS fetch for the Phase 1 oci pull path. +# $(ZSTD_LIBS): zstd-compressed OCI layers, decoded via the system libzstd. +# $(CJSON_LIBS): JSON parsing for OCI manifests/config/policy (system cJSON). +HVF_LDFLAGS := -framework Hypervisor -arch arm64 -lcurl -lz $(ZSTD_LIBS) $(CJSON_LIBS) # Generated headers under build/ that must exist before compiling sources that # include them. @@ -151,6 +203,215 @@ $(BUILD_DIR)/test-proctitle-host: $(BUILD_DIR)/test-proctitle-host.o \ @echo " LD $@" $(Q)$(CC) $(CFLAGS) -o $@ $^ +## Build the OCI reference parser unit test (native macOS binary). +## Pure C, no HVF, no codesign required. +$(BUILD_DIR)/test-oci-ref: $(BUILD_DIR)/test-oci-ref.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI digest unit test (native macOS binary). Pure C, no HVF. +$(BUILD_DIR)/test-oci-digest: $(BUILD_DIR)/test-oci-digest.o $(BUILD_DIR)/oci/digest.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI blob store unit test (native macOS binary). Pure C, no HVF. +$(BUILD_DIR)/test-oci-blob-store: $(BUILD_DIR)/test-oci-blob-store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI manifest / index / config parser unit test (native, no HVF). 
+$(BUILD_DIR)/test-oci-manifest: $(BUILD_DIR)/test-oci-manifest.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/digest.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the shared OCI mock HTTPS server helper. tests/lib/oci-mock.{c,h} +## terminates TLS via libssl from brew openssl@3; both the fetch and pull +## suites link against the same compiled object to avoid duplicating ~400 LOC +## of scaffolding in their own translation units. +$(BUILD_DIR)/lib/oci-mock.o: CFLAGS += $(OPENSSL_CFLAGS) + +## Build the OCI fetch (libcurl) unit test (native macOS, no HVF). Pulls in +## blob-store + digest + manifest models + cJSON; links against system libcurl +## and the platform pthread runtime for the in-process mock HTTP server. The +## test mock terminates TLS using libssl from brew openssl@3 so the ca_file +## negative cases exercise a real certificate verification path. +$(BUILD_DIR)/test-oci-fetch.o: CFLAGS += $(OPENSSL_CFLAGS) +$(BUILD_DIR)/test-oci-fetch: $(BUILD_DIR)/test-oci-fetch.o $(BUILD_DIR)/lib/oci-mock.o $(BUILD_DIR)/oci/fetch.o $(BUILD_DIR)/oci/policy.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ -lcurl -lpthread $(OPENSSL_LDFLAGS) $(CJSON_LIBS) + +## Build the OCI local store unit test (native macOS, no HVF). Pure C; links +## against the store wrapper plus its blob-store, digest, and cJSON deps. +## cJSON is required because store.c now reads / writes index.json. 
+$(BUILD_DIR)/test-oci-store: $(BUILD_DIR)/test-oci-store.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI pull pipeline unit test (native macOS, no HVF). Shares the +## TLS-terminating mock server with test-oci-fetch via tests/lib/oci-mock. +$(BUILD_DIR)/test-oci-pull.o: CFLAGS += $(OPENSSL_CFLAGS) +$(BUILD_DIR)/test-oci-pull: $(BUILD_DIR)/test-oci-pull.o $(BUILD_DIR)/lib/oci-mock.o $(BUILD_DIR)/oci/pull.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/fetch.o $(BUILD_DIR)/oci/policy.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ -lcurl -lpthread $(OPENSSL_LDFLAGS) $(CJSON_LIBS) + +## Build the OCI inspect renderer unit test (native macOS, no HVF). Pure +## offline: no fetcher, no mock server, no libcurl. Pre-populates the store +## via oci_blob_store_put_bytes + oci_store_put_ref. +$(BUILD_DIR)/test-oci-inspect: $(BUILD_DIR)/test-oci-inspect.o $(BUILD_DIR)/oci/inspect.o $(BUILD_DIR)/oci/dedup-metrics.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI cross-image dedup metrics unit test (native macOS, no HVF). 
+## Drives oci_dedup_metrics_compute against scratch stores hand-populated +## via oci_blob_store_put_bytes + oci_store_put_ref. Same dependency set +## as test-oci-store, plus oci/dedup-metrics.o. +$(BUILD_DIR)/test-oci-dedup-metrics: $(BUILD_DIR)/test-oci-dedup-metrics.o $(BUILD_DIR)/oci/dedup-metrics.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI rebuild-cache unit test (native macOS, no HVF). Drives +## oci_rebuild_cache against scratch stores hand-populated via oci_origin_write +## into a fixture <store>/images/sha256-<hex>/ tree, then asserts that +## <store>/layers/stacks/sha256/<hex>/ entries are created (commit) or left +## absent (dry-run). Same dependency set as test-oci-store plus +## oci/rebuild-cache.o. +$(BUILD_DIR)/test-oci-rebuild-cache: $(BUILD_DIR)/test-oci-rebuild-cache.o $(BUILD_DIR)/oci/rebuild-cache.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI store-wide status unit test (native macOS, no HVF). Drives +## oci_status_compute against scratch stores hand-populated via +## stage_image / oci_origin_write fixture helpers and asserts the aggregated +## struct fields (pin entries, unpacked entries, reachable + populated +## ratios, store totals). Same dependency set as test-oci-store plus +## oci/status.o.
+$(BUILD_DIR)/test-oci-status: $(BUILD_DIR)/test-oci-status.o $(BUILD_DIR)/oci/status.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI policy.json schema and loader unit test (native macOS, no HVF). +## Pure C; links against the policy translation unit plus cJSON for the JSON +## parser. Drives oci_policy_load against per-test scratch HOME / XDG / override +## trees under /tmp. +$(BUILD_DIR)/test-oci-policy: $(BUILD_DIR)/test-oci-policy.o $(BUILD_DIR)/oci/policy.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI tar reader unit test (native macOS, no HVF). Pure C; the +## test constructs ustar / GNU long-name streams in memory and drives them +## through the reader via a callback that exercises short-read chunking. +$(BUILD_DIR)/test-oci-tar: $(BUILD_DIR)/test-oci-tar.o $(BUILD_DIR)/oci/tar.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI runspec unit test (native macOS, no HVF). Merges +## image-config runtime block + CLI overrides; the rootfs-driven +## symbolic-User cases write /etc/passwd and /etc/group fixtures under +## /tmp, so the link island pulls in oci/user-lookup.o. +$(BUILD_DIR)/test-oci-runspec: $(BUILD_DIR)/test-oci-runspec.o $(BUILD_DIR)/oci/runspec.o $(BUILD_DIR)/oci/user-lookup.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI User-field resolver unit test (native macOS, no HVF). +## Pure C; the test builds scratch /tmp rootfses with synthetic +## /etc/passwd / /etc/group and drives oci_user_lookup across the seven +## OCI image-spec User shapes plus the policy edges. 
+$(BUILD_DIR)/test-oci-user: $(BUILD_DIR)/test-oci-user.o $(BUILD_DIR)/oci/user-lookup.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI path-resolve unit test (native macOS, no HVF). Touches +## the host filesystem to build a small fake sysroot tree and drives +## oci_path_resolve through realpath / stat / symlink-follow scenarios. +## Pure C; no libcurl, no zstd, no HVF. +$(BUILD_DIR)/test-oci-path-resolve: $(BUILD_DIR)/test-oci-path-resolve.o $(BUILD_DIR)/oci/path-resolve.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI run orchestrator unit test (native macOS, no HVF). Links +## the same OCI graph the unpack test pulls in, plus oci/run.o, +## oci/runspec.o, and oci/path-resolve.o. Does NOT link core/launch.o: +## the test ships an in-file elfuse_launch stub that aborts when called, +## and every case installs a launch hook via oci_run_set_launch_for_testing +## before invoking oci_run, so the real VM bring-up never runs from a test. +$(BUILD_DIR)/test-oci-run: $(BUILD_DIR)/test-oci-run.o $(BUILD_DIR)/oci/run.o $(BUILD_DIR)/oci/runspec.o $(BUILD_DIR)/oci/user-lookup.o $(BUILD_DIR)/oci/path-resolve.o $(BUILD_DIR)/oci/runtime-files.o $(BUILD_DIR)/oci/unpack.o $(BUILD_DIR)/oci/volume.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/clone-rootfs.o $(BUILD_DIR)/oci/layer-apply.o $(BUILD_DIR)/oci/layer-meta.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/decompress.o $(BUILD_DIR)/oci/tar.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/ref.o $(BUILD_DIR)/core/sysroot.o $(BUILD_DIR)/debug/log.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ -lz $(ZSTD_LIBS) $(CJSON_LIBS) + +## Build the OCI runtime-files injection unit test (native macOS, no HVF). 
+## Pure C; the test drives oci_runtime_files_inject against scratch +## /tmp/elfuse-rf-* run directories and verifies the synthesised +## /etc/{resolv.conf,hosts,hostname} content. +$(BUILD_DIR)/test-oci-runtime-files: $(BUILD_DIR)/test-oci-runtime-files.o $(BUILD_DIR)/oci/runtime-files.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI fixture builder (Phase 3 compat tests). Standalone tool +## that synthesises a complete OCI store from uncompressed-tar layers +## plus image-config flags. Used by tests/test-oci-compat.sh and +## available standalone for one-off "shape an image from local files" +## experiments. +$(BUILD_DIR)/oci-fixture-builder: $(BUILD_DIR)/lib/oci-fixture-builder.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/ref.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## decompress.c is the only translation unit in elfuse that includes +## <zstd.h>. Attach the system zstd include path as a target-specific +## CFLAG so the rest of the codebase never sees zstd headers. +$(BUILD_DIR)/oci/decompress.o: CFLAGS += $(ZSTD_CFLAGS) + +## Build the OCI sidecar metadata unit test (native macOS, no HVF). Pure +## C; links against cJSON for the JSON round-trip plus the layer-meta +## translation unit. +$(BUILD_DIR)/test-oci-meta: $(BUILD_DIR)/test-oci-meta.o $(BUILD_DIR)/oci/layer-meta.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI origin sidecar unit test (native macOS, no HVF). Drives +## oci_origin_write against a tmpdir and verifies the resulting +## .elfuse-origin.json by parsing it back through cJSON.
+$(BUILD_DIR)/test-oci-origin: $(BUILD_DIR)/test-oci-origin.o $(BUILD_DIR)/oci/origin-meta.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI layer applier unit test (native macOS, no HVF). Builds +## tar payloads in memory, drives them through oci_layer_apply into a +## tmp tree, and verifies filesystem state via lstat/readlink. +$(BUILD_DIR)/test-oci-layer-apply: $(BUILD_DIR)/test-oci-layer-apply.o $(BUILD_DIR)/oci/layer-apply.o $(BUILD_DIR)/oci/layer-meta.o $(BUILD_DIR)/oci/tar.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ $(CJSON_LIBS) + +## Build the OCI volume bootstrap unit test (native macOS, no HVF). +## Default-volume test is gated behind OCI_VOLUME_TEST=1 because it +## costs ~150 ms of hdiutil orchestration on first run. Links +## src/core/sysroot.o for the hdiutil wrappers PR #33 introduced. +$(BUILD_DIR)/test-oci-volume: $(BUILD_DIR)/test-oci-volume.o $(BUILD_DIR)/oci/volume.o $(BUILD_DIR)/core/sysroot.o $(BUILD_DIR)/debug/log.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI clone-rootfs unit test (native macOS, no HVF). The +## test skips itself if clonefile returns ENOTSUP (non-APFS scratch). +$(BUILD_DIR)/test-oci-clone: $(BUILD_DIR)/test-oci-clone.o $(BUILD_DIR)/oci/clone-rootfs.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + +## Build the OCI unpack orchestrator integration smoke (native macOS, +## no HVF). Pulls in the full Phase 2 OCI stack so the dependency +## edges between modules are exercised at link time. 
+$(BUILD_DIR)/test-oci-unpack: $(BUILD_DIR)/test-oci-unpack.o $(BUILD_DIR)/oci/unpack.o $(BUILD_DIR)/oci/volume.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/clone-rootfs.o $(BUILD_DIR)/oci/layer-apply.o $(BUILD_DIR)/oci/layer-meta.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/decompress.o $(BUILD_DIR)/oci/tar.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/ref.o $(BUILD_DIR)/core/sysroot.o $(BUILD_DIR)/debug/log.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ -lz $(ZSTD_LIBS) $(CJSON_LIBS) + +## Build the OCI decompression dispatch unit test (native macOS, no HVF). +## Links system libzstd + zlib so gzip and zstd payloads both round-trip +## through oci_stream_t. The gzip fixture is generated at test time via +## zlib; the zstd fixture is an embedded byte array so the test needs only +## the zstd decode path. +$(BUILD_DIR)/test-oci-decompress.o: CFLAGS += $(ZSTD_CFLAGS) +$(BUILD_DIR)/test-oci-decompress: $(BUILD_DIR)/test-oci-decompress.o $(BUILD_DIR)/oci/decompress.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ -lz $(ZSTD_LIBS) + # Guest test binaries (cross-compiled, aarch64-linux) # Only used when GUEST_TEST_BINARIES is not set. diff --git a/README.md b/README.md index 631c551..8232a51 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,11 @@ guest debugging through a built-in GDB RSP stub. - macOS 13 or newer - Xcode Command Line Tools, `clang`, `codesign`, and GNU `make` - GNU `objcopy` from Homebrew `binutils`, or `llvm-objcopy` +- `zstd` and `cJSON` libraries with headers for OCI image support, resolved + via `pkg-config`: `brew install zstd cjson` (macOS) or `apt-get install + libzstd-dev libcjson-dev` (Linux). The `oci` subcommand decodes + zstd-compressed layers and parses JSON manifests; the rest of the build + links the system `libcurl` and `zlib` that ship with macOS. 
- Hypervisor entitlement: `com.apple.security.hypervisor` For guest test binaries, the project also expects an AArch64 Linux cross diff --git a/docs/usage.md b/docs/usage.md index 4cf6946..80e94b2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -99,6 +99,179 @@ and memory access, and per-thread inspection. Implementation details, including the snapshot protocol used to keep Hypervisor.framework register access on the owning thread, are documented in [internals.md](internals.md). +## Running OCI Images (`elfuse oci run`) + +Phase 3 adds a direct-execution path for pulled OCI images: + +```sh +elfuse oci run [OPTIONS] IMAGE [ARG...] +``` + +The subcommand reads the image's runtime block (Entrypoint, Cmd, Env, +WorkingDir, User) and folds in any CLI overrides, then unpacks the image +into the local APFS sysroot volume, clones a per-run rootfs via APFS +`clonefile(2)`, resolves argv[0] against PATH inside the rootfs, and +hands off to the same VM bring-up the legacy positional-ELF `elfuse` +entry uses. + +The image must already be pulled. `oci run` does not auto-pull on miss. 
+The usual workflow is: + +```sh +elfuse oci pull alpine:3 +elfuse oci run alpine:3 /bin/sh -c 'echo hello from inside' +``` + +### Options + +| Option | Meaning | +|--------|---------| +| `--store DIR` | Override the local store root | +| `--volume DIR` | Override the APFS sysroot volume mount point | +| `--entrypoint PROG` | Replace the image Entrypoint with `PROG` | +| `-e KEY=VAL`, `--env KEY=VAL` | Set or replace one env var (repeatable) | +| `-e KEY`, `--env KEY` | Import `KEY` from the host environ (repeatable) | +| `-w DIR`, `--workdir DIR` | Override image WorkingDir | +| `-u USER[:GROUP]`, `--user USER[:GROUP]` | Override image User; numeric `UID[:GID]` or symbolic `name[:group]` resolved from the rootfs `/etc/passwd` and `/etc/group` (see [User and WorkingDir](#user-and-workingdir)) | +| `--keep` | Keep the per-run cloned rootfs after exit | +| `--name NAME` | Reserved: deterministic clone-dir suffix (ignored today) | + +### Argv override matrix + +| Image Entrypoint | Image Cmd | CLI ARGV | `--entrypoint` | Result argv | +|--|--|--|--|--| +| set | set | none | none | Entrypoint ++ Cmd | +| set | set | provided | none | Entrypoint ++ CLI ARGV (Cmd dropped) | +| set | none | provided | none | Entrypoint ++ CLI ARGV | +| none | set | none | none | Cmd | +| none | set | provided | none | CLI ARGV (Cmd dropped) | +| set | set | optional | provided | [`--entrypoint`] ++ CLI ARGV | +| none | none | provided | none | CLI ARGV | +| none | none | none | none | `EINVAL` "image has no entrypoint or cmd; pass one on the CLI" | + +### Env merge policy + +The merged guest env is built in this order: + +1. Image `Env` (verbatim, in spec order) +2. Each CLI `-e KEY=VAL` set-or-replaces by key +3. Each CLI `-e KEY` (no `=`) imports the host's value when present, otherwise drops silently +4. `TERM` auto-imported from the host iff the merged env has no `TERM` +5. 
`PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin` injected iff the merged env has no `PATH` +6. `container=elfuse` injected unconditionally so systemd-style sandbox detection works + +CLI `-e DYLD_*=...` overrides are hard-rejected with `EINVAL`: `DYLD_*` is a +macOS-only loader contract with no meaning inside an aarch64-linux guest. +Image-provided `DYLD_*` entries pass through (the guest ignores them). + +### User and WorkingDir + +`User` accepts seven shapes: the empty string (no override), a numeric +`UID`, `UID:GID`, a symbolic `name`, `name:group`, `uid:group`, or +`name:gid`. Symbolic forms read `/etc/passwd` and `/etc/group` from +the cloned rootfs. A token made entirely of ASCII digits is always +parsed numerically, even when a same-named account ships in the image +(this matches runc semantics, so an image that happens to carry a +`1234` account does not capture `--user 1234`). When the symbolic +form names an account the unpacked layers do not actually carry, +lookup fails closed; `elfuse` never silently falls back to root. +`--user UID` alone defaults GID to the same value. + +`WorkingDir` must be absolute and free of `..` segments. If neither the +image nor the CLI sets it, the guest starts in `/`. The directory is +materialized under the cloned rootfs (`mkdir -p`, mode 0755, best- +effort chown to the resolved uid:gid when `--user` or image User +selects credentials). + +### Scope guardrails + +- Auto-pull on `run` miss -> never; `elfuse oci pull` must run first +- Network policy, `docker run -p`-style port mapping -> later phases +- Live `docker exec`-style attach -> never + +### Runtime host-truth surface + +`elfuse oci run` runs the guest against a freshly cloned per-run +rootfs and a small set of synthesized host-truth files. The rootfs +is produced by APFS `clonefile(2)` against the unpacked image +layers, so the first guest write to any path triggers copy-on-write +in APFS without touching the original image. 
The clone is removed at +guest exit unless `--keep` is set; nothing is ever pushed back to +the on-disk image, and concurrent `oci run` invocations against the +same image are isolated. + +Three `/etc` files are overwritten in the clone before the guest +starts. Any pre-existing symlink (the common case is +`/etc/resolv.conf -> /run/systemd/resolve/stub-resolv.conf`) is +unlinked first so it does not dangle inside the guest: + +| File | Source | +|--|--| +| `/etc/resolv.conf` | `nameserver` lines harvested from `scutil --dns`; falls back to `8.8.8.8` and `1.1.1.1` on any scutil failure | +| `/etc/hosts` | fixed 5-line block: `localhost`, the ip6-loopback aliases, ip6 link-local multicast, and `127.0.0.1 host.elfuse.internal` | +| `/etc/hostname` | literal string `elfuse` | + +The following pseudo-filesystem paths are synthesized by the host-side +openat interceptor and do not need to exist inside the rootfs: + +| Path | Behavior | +|--|--| +| `/dev/null`, `/dev/zero`, `/dev/random`, `/dev/urandom`, `/dev/tty` | redirected to the host device of the same name | +| `/dev/full` | reads zero-fill, writes of any non-zero length return `ENOSPC` | +| `/dev/console` | mirrored from the controlling tty when present (macOS reserves the real `/dev/console` for the kernel) | +| other `/dev/*` | `ENOENT` | +| `/proc/cpuinfo`, `/proc/meminfo`, `/proc/version` | derived from host sysctl | +| `/proc/self/{maps,exe,status,stat,comm,statm,cgroup}` | synthesized; `cgroup` reports the canonical `0::/` (elfuse runs outside any cgroup hierarchy) | +| `/proc/sys/kernel/{ostype,osrelease,hostname}` | tracks the cached `uname` fields (`Linux`, `6.17.0-20-generic`, `elfuse`) | + +### Libc-adjacent compatibility + +`elfuse` does not patch libc-adjacent payload (NSS modules, time-zone +data, locale data, character-set converters, dynamic-linker cache) +inside the guest. 
Each item below names the contract `elfuse` honors +and the failure mode an image hits when it does not ship the +matching files. + +- **`/etc/nsswitch.conf`** is read by the guest's libc, not by + `elfuse`. Only the `files` and `dns` backends actually function: + `files` resolves through `/etc/{passwd,group,hosts}` in the cloned + rootfs, and `dns` resolves through host `getaddrinfo` via the + synthesized `/etc/resolv.conf`. Backends such as `systemd`, `sss`, + or `ldap` need their NSS shared object plus a matching daemon, + neither of which `elfuse` provides. +- **NSS shared objects** (`libnss_systemd.so`, `libnss_sss.so`, + `libnss_ldap.so`, ...) are `dlopen`'d by guest libc against its own + loader. `elfuse` never injects NSS modules: they are aarch64-linux + ELF objects built against guest libc, so the macOS host has no way to + load them, and the guest can only `dlopen` the modules its image + already carries. +- **tzdata** (`/usr/share/zoneinfo`, `/etc/localtime`, `/etc/timezone`) + ships with the image. `elfuse` does not transcode macOS + `/var/db/timezone/zoneinfo` into the tzdata format; if the image is + missing the needed zone, glibc / musl fall back to UTC. The `TZ` + environment variable is honored as-is and is not rewritten by the + Env merge policy. +- **`/usr/lib/locale/locale-archive`** is not regenerated. glibc + images without a built archive (or the matching `<locale>.UTF-8/` + directory) fall back to the `C` locale; locale-aware sort / printf + / strcoll output then follows ASCII order. musl images do not use the archive + and are unaffected. +- **`/usr/lib/<triplet>/gconv/`** modules and the `gconv-modules` + index ship with the image. Missing modules surface as `EILSEQ` from + `iconv` / glibc's character-set conversion; this most often shows + up when an image ships a stripped glibc layer. +- **`ld.so.cache`** is not rebuilt.
The guest dynamic linker reads + whatever cache the image carries; missing entries fall through to + the linker's library-path search, which is the normal slow path. + +Common workloads and the symptom-to-workaround mapping: + +| Symptom | Trigger | Workaround | +|--|--|--| +| `getaddrinfo` returns `EAI_AGAIN` or an empty result | `/etc/nsswitch.conf` lists a backend (`systemd`, `sss`, ...) that needs a daemon | use a distro whose `nsswitch.conf` is `files dns` (alpine ships this by default; debian needs the file edited) | +| `date`, `strftime` show UTC instead of the expected zone | the image does not contain `/usr/share/zoneinfo/` | install tzdata in the image (`apk add tzdata` / `apt install tzdata`), or pass `-e TZ=UTC` to acknowledge UTC | +| `sort`, `printf`, `strcoll` collate in ASCII order | the image is missing `/usr/lib/locale/locale-archive` or the matching `<locale>.UTF-8/` directory | accept the C-locale fallback, run `locale-gen` during the image build, or use a musl-based image (alpine), which does not depend on the archive | + ## Guest Compatibility Model `elfuse` is designed for Linux user-space workloads, not for booting a Linux diff --git a/mk/analysis.mk b/mk/analysis.mk index 246c948..54783aa 100644 --- a/mk/analysis.mk +++ b/mk/analysis.mk @@ -14,10 +14,14 @@ SHELL_SCRIPTS := $(shell git ls-files --cached --others --exclude-standard \ PYTHON_FORMAT_FILES := $(shell git ls-files --cached --others --exclude-standard \ -- '*.py') -## Run clang-tidy on all source files +## Run clang-tidy on all source files. ZSTD_CFLAGS comes from the parent +## Makefile (pkg-config libzstd) so src/oci/decompress.c, which is the only +## translation unit that #includes <zstd.h>, can resolve the header during +## analysis.
lint: $(BUILD_DIR)/shim_blob.h $(BUILD_DIR)/version.h @echo " TIDY src/" - $(Q)$(CLANG_TIDY) $(SRCS) -- $(CFLAGS) -Isrc -I$(BUILD_DIR) + $(Q)$(CLANG_TIDY) $(SRCS) -- $(CFLAGS) -Isrc -I$(BUILD_DIR) \ + $(ZSTD_CFLAGS) ## Run clang static analyzer (scan-build) analyze: diff --git a/mk/config.mk b/mk/config.mk index 7270e28..2f14b55 100644 --- a/mk/config.mk +++ b/mk/config.mk @@ -16,7 +16,23 @@ endif # Exclude native macOS test files from cross-compilation +# Keep this list in sync with the native test-oci-* link rules in the +# top-level Makefile: every native OCI unit test source must be listed here. NATIVE_TESTS := tests/test-multi-vcpu.c tests/test-rwx.c \ - tests/test-tlbi-encoder-host.c + tests/test-tlbi-encoder-host.c \ + tests/test-oci-ref.c \ + tests/test-oci-digest.c tests/test-oci-blob-store.c \ + tests/test-oci-manifest.c tests/test-oci-fetch.c \ + tests/test-oci-store.c tests/test-oci-pull.c \ + tests/test-oci-inspect.c tests/test-oci-dedup-metrics.c \ + tests/test-oci-rebuild-cache.c tests/test-oci-status.c \ + tests/test-oci-policy.c tests/test-oci-tar.c \ + tests/test-oci-decompress.c tests/test-oci-meta.c \ + tests/test-oci-origin.c \ + tests/test-oci-layer-apply.c tests/test-oci-volume.c \ + tests/test-oci-clone.c tests/test-oci-unpack.c \ + tests/test-oci-runspec.c tests/test-oci-user.c \ + tests/test-oci-path-resolve.c tests/test-oci-runtime-files.c \ + tests/test-oci-run.c SPECIAL_TEST_SRCS := tests/test-lowbase-mem.c SPECIAL_TEST_BINS := $(BUILD_DIR)/test-lowbase-mem-200000 $(BUILD_DIR)/test-lowbase-mem-300000 diff --git a/mk/tests.mk b/mk/tests.mk index fd412ff..f8a8bd2 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -8,7 +8,19 @@ test-rosetta-alpine test-rosetta-audit test-rosetta-jit \ test-rosetta-glibc test-rosetta-all bench-rosetta \ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ - test-full test-multi-vcpu test-rwx test-sysroot-rename \ + test-full test-multi-vcpu test-rwx \ + test-oci-ref test-oci-digest test-oci-blob-store test-oci-manifest \ + test-oci-fetch test-oci-fetch-online test-oci-store test-oci-pull \ + test-oci-inspect test-oci-dedup-metrics test-oci-rebuild-cache \ + test-oci-status \ + test-oci-policy \ + test-oci-tar test-oci-decompress test-oci-meta \ + test-oci-origin \ + test-oci-layer-apply test-oci-volume test-oci-clone \ + test-oci-unpack
test-oci-runspec test-oci-user test-oci-path-resolve \ + test-oci-runtime-files \ + test-oci-run test-oci-compat oci-fixture-builder \ + test-sysroot-rename \ test-case-collision test-case-collision-fallback test-sysroot-create-paths \ test-proctitle-host test-proctitle-low-stack \ test-sysroot-procfs-exec test-timeout-disable test-fuse-alpine \ @@ -56,6 +68,58 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage \ @$(MAKE) --no-print-directory test-rosetta-cli @printf "\n$(BLUE)━━━ hot-syscall guardrail ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-bench-guardrail + @printf "\n$(BLUE)━━━ OCI reference parser unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-ref + @printf "\n$(BLUE)━━━ OCI digest unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-digest + @printf "\n$(BLUE)━━━ OCI blob store unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-blob-store + @printf "\n$(BLUE)━━━ OCI manifest parser unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-manifest + @printf "\n$(BLUE)━━━ OCI fetch unit tests (offline mock HTTP) ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-fetch + @printf "\n$(BLUE)━━━ OCI store unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-store + @printf "\n$(BLUE)━━━ OCI pull pipeline unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-pull + @printf "\n$(BLUE)━━━ OCI inspect renderer unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-inspect + @printf "\n$(BLUE)━━━ OCI cross-image dedup metrics unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-dedup-metrics + @printf "\n$(BLUE)━━━ OCI rebuild-cache unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-rebuild-cache + @printf "\n$(BLUE)━━━ OCI store-wide status unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-status + @printf "\n$(BLUE)━━━ OCI policy.json loader unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory 
test-oci-policy + @printf "\n$(BLUE)━━━ OCI tar reader unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-tar + @printf "\n$(BLUE)━━━ OCI decompression dispatch unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-decompress + @printf "\n$(BLUE)━━━ OCI sidecar metadata unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-meta + @printf "\n$(BLUE)━━━ OCI origin sidecar unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-origin + @printf "\n$(BLUE)━━━ OCI layer applier unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-layer-apply + @printf "\n$(BLUE)━━━ OCI volume bootstrap unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-volume + @printf "\n$(BLUE)━━━ OCI clone-rootfs unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-clone + @printf "\n$(BLUE)━━━ OCI unpack orchestrator smoke ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-unpack + @printf "\n$(BLUE)━━━ OCI runspec resolver unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-runspec + @printf "\n$(BLUE)━━━ OCI User-field resolver unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-user + @printf "\n$(BLUE)━━━ OCI path-resolve unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-path-resolve + @printf "\n$(BLUE)━━━ OCI runtime-files injection unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-runtime-files + @printf "\n$(BLUE)━━━ OCI run orchestrator unit tests ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-run + @printf "\n$(BLUE)━━━ OCI compat shell smoke ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-oci-compat ## Hot-syscall performance guardrail: ensure getpid, libc clock_gettime, ## and 1-byte /dev/urandom reads stay under their TODO ns/op ceilings. 
@@ -77,6 +141,163 @@ test-bench-guardrail: $(BENCH_GUARDRAIL_DEPS)
 	LINUX_TOOLCHAIN="$(LINUX_TOOLCHAIN)" \
 	bash tests/test-bench-guardrail.sh
 
+## Run the OCI image reference parser unit tests (native, no HVF)
+test-oci-ref: $(BUILD_DIR)/test-oci-ref
+	@$<
+
+## Run the OCI digest unit tests (native, no HVF)
+test-oci-digest: $(BUILD_DIR)/test-oci-digest
+	@$<
+
+## Run the OCI blob store unit tests (native, no HVF)
+test-oci-blob-store: $(BUILD_DIR)/test-oci-blob-store
+	@$<
+
+## Run the OCI manifest / index / config parser unit tests (native, no HVF)
+test-oci-manifest: $(BUILD_DIR)/test-oci-manifest
+	@$<
+
+## Run the OCI fetch unit tests against an in-process mock HTTP server
+## (native, no HVF, no network).
+test-oci-fetch: $(BUILD_DIR)/test-oci-fetch
+	@$<
+
+## Pull alpine:3.20 from Docker Hub anonymously, verify manifest parse and
+## blob digests against a real registry. Opt-in; requires network. Not run by
+## `make check`.
+test-oci-fetch-online: $(BUILD_DIR)/test-oci-fetch
+	@OCI_FETCH_ONLINE=1 $<
+
+## Run the OCI local store unit tests (native, no HVF)
+test-oci-store: $(BUILD_DIR)/test-oci-store
+	@$<
+
+## Run the OCI pull pipeline unit tests (native, no HVF, no network)
+test-oci-pull: $(BUILD_DIR)/test-oci-pull
+	@$<
+
+## Run the OCI inspect renderer unit tests (native, no HVF, no network)
+test-oci-inspect: $(BUILD_DIR)/test-oci-inspect
+	@$<
+
+## Run the OCI cross-image dedup metrics unit tests (native, no HVF, no network).
+## Phase 1 Plan 3 C3.4: validates oci_dedup_metrics_compute against pin-only
+## and pin + unpacked-tree scratch stores.
+test-oci-dedup-metrics: $(BUILD_DIR)/test-oci-dedup-metrics
+	@$<
+
+## Run the OCI rebuild-cache unit tests (native, no HVF, no network).
+## Phase 1 Plan 3 C3.5: validates oci_rebuild_cache against scratch
+## stores hand-populated via oci_origin_write into a fixture
+## /images/sha256-/ tree.
+test-oci-rebuild-cache: $(BUILD_DIR)/test-oci-rebuild-cache
+	@$<
+
+## Run the OCI store-wide status unit tests (native, no HVF, no network).
+## Phase 1 Plan 4 C4.1: validates oci_status_compute against scratch stores
+## hand-populated via stage_image + oci_origin_write fixture helpers.
+test-oci-status: $(BUILD_DIR)/test-oci-status
+	@$<
+
+## Run the OCI policy.json schema and loader unit tests (native, no HVF,
+## no network). Phase 1 Plan 6 C6.1: validates oci_policy_load against
+## scratch HOME / XDG / override trees, the load-order chain, and the
+## per-host effective view returned by oci_policy_lookup.
+test-oci-policy: $(BUILD_DIR)/test-oci-policy
+	@$<
+
+## Run the OCI tar reader unit tests (native, no HVF, no network)
+test-oci-tar: $(BUILD_DIR)/test-oci-tar
+	@$<
+
+## Run the OCI decompression dispatch unit tests (native, no HVF, no network)
+test-oci-decompress: $(BUILD_DIR)/test-oci-decompress
+	@$<
+
+## Run the OCI sidecar metadata unit tests (native, no HVF, no network)
+test-oci-meta: $(BUILD_DIR)/test-oci-meta
+	@$<
+
+## Run the OCI origin sidecar unit tests (native, no HVF, no network).
+## Covers oci_origin_write + cJSON parse-back round-trips. Phase 3 sees
+## the file in unpacked image directories; Plan 1's root-set walker
+## consumes it to attribute layer blobs back to live sysroots.
+test-oci-origin: $(BUILD_DIR)/test-oci-origin
+	@$<
+
+## Run the OCI layer applier unit tests (native, no HVF, no network)
+test-oci-layer-apply: $(BUILD_DIR)/test-oci-layer-apply
+	@$<
+
+## Run the OCI volume bootstrap unit tests (native, no HVF). The
+## default-sparsebundle case is gated behind OCI_VOLUME_TEST=1 because
+## hdiutil orchestration is slow.
+test-oci-volume: $(BUILD_DIR)/test-oci-volume
+	@$<
+
+## Run the OCI clone-rootfs unit tests (native, no HVF). Skips itself
+## if the test scratch directory does not support clonefile.
+test-oci-clone: $(BUILD_DIR)/test-oci-clone
+	@$<
+
+## Run the OCI unpack orchestrator smoke (native, no HVF). The full
+## end-to-end fixture is gated behind OCI_VOLUME_TEST=1.
+test-oci-unpack: $(BUILD_DIR)/test-oci-unpack
+	@$<
+
+## Run the OCI runspec resolver unit tests (native, no HVF, no network).
+## Feeds hand-built oci_image_runtime_t literals plus synthetic CLI flags
+## through oci_runspec_build and asserts argv / envp / uid / cwd outputs
+## against the Phase 3 override matrix and Env policy. Phase 4 symbolic
+## User cases write scratch /tmp rootfses for /etc/passwd lookup.
+test-oci-runspec: $(BUILD_DIR)/test-oci-runspec
+	@$<
+
+## Run the OCI User-field resolver unit tests (native, no HVF, no network).
+## Phase 4 F4.7: validates oci_user_lookup against scratch rootfses
+## carrying synthetic /etc/passwd / /etc/group; covers the seven OCI
+## image-spec User shapes plus the policy edges (digit-name collision,
+## missing passwd, name-not-found, invalid characters).
+test-oci-user: $(BUILD_DIR)/test-oci-user
+	@$<
+
+## Run the OCI guest PATH resolver unit tests (native, no HVF, no network).
+## Builds a fake sysroot tree under /tmp and drives oci_path_resolve
+## against it: PATH search, symlink-follow, escape-symlink skip,
+## EACCES on noexec, ENOENT diagnostics with searched-dirs list.
+test-oci-path-resolve: $(BUILD_DIR)/test-oci-path-resolve
+	@$<
+
+## Run the OCI runtime-files injection unit tests (native, no HVF, no network).
+## Phase 4 F4.2 / F4.3: validates oci_runtime_files_inject against scratch
+## run directories, covering fresh-/etc creation, symlink overwrite,
+## regular-file overwrite, and the synthesised /etc/{resolv.conf,
+## hosts, hostname} content.
+test-oci-runtime-files: $(BUILD_DIR)/test-oci-runtime-files
+	@$<
+
+## Run the OCI run orchestrator unit tests (native, no HVF, no network).
+## Covers oci_cli_run argument parsing plus oci_run early-failure
+## paths against a case-insensitive volume; the launch backend is
+## stubbed via oci_run_set_launch_for_testing so the test never spins
+## up a real HVF VM. End-to-end launch coverage lives in the Phase 3
+## commit 6 compat shell suite.
+test-oci-run: $(BUILD_DIR)/test-oci-run
+	@$<
+
+## Build the OCI fixture builder tool. Standalone executable used by
+## tests/test-oci-compat.sh and available for hand-rolled fixtures.
+oci-fixture-builder: $(BUILD_DIR)/oci-fixture-builder
+
+## Run the OCI run compatibility shell smoke (native, no HVF). Default
+## mode covers CLI surface + fixture-builder integration; OCI_COMPAT_TEST=1
+## gates the heavy end-to-end harness (hdiutil sparsebundle + actual
+## elfuse oci run launches); OCI_FETCH_ONLINE=1 gates the docker.io
+## pull + run sibling. Requires test-hello (assembly aarch64 ELF) +
+## elfuse + oci-fixture-builder pre-built.
+test-oci-compat: $(ELFUSE_BIN) $(BUILD_DIR)/oci-fixture-builder $(TEST_HELLO_DEP)
+	@bash tests/test-oci-compat.sh
+
 test-sysroot-rename: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-rename
 	@set -e; \
 	tmpdir=$$(mktemp -d); \
diff --git a/mk/toolchain.mk b/mk/toolchain.mk
index e0f6be4..ec00aa9 100644
--- a/mk/toolchain.mk
+++ b/mk/toolchain.mk
@@ -42,3 +42,25 @@ SHIM_ASFLAGS ?= -arch arm64
 
 # clang-format
 CLANG_FORMAT ?= clang-format
+
+# OpenSSL (Homebrew) for the OCI fetch test scaffolding.
The mock HTTP server +# uses libssl/libcrypto to terminate TLS with a self-signed certificate so the +# ca_file negative cases exercise a real handshake. macOS ships LibreSSL +# headers in a private framework and does not publish a usable include path +# under /usr; brew openssl@3 is the documented public location. +ifeq ($(origin OPENSSL_PREFIX),undefined) + ifneq ($(wildcard /opt/homebrew/opt/openssl@3/include/openssl/ssl.h),) + OPENSSL_PREFIX := /opt/homebrew/opt/openssl@3 + else ifneq ($(wildcard /usr/local/opt/openssl@3/include/openssl/ssl.h),) + OPENSSL_PREFIX := /usr/local/opt/openssl@3 + else + OPENSSL_PREFIX := + endif +endif +ifneq ($(OPENSSL_PREFIX),) + OPENSSL_CFLAGS := -I$(OPENSSL_PREFIX)/include + OPENSSL_LDFLAGS := -L$(OPENSSL_PREFIX)/lib -lssl -lcrypto +else + OPENSSL_CFLAGS := + OPENSSL_LDFLAGS := -lssl -lcrypto +endif diff --git a/src/core/launch.c b/src/core/launch.c new file mode 100644 index 0000000..32f6de5 --- /dev/null +++ b/src/core/launch.c @@ -0,0 +1,148 @@ +/* elfuse VM launch: bring-up + GDB + run loop + teardown + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Extracted from src/main.c so the Phase 3 oci run orchestrator and the + * legacy positional-ELF main can share one bring-up path. The function + * deliberately does NOT touch the original CLI argv block: proctitle + * rewriting must happen in the caller, before elfuse_launch, because the + * caller owns the only pointer to the original argv (the heap-copied + * guest_argv hands a different string region to the guest). + * + * The function does not save/restore host cwd. Callers that care about + * post-exit host cwd preservation snapshot it themselves; this matches + * the pre-refactor main() shape where the cwd save lived alongside the + * CLI parsing rather than the bring-up. Same goes for sysroot_mount + * cleanup -- the --create-sysroot detach belongs to whoever provisioned + * the mount, not to the launch. 
+ * + * shim_blob.h carries the embedded EL1 kernel shim. It is included here + * rather than in src/main.c so symbol shim_bin / shim_bin_len has a + * single definition site once the legacy main was reshaped to call + * elfuse_launch. Any other future caller that needs the shim bytes + * directly should pull them from here. + */ + +#include "launch.h" + +#include +#include +#include +#include + +#include "core/bootstrap.h" +#include "core/guest.h" +#include "core/sysroot.h" + +#include "syscall/proc.h" + +#include "debug/gdbstub.h" +#include "debug/log.h" + +/* Embedded shim binary (generated by xxd -i from shim.bin). */ +#include "shim_blob.h" + +int elfuse_launch(const launch_args_t *args) +{ + if (!args) { + log_error("elfuse_launch: NULL args"); + return 1; + } + + extern char **environ; + char **envp_use = args->envp ? (char **) (uintptr_t) args->envp : environ; + + guest_t g; + bool guest_initialized = false; + guest_bootstrap_t boot; + /* oci_run pre-resolves the entrypoint: args->elf_path is the host path + * under the cloned rootfs and guest_argv[0] is the guest-absolute path + * the guest should see (/proc/self/exe, argv[0]). The binary is a real + * file in the rootfs, never a FUSE-materialized temp, so the host-path + * temp flag is false. (guest_bootstrap_prepare's split of host vs guest + * path landed with the rosetta work; the caller resolves the host path.) + */ + const char *elf_guest_path = (args->guest_argc > 0 && args->guest_argv) + ? 
args->guest_argv[0] + : args->elf_path; + if (guest_bootstrap_prepare( + &g, args->elf_path, false, elf_guest_path, args->sysroot, + args->guest_argc, args->guest_argv, envp_use, shim_bin, + shim_bin_len, args->verbose, &guest_initialized, &boot) < 0) { + if (guest_initialized) + guest_destroy(&g); + return 1; + } + + if (args->sysroot) { + bool case_sensitive = true; + bool case_preserving = true; + if (sysroot_probe_case_sensitivity(args->sysroot, &case_sensitive, + &case_preserving) == 0) + proc_set_sysroot_casefold(case_preserving && !case_sensitive); + else + proc_set_sysroot_casefold(false); + } else { + proc_set_sysroot_casefold(false); + } + + if (args->has_creds) + proc_set_ids(args->uid, args->uid, args->uid, args->gid, args->gid, + args->gid); + + /* Phase 3 commit 5 will wire cwd_guest here once oci_run materializes + * the image WorkingDir under the cloned rootfs. Today's legacy main + * passes NULL and the guest inherits the host cwd, matching the + * pre-refactor behavior. + */ + (void) args->cwd_guest; + + /* Phase 3 placeholder: fork_child / vfork_notify dispatch stays in + * main() for now (early return before reaching elfuse_launch). The + * fields are kept on launch_args_t so callers route through one + * launch struct shape even when the IPC plumbing changes. + */ + (void) args->fork_child_fd; + (void) args->vfork_notify_fd; + + hv_vcpu_t vcpu; + hv_vcpu_exit_t *vexit; + if (guest_bootstrap_create_vcpu(&g, &boot, args->verbose, &vcpu, &vexit) < + 0) { + if (guest_initialized) + guest_destroy(&g); + return 1; + } + + /* GDB setup must happen before the first run so entry-stop and + * hardware breakpoints can affect the initial vCPU. + */ + if (args->gdb_port > 0) { + if (gdb_stub_init(args->gdb_port, &g) < 0) { + log_error("failed to initialize GDB stub"); + if (guest_initialized) + guest_destroy(&g); + return 1; + } + /* Mirror any preconfigured breakpoints/watchpoints into this + * vCPU. 
+ */ + gdb_stub_sync_debug_regs(vcpu); + if (args->gdb_stop_on_entry) + gdb_stub_wait_for_attach(); + } + + /* vcpu_run_loop owns guest execution until exit, fatal signal, or + * timeout. + */ + int exit_code = + vcpu_run_loop(vcpu, vexit, &g, args->verbose, args->timeout_sec); + + /* Tear down debugger state before freeing guest/vCPU resources. */ + gdb_stub_shutdown(); + if (guest_initialized) + guest_destroy(&g); + + return exit_code; +} diff --git a/src/core/launch.h b/src/core/launch.h new file mode 100644 index 0000000..3422794 --- /dev/null +++ b/src/core/launch.h @@ -0,0 +1,84 @@ +/* elfuse VM launch entry: post-CLI bring-up + run loop + teardown + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * elfuse_launch is the single entry point for "run a guest binary in a + * fresh HVF VM until it exits". It is shared between main() (legacy + * positional-ELF CLI) and the Phase 3 oci run orchestrator. The function + * owns the guest_t, the vCPU, the GDB stub, and the run loop; it does NOT + * own the elf_path / sysroot / guest_argv heap copies or the + * sysroot_mount the host CLI may have provisioned -- those stay with the + * caller so behaviors that need the original CLI argv (proctitle + * rewriting, --create-sysroot detach on exit, host cwd save+restore) + * remain coherent regardless of how the launch was kicked off. + * + * Lifetime / ownership contract: + * + * - The caller owns every pointer in launch_args_t. elfuse_launch reads + * them and does not free them; const-qualified pointers stay valid + * for the duration of the call. + * - envp may be NULL; the host process environ is used in that case. + * - guest_argv is the NULL-terminated string array the guest sees as + * its argv. It must already be heap-copied because the caller may + * have clobbered the original CLI argv with proctitle. + * - has_creds=false means "inherit the host uid/gid"; uid/gid are + * ignored. 
has_creds=true forces the elfuse guest identity model + * via proc_set_ids. + * - cwd_guest reserves a slot for Phase 3 commit 5 (oci run) to set + * the guest's initial working directory. main()'s legacy positional- + * ELF path passes NULL and the guest inherits the host cwd, matching + * pre-refactor behavior. + * - fork_child_fd / vfork_notify_fd are forwarded for future + * fork-child-routed launches; main() currently dispatches the + * fork-child path before reaching elfuse_launch and so passes -1. + */ + +#pragma once + +#include +#include + +typedef struct { + /* Host filesystem path to the guest ELF (absolute). */ + const char *elf_path; + /* Host filesystem path to the sysroot the guest sees as / (absolute), + * or NULL when the guest runs without a sysroot. + */ + const char *sysroot; + /* NULL-terminated guest argv shape. guest_argc is the count of + * non-NULL entries (matches the legacy main() call shape). + */ + int guest_argc; + const char **guest_argv; + /* NULL-terminated guest environ. NULL means "use host environ". */ + const char **envp; + /* Override host uid/gid when true. Phase 3 commit 5 sets this from + * the image User field; main()'s legacy path leaves it false. + */ + bool has_creds; + uint32_t uid; + uint32_t gid; + /* Guest-absolute initial working directory. NULL inherits the host + * cwd. Wired up by Phase 3 commit 5. + */ + const char *cwd_guest; + /* GDB Remote Serial Protocol port. 0 disables the stub. */ + int gdb_port; + bool gdb_stop_on_entry; + /* Per-iteration vCPU run timeout. 0 disables (no alarm()). */ + int timeout_sec; + /* Fork-child IPC handles. -1 means "not a fork child". main()'s + * --fork-child dispatch handles the >= 0 case before reaching + * elfuse_launch; oci run never sets these. + */ + int fork_child_fd; + int vfork_notify_fd; + bool verbose; +} launch_args_t; + +/* Bring up the guest VM, run it to exit / signal / timeout, tear down, + * return the exit code. 
Returns 1 on bring-up failure (with a log + * message) and the guest's exit status otherwise. + */ +int elfuse_launch(const launch_args_t *args); diff --git a/src/core/sysroot.c b/src/core/sysroot.c index 188c783..d8a4442 100644 --- a/src/core/sysroot.c +++ b/src/core/sysroot.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -222,10 +223,35 @@ static int spawn_capture_stdout(char *const argv[], return 0; } -static int spawn_simple(char *const argv[]) +static int spawn_simple_common(char *const argv[], bool silence_stdout) { + posix_spawn_file_actions_t actions; + posix_spawn_file_actions_t *actions_ptr = NULL; + if (silence_stdout) { + /* hdiutil prints messages like '"diskN" ejected.' to stdout, + * even on success. Callers that promise a clean stdout contract + * (oci unpack / oci clone) must not inherit that noise, so + * redirect the child's fd 1 to /dev/null. stderr is left alone + * so genuine error messages still surface for diagnostics. + */ + if (posix_spawn_file_actions_init(&actions) != 0) { + errno = ENOMEM; + return -1; + } + if (posix_spawn_file_actions_addopen(&actions, STDOUT_FILENO, + "/dev/null", O_WRONLY, 0) != 0) { + posix_spawn_file_actions_destroy(&actions); + errno = EIO; + return -1; + } + actions_ptr = &actions; + } + pid_t pid = -1; - int spawn_ret = posix_spawnp(&pid, argv[0], NULL, NULL, argv, environ); + int spawn_ret = + posix_spawnp(&pid, argv[0], actions_ptr, NULL, argv, environ); + if (actions_ptr) + posix_spawn_file_actions_destroy(actions_ptr); if (spawn_ret != 0) { errno = spawn_ret; return -1; @@ -244,6 +270,11 @@ static int spawn_simple(char *const argv[]) return 0; } +static int spawn_simple_silent(char *const argv[]) +{ + return spawn_simple_common(argv, true); +} + static int parse_attach_mountpoint(const char *plist, char *mount_path, size_t mount_path_sz) @@ -419,11 +450,11 @@ static int sysroot_detach_mountpoint_force(const char *mount_path, bool force) if (force) { char *const argv[] = {"hdiutil", 
"detach", "-force", (char *) mount_path, NULL}; - return spawn_simple(argv); + return spawn_simple_silent(argv); } char *const argv[] = {"hdiutil", "detach", (char *) mount_path, NULL}; - return spawn_simple(argv); + return spawn_simple_silent(argv); } static bool sysroot_mountpoint_is_active(const char *mount_path) @@ -505,7 +536,7 @@ int sysroot_create_mount(const char *mount_path, sysroot_mount_t *mount) "elfuse_sysroot", mount->image_path, NULL}; - if (spawn_simple(create_argv) < 0) { + if (spawn_simple_silent(create_argv) < 0) { log_error("sysroot: hdiutil create failed for %s: %s", mount->image_path, strerror(errno)); return -1; diff --git a/src/main.c b/src/main.c index ea1d17e..ec15366 100644 --- a/src/main.c +++ b/src/main.c @@ -33,6 +33,8 @@ #include "core/shim-globals.h" #include "core/sysroot.h" +#include "oci/cli.h" + #include "runtime/forkipc.h" #include "runtime/proctitle.h" @@ -198,6 +200,13 @@ int main(int argc, char **argv) argc = 5; } + /* `elfuse oci ...` is a self-contained CLI subcommand: image distribution + * never touches Hypervisor.framework, so dispatch before any guest setup + * to avoid host-DC-ZVA / entitlement checks the user never asked for. + */ + if (argc > 1 && !strcmp(argv[1], "oci")) + return oci_cli_main(argc - 1, argv + 1); + /* --help and --version do not require an ELF path. */ if (argc > 1) { if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-V")) { diff --git a/src/oci/blob-store.c b/src/oci/blob-store.c new file mode 100644 index 0000000..3c98f4a --- /dev/null +++ b/src/oci/blob-store.c @@ -0,0 +1,677 @@ +/* Content-addressable blob store for OCI image data + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The commit path uses link(2) rather than rename(2) so that a second writer + * racing on the same digest cannot silently overwrite a blob that another + * process already finalized. 
link returning EEXIST is treated as a dedup + * hit; both clients then unlink their staging file and report success. This + * matches the content-addressable invariant: identical bytes map to one + * inode, regardless of how many concurrent writers raced to produce them. + */ + +#include "blob-store.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "digest.h" + +/* Largest path the store will materialize. Comfortably above PATH_MAX so + * snprintf truncation never silently corrupts a path; callers that pass an + * out_size smaller than this can still recover via the returned length. + */ +#define STORE_PATH_MAX 4096 + +struct oci_blob_store { + char *root; +}; + +struct oci_blob_writer { + oci_blob_store_t *store; + oci_digest_algo_t algo; + char expected_hex[OCI_DIGEST_HEX_MAX + 1]; + char tmp_path[STORE_PATH_MAX]; + int fd; + oci_digester_t *digester; + bool failed; +}; + +static int mkdir_one(const char *path) +{ + if (mkdir(path, 0755) == 0) + return 0; + if (errno == EEXIST) { + struct stat st; + if (stat(path, &st) == 0 && S_ISDIR(st.st_mode)) + return 0; + errno = ENOTDIR; + return -1; + } + return -1; +} + +/* Create every directory along path. Walks component by component so that a + * missing intermediate directory does not abort the whole open. path must + * fit in STORE_PATH_MAX; the caller is responsible for upstream length + * checks (only internal call sites build these paths from store->root plus + * fixed suffixes, all of which stay well under the limit). 
+ */
+static int mkdir_p(const char *path)
+{
+    char work[STORE_PATH_MAX];
+    const size_t path_len = strlen(path);
+
+    if (path_len == 0 || path_len >= sizeof(work)) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    memcpy(work, path, path_len + 1);
+
+    /* Cut the copy at each separator in turn (a leading '/' is skipped) and
+     * create the prefix that ends there, then restore the separator.
+     */
+    for (char *sep = strchr(work + 1, '/'); sep; sep = strchr(sep + 1, '/')) {
+        *sep = '\0';
+        int rc = mkdir_one(work);
+        *sep = '/';
+        if (rc < 0)
+            return -1;
+    }
+    return mkdir_one(work);
+}
+
+/* Format "a/b" into out. Returns the formatted length, or -1 with errno set
+ * to ENAMETOOLONG when snprintf fails or the result does not fit in out_size.
+ */
+static int join2(char *out, size_t out_size, const char *a, const char *b)
+{
+    int written = snprintf(out, out_size, "%s/%s", a, b);
+    if (written >= 0 && (size_t) written < out_size)
+        return written;
+    errno = ENAMETOOLONG;
+    return -1;
+}
+
+/* Create root (with any missing parents) and the fixed skeleton beneath it:
+ * blobs/, tmp/, blobs/sha256/ and blobs/sha512/. Returns 0 on success, -1
+ * with errno left by the failing step otherwise.
+ */
+static int ensure_layout(const char *root)
+{
+    /* Parents are listed ahead of their children so a single mkdir per
+     * entry is enough.
+     */
+    static const char *const subdirs[] = {
+        "blobs",
+        "tmp",
+        "blobs/sha256",
+        "blobs/sha512",
+    };
+    char path[STORE_PATH_MAX];
+
+    if (mkdir_p(root) < 0)
+        return -1;
+    for (size_t i = 0; i < sizeof(subdirs) / sizeof(subdirs[0]); i++) {
+        if (join2(path, sizeof(path), root, subdirs[i]) < 0)
+            return -1;
+        if (mkdir_one(path) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+/* Open (creating the on-disk layout if needed) the blob store rooted at
+ * root. Returns NULL with errno = EINVAL for a NULL or empty root, and NULL
+ * when the layout cannot be created or an allocation fails.
+ */
+oci_blob_store_t *oci_blob_store_open(const char *root)
+{
+    if (root == NULL || root[0] == '\0') {
+        errno = EINVAL;
+        return NULL;
+    }
+    if (ensure_layout(root) < 0)
+        return NULL;
+
+    oci_blob_store_t *store = calloc(1, sizeof(*store));
+    if (store == NULL)
+        return NULL;
+    store->root = strdup(root);
+    if (store->root != NULL)
+        return store;
+    free(store);
+    return NULL;
+}
+
+/* Release a store handle. NULL is accepted and ignored. */
+void oci_blob_store_close(oci_blob_store_t *s)
+{
+    if (s == NULL)
+        return;
+    free(s->root);
+    free(s);
+}
+
+/* Format the final path of the blob (algo, hex) into out. Returns the
+ * snprintf length -- which may be >= out_size when the path was truncated,
+ * so callers must compare it against their buffer size -- or -1 on a bad
+ * argument, unknown algorithm, malformed hex, or formatting failure. On -1
+ * out is set to the empty string whenever a usable buffer was supplied.
+ */
+int oci_blob_store_path(const oci_blob_store_t *s,
+                        oci_digest_algo_t algo,
+                        const char *hex,
+                        char *out,
+                        size_t out_size)
+{
+    const bool have_buf = out != NULL && out_size != 0;
+    if (!s || !have_buf) {
+        if (have_buf)
+            out[0] = '\0';
+        return -1;
+    }
+
+    int written = -1;
+    const char *algo_dir = oci_digest_algo_name(algo);
+    if (algo_dir && oci_digest_hex_valid(algo, hex))
+        written = snprintf(out, out_size, "%s/blobs/%s/%s", s->root, algo_dir,
+                           hex);
+    if (written < 0) {
+        out[0] = '\0';
+        return -1;
+    }
+    return written;
+}
+
+/* True when the blob (algo, hex) exists in the store as a regular file. */
+bool oci_blob_store_has(const oci_blob_store_t *s,
+                        oci_digest_algo_t algo,
+                        const char *hex)
+{
+    char path[STORE_PATH_MAX];
+    struct stat st;
+
+    int written = oci_blob_store_path(s, algo, hex, path, sizeof(path));
+    if (written < 0 || (size_t) written >= sizeof(path))
+        return false;
+    if (stat(path, &st) != 0)
+        return false;
+    return S_ISREG(st.st_mode);
+}
+
+/* Per-process monotonic counter folded into anonymous staging names.
+ * mkstemp's random XXXXXX suffix already provides global uniqueness; the
+ * counter only guarantees that two writers in the same process never share
+ * a template even if that randomness were to fail.
+ */
+static unsigned long writer_seq(void)
+{
+    static unsigned long counter = 0;
+    return __sync_add_and_fetch(&counter, 1);
+}
+
+/* Shared writer construction. tmp_template_suffix is the part after
+ * "/tmp/" -- the caller composes a pid/seq form (anonymous) or a
+ * digest-prefix form (named) and this helper opens mkstemp + chmod + the
+ * digester, identical between the two public entry points.
+ */
+static oci_blob_writer_t *writer_begin_with_template(
+    oci_blob_store_t *s,
+    oci_digest_algo_t algo,
+    const char *expected_hex,
+    const char *tmp_template_suffix)
+{
+    if (!s || !oci_digest_hex_valid(algo, expected_hex)) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    oci_blob_writer_t *w = calloc(1, sizeof(*w));
+    if (!w)
+        return NULL;
+    w->store = s;
+    w->algo = algo;
+    memcpy(w->expected_hex, expected_hex, oci_digest_hex_len(algo) + 1);
+    w->fd = -1;
+
+    int n = snprintf(w->tmp_path, sizeof(w->tmp_path), "%s/tmp/%s", s->root,
+                     tmp_template_suffix);
+    if (n < 0 || (size_t) n >= sizeof(w->tmp_path)) {
+        free(w);
+        errno = ENAMETOOLONG;
+        return NULL;
+    }
+
+    /* mkstemp rewrites the trailing XXXXXX in place, so tmp_path holds the
+     * real staging name from here on.
+     */
+    int fd = mkstemp(w->tmp_path);
+    if (fd < 0) {
+        int saved = errno;
+        free(w);
+        errno = saved;
+        return NULL;
+    }
+    (void) fcntl(fd, F_SETFD, FD_CLOEXEC);
+    if (fchmod(fd, 0644) < 0) {
+        int saved = errno;
+        (void) close(fd);
+        (void) unlink(w->tmp_path);
+        free(w);
+        errno = saved;
+        return NULL;
+    }
+    w->fd = fd;
+
+    w->digester = oci_digester_new(algo);
+    if (!w->digester) {
+        int saved = errno ? errno : ENOMEM;
+        (void) close(w->fd);
+        (void) unlink(w->tmp_path);
+        free(w);
+        errno = saved;
+        return NULL;
+    }
+    return w;
+}
+
+/* Begin an anonymous staging writer. The staging name is
+ * "blob-PID-SEQ-XXXXXX" under the store's tmp/ directory. Returns NULL with
+ * errno set on failure.
+ */
+oci_blob_writer_t *oci_blob_writer_begin(oci_blob_store_t *s,
+                                         oci_digest_algo_t algo,
+                                         const char *expected_hex)
+{
+    char tmpl[128];
+    int n = snprintf(tmpl, sizeof(tmpl), "blob-%ld-%lu-XXXXXX", (long) getpid(),
+                     writer_seq());
+    if (n < 0 || (size_t) n >= sizeof(tmpl)) {
+        errno = ENAMETOOLONG;
+        return NULL;
+    }
+    return writer_begin_with_template(s, algo, expected_hex, tmpl);
+}
+
+#define OCI_BLOB_NAMED_HEX_PREFIX 16
+
+/* Copy the leading OCI_BLOB_NAMED_HEX_PREFIX characters of expected_hex (or
+ * all of it when shorter) into out and NUL-terminate. This is the single
+ * definition of the digest prefix embedded in named staging files, shared by
+ * the code that creates them and the code that searches for them.
+ * expected_hex must be non-NULL and out must hold at least
+ * OCI_BLOB_NAMED_HEX_PREFIX + 1 bytes.
+ */
+static void named_prefix_for(const char *expected_hex, char *out)
+{
+    size_t hl = strlen(expected_hex);
+    size_t use =
+        hl < OCI_BLOB_NAMED_HEX_PREFIX ? hl : OCI_BLOB_NAMED_HEX_PREFIX;
+    memcpy(out, expected_hex, use);
+    out[use] = '\0';
+}
+
+/* Begin a named staging writer. The staging name is "blob-PREFIX-XXXXXX",
+ * where PREFIX comes from named_prefix_for, so a later lookup by digest
+ * prefix can find the partial. Returns NULL with errno = EINVAL when
+ * expected_hex is NULL; full digest validation happens in
+ * writer_begin_with_template before the template is used.
+ */
+oci_blob_writer_t *oci_blob_writer_begin_named(oci_blob_store_t *s,
+                                               oci_digest_algo_t algo,
+                                               const char *expected_hex)
+{
+    if (!expected_hex) {
+        errno = EINVAL;
+        return NULL;
+    }
+    char prefix[OCI_BLOB_NAMED_HEX_PREFIX + 1];
+    named_prefix_for(expected_hex, prefix);
+
+    char tmpl[64];
+    int n = snprintf(tmpl, sizeof(tmpl), "blob-%s-XXXXXX", prefix);
+    if (n < 0 || (size_t) n >= sizeof(tmpl)) {
+        errno = ENAMETOOLONG;
+        return NULL;
+    }
+    return writer_begin_with_template(s, algo, expected_hex, tmpl);
+}
+
+/* Build the per-store tmp/ path into out. Returns true on success, false on
+ * overflow. The caller is responsible for sizing out (STORE_PATH_MAX fits).
+ */
+static bool tmp_dir_path(const oci_blob_store_t *s, char *out, size_t cap)
+{
+    int n = snprintf(out, cap, "%s/tmp", s->root);
+    return n > 0 && (size_t) n < cap;
+}
+
+/* Open an existing partial as a writer. Re-hashes the bytes already on disk
+ * and positions the fd at end-of-file. Returns the writer on success or
+ * NULL on any I/O failure; the caller decides whether to fall back to a
+ * fresh writer. The partial file at path is NOT unlinked on failure --
+ * caller policy.
+ */
+static oci_blob_writer_t *open_partial_as_writer(oci_blob_store_t *s,
+                                                 oci_digest_algo_t algo,
+                                                 const char *expected_hex,
+                                                 const char *path,
+                                                 int64_t partial_size)
+{
+    oci_blob_writer_t *writer = calloc(1, sizeof(*writer));
+    if (!writer)
+        return NULL;
+    writer->store = s;
+    writer->algo = algo;
+    memcpy(writer->expected_hex, expected_hex, oci_digest_hex_len(algo) + 1);
+
+    size_t path_bytes = strlen(path) + 1;
+    if (path_bytes > sizeof(writer->tmp_path)) {
+        free(writer);
+        errno = ENAMETOOLONG;
+        return NULL;
+    }
+    memcpy(writer->tmp_path, path, path_bytes);
+
+    writer->fd = open(path, O_RDWR);
+    if (writer->fd < 0) {
+        free(writer);
+        return NULL;
+    }
+    (void) fcntl(writer->fd, F_SETFD, FD_CLOEXEC);
+
+    writer->digester = oci_digester_new(algo);
+    if (!writer->digester) {
+        int saved = errno ? errno : ENOMEM;
+        (void) close(writer->fd);
+        free(writer);
+        errno = saved;
+        return NULL;
+    }
+
+    /* Replay the bytes already on disk through the digester so its state
+     * matches the file contents before any new data is appended.
+     */
+    if (lseek(writer->fd, 0, SEEK_SET) < 0)
+        goto fail_io;
+    int64_t hashed = 0;
+    char chunk[64 * 1024];
+    while (hashed < partial_size) {
+        ssize_t got = read(writer->fd, chunk, sizeof(chunk));
+        if (got > 0) {
+            oci_digester_update(writer->digester, chunk, (size_t) got);
+            hashed += got;
+            continue;
+        }
+        if (got == 0)
+            break; /* early EOF: caught by the size check below */
+        if (errno != EINTR)
+            goto fail_io;
+    }
+    if (hashed != partial_size)
+        goto fail_io;
+    if (lseek(writer->fd, 0, SEEK_END) < 0)
+        goto fail_io;
+    return writer;
+
+fail_io: {
+    int saved = errno ? errno : EIO;
+    oci_digester_free(writer->digester);
+    (void) close(writer->fd);
+    free(writer);
+    errno = saved;
+    return NULL;
+}
+}
+
+/* Begin a named writer, resuming from the largest usable partial in tmp/
+ * whose name carries this digest's prefix. *out_resume_offset (when
+ * non-NULL) receives the number of bytes already staged, or 0 when a fresh
+ * writer is returned. Every other matching partial is unlinked. Any
+ * failure while locating or reopening a partial falls back to
+ * oci_blob_writer_begin_named.
+ */
+oci_blob_writer_t *oci_blob_writer_resume_named(oci_blob_store_t *s,
+                                                oci_digest_algo_t algo,
+                                                const char *expected_hex,
+                                                int64_t expected_size,
+                                                int64_t *out_resume_offset)
+{
+    if (out_resume_offset)
+        *out_resume_offset = 0;
+    if (!s || !oci_digest_hex_valid(algo, expected_hex)) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    char tmp_dir[STORE_PATH_MAX];
+    if (!tmp_dir_path(s, tmp_dir, sizeof(tmp_dir)))
+        return oci_blob_writer_begin_named(s, algo, expected_hex);
+
+    /* Staging names for this digest all start with "blob-PREFIX-". */
+    char prefix[OCI_BLOB_NAMED_HEX_PREFIX + 1];
+    named_prefix_for(expected_hex, prefix);
+    char glob[8 + OCI_BLOB_NAMED_HEX_PREFIX];
+    int gn = snprintf(glob, sizeof(glob), "blob-%s-", prefix);
+    if (gn <= 0 || (size_t) gn >= sizeof(glob))
+        return oci_blob_writer_begin_named(s, algo, expected_hex);
+    const size_t glen = (size_t) gn;
+
+    DIR *dir = opendir(tmp_dir);
+    if (!dir)
+        return oci_blob_writer_begin_named(s, algo, expected_hex);
+
+    char best_path[STORE_PATH_MAX] = {0};
+    int64_t best_size = -1;
+    for (struct dirent *ent = readdir(dir); ent; ent = readdir(dir)) {
+        if (strncmp(ent->d_name, glob, glen) != 0)
+            continue;
+
+        char cand[STORE_PATH_MAX];
+        int cn = snprintf(cand, sizeof(cand), "%s/%s", tmp_dir, ent->d_name);
+        if (cn <= 0 || (size_t) cn >= sizeof(cand))
+            continue;
+
+        struct stat st;
+        if (stat(cand, &st) < 0 || !S_ISREG(st.st_mode))
+            continue;
+
+        /* Only the largest partial survives. An empty partial, or one
+         * already at or past the declared size, is corrupt or stale -- no
+         * useful Range can be requested from it -- so it is dropped along
+         * with every candidate that does not beat the current best.
+         */
+        const int64_t sz = (int64_t) st.st_size;
+        const bool usable = sz > 0 && sz < expected_size;
+        if (!usable || sz <= best_size) {
+            (void) unlink(cand);
+            continue;
+        }
+        if (best_path[0])
+            (void) unlink(best_path);
+        memcpy(best_path, cand, (size_t) cn + 1);
+        best_size = sz;
+    }
+    closedir(dir);
+
+    if (best_size <= 0 || !best_path[0])
+        return oci_blob_writer_begin_named(s, algo, expected_hex);
+
+    oci_blob_writer_t *resumed =
+        open_partial_as_writer(s, algo, expected_hex, best_path, best_size);
+    if (!resumed) {
+        (void) unlink(best_path);
+        return oci_blob_writer_begin_named(s, algo, expected_hex);
+    }
+    if (out_resume_offset)
+        *out_resume_offset = best_size;
+    return resumed;
+}
+
+/* Unlink every regular "blob-*" file in the store's tmp/ directory whose
+ * mtime is at least ttl_secs old. Errors are ignored.
+ */
+void oci_blob_store_sweep_partials(oci_blob_store_t *s, long ttl_secs)
+{
+    if (!s)
+        return;
+
+    char tmp_dir[STORE_PATH_MAX];
+    if (!tmp_dir_path(s, tmp_dir, sizeof(tmp_dir)))
+        return;
+
+    DIR *dir = opendir(tmp_dir);
+    if (!dir)
+        return;
+
+    const time_t now = time(NULL);
+    for (struct dirent *ent = readdir(dir); ent; ent = readdir(dir)) {
+        if (strncmp(ent->d_name, "blob-", 5) != 0)
+            continue;
+
+        char path[STORE_PATH_MAX];
+        int n = snprintf(path, sizeof(path), "%s/%s", tmp_dir, ent->d_name);
+        if (n <= 0 || (size_t) n >= sizeof(path))
+            continue;
+
+        struct stat st;
+        if (stat(path, &st) < 0 || !S_ISREG(st.st_mode))
+            continue;
+
+        long age = (long) (now - st.st_mtime);
+        if (age >= ttl_secs)
+            (void) unlink(path);
+    }
+    closedir(dir);
+}
+
+/* Append len bytes from buf to the staging file and feed the same bytes to
+ * the digester. Returns false and marks the writer failed on a write error,
+ * a zero-length write (errno = EIO), or invalid arguments (errno = EINVAL);
+ * a writer that has already failed rejects further writes.
+ */
+bool oci_blob_writer_write(oci_blob_writer_t *w, const void *buf, size_t len)
+{
+    const bool bad_args = !w || w->failed || (!buf && len);
+    if (bad_args) {
+        if (w)
+            w->failed = true;
+        errno = EINVAL;
+        return false;
+    }
+
+    const uint8_t *cursor = buf;
+    size_t remaining = len;
+    while (remaining > 0) {
+        ssize_t put = write(w->fd, cursor, remaining);
+        if (put < 0 && errno == EINTR)
+            continue;
+        if (put <= 0) {
+            /* A zero-byte write carries no errno of its own. */
+            if (put == 0)
+                errno = EIO;
+            w->failed = true;
+            return false;
+        }
+        oci_digester_update(w->digester, cursor, (size_t) put);
+        cursor += put;
+        remaining -= (size_t) put;
+    }
+    return true;
+}
+
+/* Discard staging file, free fd and digester.
Errno is preserved across the + * cleanup so the caller can return its own diagnostic. + */ +static void writer_cleanup_fail(oci_blob_writer_t *w) +{ + int saved = errno; + if (w->fd >= 0) + (void) close(w->fd); + (void) unlink(w->tmp_path); + oci_digester_free(w->digester); + free(w); + errno = saved; +} + +/* fsync the directory that contains path so a newly linked/renamed entry is + * durable across a crash: fsync on the file persists its data but not the + * parent directory entry that names it. Best-effort -- the file-level fsync + * is the primary guarantee, and some filesystems reject a directory fsync, so + * a failure here must not fail the surrounding commit. + */ +static void fsync_parent_dir(const char *path) +{ + const char *slash = strrchr(path, '/'); + char dir[STORE_PATH_MAX]; + if (!slash) { + dir[0] = '.'; + dir[1] = '\0'; + } else if (slash == path) { + dir[0] = '/'; + dir[1] = '\0'; + } else { + size_t n = (size_t) (slash - path); + if (n >= sizeof(dir)) + return; + memcpy(dir, path, n); + dir[n] = '\0'; + } + int dfd = open(dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (dfd < 0) + return; + (void) fsync(dfd); + (void) close(dfd); +} + +int oci_blob_writer_commit(oci_blob_writer_t *w) +{ + if (!w) { + errno = EINVAL; + return -1; + } + if (w->failed) { + writer_cleanup_fail(w); + errno = EIO; + return -1; + } + + char got_hex[OCI_DIGEST_HEX_MAX + 1]; + if (oci_digester_finish_hex(w->digester, got_hex) == 0) { + writer_cleanup_fail(w); + errno = EIO; + return -1; + } + oci_digester_free(w->digester); + w->digester = NULL; + + if (strcmp(got_hex, w->expected_hex) != 0) { + if (w->fd >= 0) + (void) close(w->fd); + (void) unlink(w->tmp_path); + free(w); + errno = EINVAL; + return -1; + } + + if (fsync(w->fd) < 0) { + int saved = errno; + (void) close(w->fd); + (void) unlink(w->tmp_path); + free(w); + errno = saved; + return -1; + } + if (close(w->fd) < 0) { + int saved = errno; + w->fd = -1; + (void) unlink(w->tmp_path); + free(w); + errno = saved; 
+ return -1; + } + w->fd = -1; + + char final_path[STORE_PATH_MAX]; + int n = oci_blob_store_path(w->store, w->algo, w->expected_hex, final_path, + sizeof(final_path)); + if (n < 0 || (size_t) n >= sizeof(final_path)) { + (void) unlink(w->tmp_path); + free(w); + errno = ENAMETOOLONG; + return -1; + } + + if (link(w->tmp_path, final_path) < 0) { + if (errno != EEXIST) { + int saved = errno; + (void) unlink(w->tmp_path); + free(w); + errno = saved; + return -1; + } + /* Dedup hit: another writer beat this one. Content is identical + * because the digest matched, so dropping the staging file is the + * correct action. + */ + } + /* Persist the directory entry just created by link(2); without this a + * crash can leave the blob's data on disk but unreferenced by its name. + */ + fsync_parent_dir(final_path); + (void) unlink(w->tmp_path); + free(w); + return 0; +} + +void oci_blob_writer_abort(oci_blob_writer_t *w) +{ + if (!w) + return; + if (w->fd >= 0) + (void) close(w->fd); + (void) unlink(w->tmp_path); + oci_digester_free(w->digester); + free(w); +} + +int oci_blob_store_put_bytes(oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *expected_hex, + const void *buf, + size_t len) +{ + oci_blob_writer_t *w = oci_blob_writer_begin(s, algo, expected_hex); + if (!w) + return -1; + if (!oci_blob_writer_write(w, buf, len)) { + int saved = errno; + oci_blob_writer_abort(w); + errno = saved; + return -1; + } + return oci_blob_writer_commit(w); +} diff --git a/src/oci/blob-store.h b/src/oci/blob-store.h new file mode 100644 index 0000000..5345571 --- /dev/null +++ b/src/oci/blob-store.h @@ -0,0 +1,157 @@ +/* Content-addressable blob store for OCI image data + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Layout matches the OCI image-layout convention: + * + * /blobs// finalized blob, immutable + * /tmp/blob-- in-flight staging file + * + * Every blob is committed by writing the staging file, fsync'ing it, hashing + * the 
bytes as they stream through the writer, comparing the actual hex to + * the expected hex from the manifest descriptor, and then atomically renaming + * the staging file into its final blobs// slot. A digest mismatch + * unlinks the staging file before returning -1, so an interrupted or hostile + * pull leaves no visible-complete blob behind. Repeated commits of the same + * digest are dedup'd in place (final path already exists -> drop staging, + * report success). + * + * The store path is opaque to this module; the caller picks it. Phase 1 + * targets ~/Library/Application Support/elfuse/blobs/ on macOS; a later + * slice moves the root onto a case-sensitive APFS sparse volume (oci-roadmap + * Q1) but the store API does not change. + */ + +#pragma once + +#include +#include +#include + +#include "digest.h" + +typedef struct oci_blob_store oci_blob_store_t; +typedef struct oci_blob_writer oci_blob_writer_t; + +/* Open or create the store rooted at `root`. The directory tree (root, + * blobs/, tmp) is created with mode 0755 if missing. Returns NULL on + * failure with errno preserved. + */ +oci_blob_store_t *oci_blob_store_open(const char *root); + +/* Release the store handle. Does not delete on-disk state. Safe on NULL. */ +void oci_blob_store_close(oci_blob_store_t *s); + +/* Resolve the final on-disk path for algo:hex. Returns the number of bytes + * the full path occupies excluding the trailing NUL, or -1 if algo or hex + * is malformed. Always writes a NUL terminator when out_size > 0; if the + * full path does not fit, out is truncated but still NUL-terminated and the + * caller can detect overflow by comparing the return value to out_size. + */ +int oci_blob_store_path(const oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *hex, + char *out, + size_t out_size); + +/* True when blobs// exists as a regular file. 
*/ +bool oci_blob_store_has(const oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *hex); + +/* Begin a streaming write keyed by the descriptor digest. The writer hashes + * payload bytes as they stream and verifies the result against expected_hex + * during commit. Returns NULL on failure with errno preserved. expected_hex + * must be lowercase and the correct length for algo. + */ +oci_blob_writer_t *oci_blob_writer_begin(oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *expected_hex); + +/* Same contract as oci_blob_writer_begin but stages into + * tmp/blob--XXXXXX. The digest prefix in the filename lets + * parallel batch callers find their in-flight partials by digest (used by + * the curl_multi pull path's resume + sweep, plan-doc Plan 5). Both writer + * entry points produce final blobs at the same blobs// path and + * are otherwise interchangeable; pickers can choose based on whether they + * need digest-keyed staging. + */ +oci_blob_writer_t *oci_blob_writer_begin_named(oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *expected_hex); + +/* Open a writer that resumes from an existing tmp/blob--* + * partial when one is present. Scans tmp/ for files whose name starts with + * the digest prefix; selects the largest matching file; reopens it O_RDWR, + * re-hashes the bytes that are already on disk into the digester, and + * positions the fd at end-of-file so the next write appends. The selected + * partial keeps its name; siblings with the same prefix are unlinked. 
+ * + * Falls back to oci_blob_writer_begin_named (fresh mkstemp staging file, + * offset 0) when any of these holds: + * - tmp/ has no matching partial + * - partial size >= expected_size (corrupt or stale; would defeat the + * descriptor size cap during the resumed transfer) + * - partial size is zero + * - reopen / re-hash fails for any reason + * + * On success returns a writer and, when out_resume_offset is non-NULL, + * writes the partial byte count there (0 on the fallback paths). Returns + * NULL only on the hard-error conditions that oci_blob_writer_begin_named + * also returns NULL for (EINVAL on bad arguments, ENOMEM on calloc, etc.). + * + * expected_size must be the descriptor's declared blob size in bytes. The + * store does not know what the caller will subsequently issue as a Range + * request; the parameter is here so the store can pre-reject partials that + * are at or past the declared size (which would tip the streaming overflow + * gate downstream into a "blob exceeded declared size" failure rather than + * a clean restart). + */ +oci_blob_writer_t *oci_blob_writer_resume_named(oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *expected_hex, + int64_t expected_size, + int64_t *out_resume_offset); + +/* Delete tmp/ partials whose mtime is older than ttl_secs seconds ago. + * Matches any file in tmp/ whose name starts with "blob-"; non-matching + * names and subdirectories are skipped. Errors during the scan are + * silent: a missing tmp/ directory, a permission denial, or a per-file + * unlink failure leaves the rest of the sweep running. + * + * The store retains exclusive ownership of tmp/, so an aggressive prefix + * filter would not catch any third-party content. ttl_secs is the caller's + * choice; the fetcher invokes this once per batch with seven days. + */ +void oci_blob_store_sweep_partials(oci_blob_store_t *s, long ttl_secs); + +/* Append data to the staging file and the running digest. 
Returns true on + * success or false on a short write / I/O error with errno preserved. On + * failure the writer is left in a state where the only valid next call is + * oci_blob_writer_abort. + */ +bool oci_blob_writer_write(oci_blob_writer_t *w, const void *buf, size_t len); + +/* Finalize the digest, verify it against expected_hex, fsync the staging + * file, then publish it under its final blobs path with link(2) (not a + * rename). If the final path already exists the commit is a dedup hit and + * still reports success. Returns 0 on success. On digest mismatch returns -1 + * with errno set to EINVAL. On I/O failure returns -1 with errno set (EIO + * when an earlier oci_blob_writer_write had already failed). The staging + * file is always unlinked on failure so an aborted pull never leaves a + * visible-complete blob. + * + * The writer is released on every return, success or failure: do not call + * oci_blob_writer_abort (or anything else) on it once commit has returned. + */ +int oci_blob_writer_commit(oci_blob_writer_t *w); + +/* Discard the staging file and release the writer. Always succeeds; safe on + * NULL. + */ +void oci_blob_writer_abort(oci_blob_writer_t *w); + +/* One-shot helper: write a memory buffer into the store. Returns 0 on + * success or -1 on failure (errno preserved); semantics match the streaming + * commit path, including dedup and link(2)-based publication. + */ +int oci_blob_store_put_bytes(oci_blob_store_t *s, + oci_digest_algo_t algo, + const char *expected_hex, + const void *buf, + size_t len); diff --git a/src/oci/cli.c b/src/oci/cli.c new file mode 100644 index 0000000..f132137 --- /dev/null +++ b/src/oci/cli.c @@ -0,0 +1,1543 @@ +/* `elfuse oci` subcommand dispatch + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Slice 5a turns pull into a real subcommand: argument parsing for --store, + * -u USER[:PASS], --insecure-ca PEM, --insecure, -q, plus the actual oci_pull + * invocation against a freshly opened store and fetcher. Slice 5b extends + * inspect with --store and --all-platforms and an offline manifest tree + * renderer (src/oci/inspect.c). prune and list still return rc=2 "not + * implemented yet".
+ */ + +#include "cli.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "clone-rootfs.h" +#include "fetch.h" +#include "inspect.h" +#include "policy.h" +#include "pull.h" +#include "rebuild-cache.h" +#include "ref.h" +#include "run.h" +#include "status.h" +#include "store.h" +#include "unpack.h" +#include "volume.h" + +static int print_usage(FILE *out) +{ + fputs( + "usage: elfuse oci [args]\n" + "\n" + "Subcommands:\n" + " pull [OPTIONS] Download an image into the local store\n" + " inspect [OPTIONS] Show the canonical reference and parsed " + "fields\n" + " unpack [OPTIONS] Apply layers into a case-sensitive " + "sysroot\n" + " clone [OPTIONS] Create a per-run rootfs via APFS " + "clonefile\n" + " run [OPTIONS] [ARG...]\n" + " Launch a guest binary from a pulled image\n" + " prune Remove unreferenced blobs from the local " + "store\n" + " rebuild-cache Back-fill stack cache from unpacked " + "sysroots\n" + " status Report pins, unpacked sysroots, and cache " + "totals\n" + " list List images in the local store\n" + "\n" + "Pull options:\n" + " --store DIR Override the local store root\n" + " (default: ~/Library/Application " + "Support/elfuse/store)\n" + " -u, --user USER[:PASS] HTTP Basic auth for private registries\n" + " --insecure-ca PEM Trust PEM as the registry CA bundle\n" + " --insecure Skip TLS verify (loopback registries only)\n" + " --refresh Revalidate the pinned tag via " + "If-None-Match:\n" + " on 304 reuse the cached manifest and only\n" + " re-fetch missing layer blobs\n" + " -q, --quiet Suppress per-blob progress output\n" + "\n" + "Env: ELFUSE_OCI_PROGRESS=plain disables the in-place TTY\n" + " redraw (use when the terminal mis-handles CSI cursor-up;\n" + " prints one summary line per blob on completion).\n" + "\n" + "Policy: optional policy.json controls per-registry insecure /\n" + " ca_bundle / auth_file. 
Read from $ELFUSE_POLICY_FILE >\n" + " $XDG_CONFIG_HOME/elfuse/policy.json >\n" + " $HOME/.config/elfuse/policy.json >\n" + " $HOME/Library/Application Support/elfuse/policy.json.\n" + " CLI flags override; --quiet silences override warnings.\n" + "\n" + "Inspect options:\n" + " --store DIR Override the local store root\n" + " --volume DIR Include unpacked sysroots under DIR/images/\n" + " in the layer reuse comparison\n" + " --all-platforms List every platform entry of an image index\n" + " instead of drilling into linux/arm64\n" + "\n" + "Unpack options:\n" + " --store DIR Override the local store root\n" + " --volume DIR Override the sysroot APFS volume mount point\n" + " (default: auto-provisioned sparsebundle " + "under\n" + " ~/Library/Application " + "Support/elfuse/sysroots/)\n" + " --force Re-extract even if the image sysroot exists\n" + " -q, --quiet Suppress per-layer progress output\n" + "\n" + "Clone options:\n" + " --store DIR Override the local store root\n" + " --volume DIR Override the sysroot APFS volume mount point\n" + " --name NAME Human-friendly suffix for the per-run rootfs\n" + " --keep Do not register the run dir for cleanup " + "(no-op)\n" + "\n" + "Prune options:\n" + " --store DIR Override the local store root\n" + " --volume DIR Treat unpacked sysroots under DIR/images/ as " + "roots\n" + " --commit Actually unlink dangling blobs " + "(default: dry-run)\n" + " --older-than DUR Skip dangling blobs younger than DUR\n" + " (suffixes: s, m, h, d, w; plain integer = " + "seconds;\n" + " 0 = no filter)\n" + " --keep-bytes SIZE Keep up to SIZE bytes of newest dangling " + "blobs;\n" + " (suffixes: K, M, G; KiB-based; 0 = no " + "budget)\n" + "\n" + "Rebuild-cache options:\n" + " --store DIR Override the local store root\n" + " --volume DIR Override the sysroot APFS volume mount point\n" + " --commit Actually write stack snapshots " + "(default: dry-run)\n" + "\n" + "Status options:\n" + " --store DIR Override the local store root\n" + " 
--volume DIR Include unpacked sysroots under DIR/images/\n" + " in the report\n" + " --json Emit machine-readable JSON (schemaVersion 1)\n" + " --no-disk-usage Skip recursive size sums (faster on large " + "stores)\n" + "\n" + "Refs follow the docker/containerd grammar:\n" + " alpine, alpine:3.20, user/repo, ghcr.io/owner/img:tag,\n" + " repo@sha256:, repo:tag@sha256:\n", + out); + return out == stderr ? 2 : 0; +} + +/* Argument parser state for `oci inspect`. Mirrors pull_args_t in shape so a + * future cleanup could share the flag-loop, but the option set is disjoint + * enough that today the two parsers live side by side. + */ +typedef struct { + const char *store_root; + const char *volume_root; + bool show_all_platforms; + const char *ref_str; +} inspect_args_t; + +static int parse_inspect_args(int argc, char **argv, inspect_args_t *out) +{ + int i = 1; + while (i < argc) { + const char *a = argv[i]; + if (a[0] != '-') + break; + if (!strcmp(a, "--")) { + i++; + break; + } + if (!strcmp(a, "-h") || !strcmp(a, "--help")) { + return 1; + } else if (!strcmp(a, "--all-platforms")) { + out->show_all_platforms = true; + } else if (!strcmp(a, "--store")) { + if (++i >= argc) { + fputs("error: --store needs an argument\n", stderr); + return -1; + } + out->store_root = argv[i]; + } else if (!strcmp(a, "--volume")) { + if (++i >= argc) { + fputs("error: --volume needs an argument\n", stderr); + return -1; + } + out->volume_root = argv[i]; + } else { + fprintf(stderr, "error: unknown inspect option: %s\n", a); + return -1; + } + i++; + } + if (i >= argc) { + fputs("error: inspect needs a reference argument\n", stderr); + return -1; + } + if (i != argc - 1) { + fputs("error: extra arguments after inspect reference\n", stderr); + return -1; + } + out->ref_str = argv[i]; + return 0; +} + +static int cmd_inspect(int argc, char **argv) +{ + inspect_args_t args = {0}; + int prc = parse_inspect_args(argc, argv, &args); + if (prc == 1) + return print_usage(stdout); + if (prc < 
0) + return 2; + + oci_ref_t ref = {0}; + const char *err = NULL; + if (oci_ref_parse(args.ref_str, &ref, &err) < 0) { + fprintf(stderr, "error: %s\n", err ? err : "invalid reference"); + return 1; + } + char *canonical = oci_ref_canonical(&ref); + if (!canonical) { + fputs("error: out of memory rendering canonical reference\n", stderr); + oci_ref_free(&ref); + return 1; + } + printf("canonical: %s\n", canonical); + printf("registry: %s\n", ref.registry); + printf("repository: %s\n", ref.repository); + printf("tag: %s\n", ref.tag ? ref.tag : "(none)"); + printf("digest: %s\n", ref.digest ? ref.digest : "(none)"); + free(canonical); + + /* Resolve store root: --store override or platform default. */ + char *default_root = NULL; + const char *store_root = args.store_root; + if (!store_root) { + default_root = oci_store_default_root(); + if (!default_root) { + fprintf(stderr, + "error: could not determine default store root " + "(HOME not set?)\n"); + oci_ref_free(&ref); + return 1; + } + store_root = default_root; + } + + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: could not open store at %s: %s\n", store_root, + strerror(errno)); + oci_ref_free(&ref); + free(default_root); + return 1; + } + + oci_inspect_options_t opts = { + .out = stdout, + .show_all_platforms = args.show_all_platforms, + .volume_root = args.volume_root, + }; + err = NULL; + int rc = oci_inspect(store, &ref, &opts, &err); + if (rc < 0 && err) + fprintf(stderr, "error: %s\n", err); + + oci_store_close(store); + oci_ref_free(&ref); + free(default_root); + return rc < 0 ? 1 : 0; +} + +/* Argument parser state for `oci pull`. Defaults are populated by the caller, + * then patched by parse_pull_args. 
+ */ +typedef struct { + const char *store_root; /* borrowed from argv (--store); NULL = default */ + const char *user; + const char *password; + const char *ca_file; + bool allow_insecure; + bool quiet; + bool refresh; + const char *ref_str; + char *user_pass_buf; /* heap; backs user/password; freed by caller */ +} pull_args_t; + +/* Split a heap copy of USER[:PASS] at the first ':' and point out->user / + * out->password into it; the copy lives in out->user_pass_buf (any previous + * one is freed first). A spec without ':' yields an empty password. Returns + * 0 on success or -1 with errno=ENOMEM. + */ +static int split_userpass(const char *spec, pull_args_t *out) +{ + free(out->user_pass_buf); + out->user_pass_buf = strdup(spec); + if (!out->user_pass_buf) { + errno = ENOMEM; + return -1; + } + char *colon = strchr(out->user_pass_buf, ':'); + if (colon) { + *colon = '\0'; + out->user = out->user_pass_buf; + out->password = colon + 1; + } else { + out->user = out->user_pass_buf; + out->password = ""; + } + return 0; +} + +/* argv layout coming in: ["pull", "--flag", "...", "REF"]. argv[0] is the + * subcommand name; argv[argc-1] is the ref. Anything in between is options; + * option parsing stops at the first non-dash argument or after "--". + * Returns 0 on success, 1 when -h/--help was given (nothing is printed; the + * caller shows usage), -1 on bad arguments (after printing an error).
+ */ +static int parse_pull_args(int argc, char **argv, pull_args_t *out) +{ + int i = 1; + while (i < argc) { + const char *a = argv[i]; + if (a[0] != '-') + break; + if (!strcmp(a, "--")) { + i++; + break; + } + if (!strcmp(a, "-h") || !strcmp(a, "--help")) { + return 1; + } else if (!strcmp(a, "-q") || !strcmp(a, "--quiet")) { + out->quiet = true; + } else if (!strcmp(a, "--refresh")) { + out->refresh = true; + } else if (!strcmp(a, "--insecure")) { + out->allow_insecure = true; + } else if (!strcmp(a, "--store")) { + if (++i >= argc) { + fputs("error: --store needs an argument\n", stderr); + return -1; + } + out->store_root = argv[i]; + } else if (!strcmp(a, "-u") || !strcmp(a, "--user")) { + if (++i >= argc) { + fputs("error: -u needs USER[:PASS]\n", stderr); + return -1; + } + if (split_userpass(argv[i], out) < 0) { + fputs("error: out of memory parsing credentials\n", stderr); + return -1; + } + } else if (!strcmp(a, "--insecure-ca")) { + if (++i >= argc) { + fputs("error: --insecure-ca needs a PEM path\n", stderr); + return -1; + } + out->ca_file = argv[i]; + } else { + fprintf(stderr, "error: unknown pull option: %s\n", a); + return -1; + } + i++; + } + if (i >= argc) { + fputs("error: pull needs a reference argument\n", stderr); + return -1; + } + if (i != argc - 1) { + fputs("error: extra arguments after pull reference\n", stderr); + return -1; + } + out->ref_str = argv[i]; + return 0; +} + +static int cmd_pull(int argc, char **argv) +{ + pull_args_t args = {0}; + int prc = parse_pull_args(argc, argv, &args); + if (prc == 1) { + free(args.user_pass_buf); + return print_usage(stdout); + } + if (prc < 0) { + free(args.user_pass_buf); + return 2; + } + + /* Default store root: either --store override or the platform default.
*/ + char *default_root = NULL; + const char *store_root = args.store_root; + if (!store_root) { + default_root = oci_store_default_root(); + if (!default_root) { + fprintf(stderr, + "error: could not determine default store root " + "(HOME not set?)\n"); + free(args.user_pass_buf); + return 1; + } + store_root = default_root; + } + + oci_ref_t ref = {0}; + const char *err = NULL; + if (oci_ref_parse(args.ref_str, &ref, &err) < 0) { + fprintf(stderr, "error: invalid reference: %s\n", + err ? err : "(unknown)"); + free(default_root); + free(args.user_pass_buf); + return 1; + } + + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: could not open store at %s: %s\n", store_root, + strerror(errno)); + oci_ref_free(&ref); + free(default_root); + free(args.user_pass_buf); + return 1; + } + + oci_policy_t *policy = NULL; + const char *perr = NULL; + if (oci_policy_load(&policy, &perr) < 0) { + fprintf(stderr, "error: policy load failed: %s\n", + perr ? perr : strerror(errno)); + oci_policy_free(policy); + oci_store_close(store); + oci_ref_free(&ref); + free(default_root); + free(args.user_pass_buf); + return 1; + } + + /* Warn when a CLI flag overrides a policy-declared value for the same + * registry. The check is gated on having actually loaded a policy file + * (source != "") so the user sees nothing when no policy is configured. + * The warn surface is intentionally minimal: one line per overridden + * field, host-scoped, no JSON pointer plumbing. --quiet silences all. 
+ */ + if (!args.quiet && policy && oci_policy_source(policy)[0] != '\0') { + oci_policy_effective_t pol; + oci_policy_lookup(policy, ref.registry, &pol); + if (args.allow_insecure && !pol.insecure) + fprintf(stderr, + "warning: --insecure overrides policy.insecure for %s\n", + ref.registry); + if (args.user && pol.auth_file) + fprintf(stderr, "warning: -u overrides policy.auth_file for %s\n", + ref.registry); + if (args.ca_file && pol.ca_bundle) + fprintf( + stderr, + "warning: --insecure-ca overrides policy.ca_bundle for %s\n", + ref.registry); + } + + oci_fetcher_options_t fopts = { + .username = args.user, + .password = args.password, + .ca_file = args.ca_file, + .allow_insecure = args.allow_insecure, + .policy = policy, + }; + oci_fetcher_t *fetcher = oci_fetcher_new(&fopts); + if (!fetcher) { + fprintf(stderr, "error: could not create fetcher: %s\n", + strerror(errno)); + oci_policy_free(policy); + oci_store_close(store); + oci_ref_free(&ref); + free(default_root); + free(args.user_pass_buf); + return 1; + } + + if (!args.quiet) { + char *canon = oci_ref_canonical(&ref); + fprintf(stderr, "elfuse oci pull %s\n store: %s\n", + canon ? canon : args.ref_str, store_root); + free(canon); + } + + oci_pull_options_t popts = { + .quiet = args.quiet, + .refresh = args.refresh, + }; + err = NULL; + int rc = oci_pull(fetcher, store, &ref, &popts, &err); + if (rc < 0) { + fprintf(stderr, "error: pull failed: %s\n", + err ? err : strerror(errno)); + } else if (!args.quiet) { + fputs("done.\n", stderr); + } + + oci_fetcher_free(fetcher); + oci_policy_free(policy); + oci_store_close(store); + oci_ref_free(&ref); + free(default_root); + free(args.user_pass_buf); + return rc < 0 ? 
1 : 0; +} + +typedef struct { + const char *store_root; + const char *volume_root; + const char *ref_str; + const char *name; /* clone only */ + bool quiet; + bool force_relayer; + bool keep_on_exit; /* clone only */ +} unpack_args_t; + +static int parse_unpack_args(int argc, + char **argv, + unpack_args_t *out, + bool clone_mode) +{ + int i = 1; + while (i < argc) { + const char *a = argv[i]; + if (a[0] != '-') + break; + if (!strcmp(a, "--")) { + i++; + break; + } + if (!strcmp(a, "-h") || !strcmp(a, "--help")) { + return 1; + } else if (!strcmp(a, "-q") || !strcmp(a, "--quiet")) { + out->quiet = true; + } else if (!strcmp(a, "--force")) { + if (clone_mode) { + fputs("error: --force is not valid for oci clone\n", stderr); + return -1; + } + out->force_relayer = true; + } else if (!strcmp(a, "--keep")) { + if (!clone_mode) { + fputs("error: --keep is only valid for oci clone\n", stderr); + return -1; + } + out->keep_on_exit = true; + } else if (!strcmp(a, "--store")) { + if (++i >= argc) { + fputs("error: --store needs an argument\n", stderr); + return -1; + } + out->store_root = argv[i]; + } else if (!strcmp(a, "--volume")) { + if (++i >= argc) { + fputs("error: --volume needs an argument\n", stderr); + return -1; + } + out->volume_root = argv[i]; + } else if (clone_mode && !strcmp(a, "--name")) { + if (++i >= argc) { + fputs("error: --name needs an argument\n", stderr); + return -1; + } + out->name = argv[i]; + } else { + fprintf(stderr, "error: unknown option: %s\n", a); + return -1; + } + i++; + } + if (i >= argc) { + fputs("error: subcommand needs a reference argument\n", stderr); + return -1; + } + if (i != argc - 1) { + fputs("error: extra arguments after reference\n", stderr); + return -1; + } + out->ref_str = argv[i]; + return 0; +} + +static int do_unpack(const unpack_args_t *args, + char **out_image_dir, + oci_store_t **out_store_keep) +{ + char *default_root = NULL; + const char *store_root = args->store_root; + if (!store_root) { + default_root = 
oci_store_default_root(); + if (!default_root) { + fprintf(stderr, + "error: could not determine default store root (HOME?)\n"); + return 1; + } + store_root = default_root; + } + + oci_ref_t ref = {0}; + const char *err = NULL; + if (oci_ref_parse(args->ref_str, &ref, &err) < 0) { + fprintf(stderr, "error: invalid reference: %s\n", + err ? err : "(unknown)"); + free(default_root); + return 1; + } + + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: could not open store at %s: %s\n", store_root, + strerror(errno)); + oci_ref_free(&ref); + free(default_root); + return 1; + } + + oci_unpack_options_t uopts = { + .volume_root = args->volume_root, + .quiet = args->quiet, + .force_relayer = args->force_relayer, + }; + err = NULL; + int rc = oci_unpack(store, &ref, &uopts, out_image_dir, &err); + if (rc < 0) { + fprintf(stderr, "error: unpack failed: %s\n", + err ? err : strerror(errno)); + oci_store_close(store); + oci_ref_free(&ref); + free(default_root); + return 1; + } + oci_ref_free(&ref); + free(default_root); + if (out_store_keep) + *out_store_keep = store; + else + oci_store_close(store); + return 0; +} + +static int cmd_unpack(int argc, char **argv) +{ + unpack_args_t args = {0}; + int prc = parse_unpack_args(argc, argv, &args, false); + if (prc == 1) + return print_usage(stdout); + if (prc < 0) + return 2; + char *image_dir = NULL; + int rc = do_unpack(&args, &image_dir, NULL); + if (rc != 0) { + free(image_dir); + return rc; + } + /* stdout: just the absolute path so $(elfuse oci unpack ref) composes. 
*/ + printf("%s\n", image_dir); + free(image_dir); + return 0; +} + +/* `oci clone`: run the shared unpack step, then clone the resulting image + * sysroot into a per-run rootfs and print the run directory on stdout. + * Returns 0 on success (or after printing usage for -h), 2 on bad arguments, + * and a non-zero code when unpack, volume resolution, or the clone fails. + */ +static int cmd_clone(int argc, char **argv) +{ + unpack_args_t args = {0}; + int prc = parse_unpack_args(argc, argv, &args, true); + if (prc == 1) + return print_usage(stdout); + if (prc < 0) + return 2; + + char *image_dir = NULL; + oci_store_t *store = NULL; + int rc = do_unpack(&args, &image_dir, &store); + if (rc != 0) { + free(image_dir); + return rc; + } + oci_store_close(store); + + /* Resolve the volume root the same way unpack did so clone-rootfs + * lands in the same sparsebundle. + */ + char *volume_root = NULL; + const char *err = NULL; + if (oci_volume_ensure(args.volume_root, &volume_root, &err) < 0) { + fprintf(stderr, "error: volume_ensure failed: %s\n", + err ? err : strerror(errno)); + free(image_dir); + return 1; + } + + /* image_dir has a trailing slash; strip it for the clone source. */ + size_t il = strlen(image_dir); + if (il > 1 && image_dir[il - 1] == '/') + image_dir[il - 1] = '\0'; + + /* NOTE(review): args.name (--name) is parsed but never read in this + * function, so the flag currently has no effect here -- confirm whether + * oci_clone_rootfs is meant to receive it. + */ + char *run_dir = NULL; + err = NULL; + if (oci_clone_rootfs(image_dir, volume_root, &run_dir, &err) < 0) { + fprintf(stderr, "error: clone failed: %s\n", + err ? err : strerror(errno)); + free(image_dir); + free(volume_root); + return 1; + } + /* --keep is forward-looking; Phase 2 does not auto-clean either way. */ + (void) args.keep_on_exit; + printf("%s\n", run_dir); + free(run_dir); + free(image_dir); + free(volume_root); + return 0; +} + +/* Shared stub for subcommands that are not wired up yet: prints a fixed + * diagnostic naming the subcommand on stderr and returns 2. + */ +static int cmd_not_implemented(const char *name) +{ + fprintf(stderr, + "error: 'oci %s' is not implemented yet (see issue #31 Phase 1)\n", + name); + return 2; +} + +/* Argument parser state for `oci prune`. The flag set is intentionally + * minimal: dry-run is the default (so the operator can review what would + * be reclaimed before committing) and --commit is the only switch that + * actually unlinks.
--volume mirrors the same flag in unpack/clone so
+ * the same volume root the user uses for unpacked sysroots also feeds
+ * the keep-set walk; without --volume only pins contribute.
+ *
+ * older_than_sec / keep_bytes default to 0, which the store API
+ * interprets as "no filter" so an operator that does not opt in sees
+ * the C1.3 behaviour (every dangling blob is pruned). The CLI does
+ * not distinguish between "not specified" and "--older-than 0" /
+ * "--keep-bytes 0" because both compose to the same zero-filter
+ * behaviour; a future structured-output mode (Plan 4 oci status) can
+ * surface filter state from the rendered options struct directly.
+ */
+typedef struct {
+    const char *store_root;
+    const char *volume_root;
+    bool commit;
+    uint64_t older_than_sec;
+    uint64_t keep_bytes;
+} prune_args_t;
+
+/* Parse a duration string into seconds. Accepted shapes are
+ *   <N>   pure integer interpreted as seconds
+ *   <N>s  seconds
+ *   <N>m  minutes (60s)
+ *   <N>h  hours (3600s)
+ *   <N>d  days (86400s)
+ *   <N>w  weeks (604800s)
+ * where <N> is a decimal unsigned integer with no sign character. The
+ * trailing suffix, when present, is a single ASCII letter; any other
+ * trailing bytes are rejected. Overflow is detected by checking the
+ * intermediate product against UINT64_MAX before applying it. Returns
+ * 0 on success with the value written to *out; -1 on any parse or
+ * overflow failure with errno=EINVAL.
+ */
+static int parse_duration(const char *s, uint64_t *out)
+{
+    if (!s || !*s) {
+        errno = EINVAL;
+        return -1;
+    }
+    /* strtoull silently accepts a leading '-' and wraps the result;
+     * detect a negative sign and the leading-whitespace skip
+     * explicitly so a user-facing flag never quietly parses "-5d" as
+     * a huge positive duration.
+     */
+    /* Require a leading decimal digit. strtoull skips every isspace()
+     * character (including '\n', '\v', '\f' and '\r') before it accepts
+     * a sign, so rejecting only ' ' and '\t' would still let "\n-5"
+     * through as a wrapped, huge value.
+     */
+    if (*s < '0' || *s > '9') {
+        errno = EINVAL;
+        return -1;
+    }
+    char *endp = NULL;
+    errno = 0;
+    unsigned long long raw = strtoull(s, &endp, 10);
+    if (errno == ERANGE) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (!endp || endp == s) {
+        errno = EINVAL;
+        return -1;
+    }
+    uint64_t value = (uint64_t) raw;
+    uint64_t multiplier = 1;
+    if (*endp != '\0') {
+        /* Exactly one suffix byte is allowed after the digits. */
+        if (endp[1] != '\0') {
+            errno = EINVAL;
+            return -1;
+        }
+        switch (*endp) {
+        case 's':
+            multiplier = 1;
+            break;
+        case 'm':
+            multiplier = 60;
+            break;
+        case 'h':
+            multiplier = 3600;
+            break;
+        case 'd':
+            multiplier = 86400;
+            break;
+        case 'w':
+            multiplier = 604800;
+            break;
+        default:
+            errno = EINVAL;
+            return -1;
+        }
+    }
+    /* Reject value * multiplier before it can wrap past UINT64_MAX. */
+    if (multiplier != 0 && value > UINT64_MAX / multiplier) {
+        errno = EINVAL;
+        return -1;
+    }
+    *out = value * multiplier;
+    return 0;
+}
+
+/* Parse a byte-size string into bytes. Accepted shapes are
+ *   <N>            pure integer interpreted as bytes
+ *   <N>K / <N>KB   1024 bytes per unit
+ *   <N>M / <N>MB   1024 * 1024 bytes per unit
+ *   <N>G / <N>GB   1024 * 1024 * 1024 bytes per unit
+ * matching du / df conventions (KiB-based, not decimal). The
+ * trailing suffix is at most two letters, case-sensitive, and the
+ * second letter when present must be 'B'. Negative inputs and
+ * arithmetic overflow are rejected with EINVAL; on success returns 0
+ * and stores the byte count in *out.
+ */
+static int parse_byte_size(const char *s, uint64_t *out)
+{
+    if (!s || !*s) {
+        errno = EINVAL;
+        return -1;
+    }
+    /* Require a leading decimal digit. strtoull skips every isspace()
+     * character (including '\n', '\v', '\f' and '\r') and then accepts
+     * a sign, so rejecting only '-', '+', ' ' and '\t' would still let
+     * "\n-5" through as a wrapped, huge byte count.
+     */
+    if (*s < '0' || *s > '9') {
+        errno = EINVAL;
+        return -1;
+    }
+    char *endp = NULL;
+    errno = 0;
+    unsigned long long raw = strtoull(s, &endp, 10);
+    if (errno == ERANGE) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (!endp || endp == s) {
+        errno = EINVAL;
+        return -1;
+    }
+    uint64_t value = (uint64_t) raw;
+    uint64_t multiplier = 1;
+    if (*endp != '\0') {
+        char unit = *endp;
+        char trailer = endp[1];
+        /* Anything after the unit letter must be exactly one 'B'. */
+        if (trailer != '\0' && (trailer != 'B' || endp[2] != '\0')) {
+            errno = EINVAL;
+            return -1;
+        }
+        switch (unit) {
+        case 'K':
+            multiplier = 1024ULL;
+            break;
+        case 'M':
+            multiplier = 1024ULL * 1024ULL;
+            break;
+        case 'G':
+            multiplier = 1024ULL * 1024ULL * 1024ULL;
+            break;
+        default:
+            errno = EINVAL;
+            return -1;
+        }
+    }
+    /* Reject value * multiplier before it can wrap past UINT64_MAX. */
+    if (multiplier != 0 && value > UINT64_MAX / multiplier) {
+        errno = EINVAL;
+        return -1;
+    }
+    *out = value * multiplier;
+    return 0;
+}
+
+static int parse_prune_args(int argc, char **argv, prune_args_t *out)
+{
+    int i = 1;
+    while (i < argc) {
+        const char *a = argv[i];
+        if (a[0] != '-')
+            break;
+        if (!strcmp(a, "--")) {
+            i++;
+            break;
+        }
+        if (!strcmp(a, "-h") || !strcmp(a, "--help")) {
+            return 1;
+        } else if (!strcmp(a, "--commit")) {
+            out->commit = true;
+        } else if (!strcmp(a, "--store")) {
+            if (++i >= argc) {
+                fputs("error: --store needs an argument\n", stderr);
+                return -1;
+            }
+            out->store_root = argv[i];
+        } else if (!strcmp(a, "--volume")) {
+            if (++i >= argc) {
+                fputs("error: --volume needs an argument\n", stderr);
+                return -1;
+            }
+            out->volume_root = argv[i];
+        } else if (!strcmp(a, "--older-than")) {
+            if (++i >= argc) {
+                fputs("error: --older-than needs an argument\n", stderr);
+                return -1;
+            }
+            if (parse_duration(argv[i], &out->older_than_sec) < 0) {
+                fprintf(stderr, "error: --older-than: invalid duration '%s'\n",
+                        argv[i]);
+                return -1;
+            }
+        } else if
(!strcmp(a, "--keep-bytes")) { + if (++i >= argc) { + fputs("error: --keep-bytes needs an argument\n", stderr); + return -1; + } + if (parse_byte_size(argv[i], &out->keep_bytes) < 0) { + fprintf(stderr, "error: --keep-bytes: invalid byte size '%s'\n", + argv[i]); + return -1; + } + } else { + fprintf(stderr, "error: unknown prune option: %s\n", a); + return -1; + } + i++; + } + if (i != argc) { + fputs("error: prune takes no positional arguments\n", stderr); + return -1; + } + return 0; +} + +static int cmd_prune(int argc, char **argv) +{ + prune_args_t args = {0}; + int prc = parse_prune_args(argc, argv, &args); + if (prc == 1) + return print_usage(stdout); + if (prc < 0) + return 2; + + char *default_root = NULL; + const char *store_root = args.store_root; + if (!store_root) { + default_root = oci_store_default_root(); + if (!default_root) { + fprintf(stderr, + "error: could not determine default store root " + "(HOME not set?)\n"); + return 1; + } + store_root = default_root; + } + + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: could not open store at %s: %s\n", store_root, + strerror(errno)); + free(default_root); + return 1; + } + + oci_store_prune_options_t opts = { + .commit = args.commit, + .volume_root = args.volume_root, + .older_than_sec = args.older_than_sec, + .keep_bytes = args.keep_bytes, + }; + const char *err = NULL; + int rc = oci_store_prune(store, &opts, &err); + if (rc < 0) { + fprintf(stderr, "error: prune failed: %s\n", + err ? err : strerror(errno)); + oci_store_close(store); + free(default_root); + return 1; + } + + /* Output preserves the Plan 1 line shape so existing operator scripts + * and the compat smoke continue to match on "reclaimable: N blobs" / + * "reclaimed: N blobs" / "kept: M blobs" / "dry-run". The new layer + * and stack lines (C3.3d) only render when their counter is non-zero + * so a single-family cache still produces the legacy two-line output. 
+ */ + const char *verb_done = args.commit ? "reclaimed" : "reclaimable"; + const char *verb_pre = args.commit ? "reclaimed" : "reclaimable"; + if (args.commit) { + printf("reclaimed: %zu blobs (%llu bytes)\n", opts.pruned_blobs, + (unsigned long long) opts.pruned_bytes); + if (opts.pruned_layers > 0) + printf("layers: %zu %s (%llu bytes)\n", opts.pruned_layers, + verb_done, (unsigned long long) opts.pruned_layer_bytes); + if (opts.pruned_stacks > 0) + printf("stacks: %zu %s (%llu bytes)\n", opts.pruned_stacks, + verb_done, (unsigned long long) opts.pruned_stack_bytes); + if (opts.skipped_blobs > 0) + printf("skipped: %zu blobs (%llu bytes)\n", opts.skipped_blobs, + (unsigned long long) opts.skipped_bytes); + if (opts.skipped_layers > 0) + printf("layers: %zu skipped (%llu bytes)\n", opts.skipped_layers, + (unsigned long long) opts.skipped_layer_bytes); + if (opts.skipped_stacks > 0) + printf("stacks: %zu skipped (%llu bytes)\n", opts.skipped_stacks, + (unsigned long long) opts.skipped_stack_bytes); + printf("kept: %zu blobs\n", opts.kept_blobs); + if (opts.kept_layers > 0) + printf("kept: %zu layers\n", opts.kept_layers); + if (opts.kept_stacks > 0) + printf("kept: %zu stacks\n", opts.kept_stacks); + } else { + printf("reclaimable: %zu blobs (%llu bytes)\n", opts.pruned_blobs, + (unsigned long long) opts.pruned_bytes); + if (opts.pruned_layers > 0) + printf("layers: %zu %s (%llu bytes)\n", opts.pruned_layers, + verb_pre, (unsigned long long) opts.pruned_layer_bytes); + if (opts.pruned_stacks > 0) + printf("stacks: %zu %s (%llu bytes)\n", opts.pruned_stacks, + verb_pre, (unsigned long long) opts.pruned_stack_bytes); + if (opts.skipped_blobs > 0) + printf("skipped: %zu blobs (%llu bytes)\n", opts.skipped_blobs, + (unsigned long long) opts.skipped_bytes); + if (opts.skipped_layers > 0) + printf("layers: %zu skipped (%llu bytes)\n", + opts.skipped_layers, + (unsigned long long) opts.skipped_layer_bytes); + if (opts.skipped_stacks > 0) + printf("stacks: %zu skipped 
(%llu bytes)\n", + opts.skipped_stacks, + (unsigned long long) opts.skipped_stack_bytes); + printf("kept: %zu blobs\n", opts.kept_blobs); + if (opts.kept_layers > 0) + printf("kept: %zu layers\n", opts.kept_layers); + if (opts.kept_stacks > 0) + printf("kept: %zu stacks\n", opts.kept_stacks); + printf("(dry-run; pass --commit to delete)\n"); + } + + oci_store_close(store); + free(default_root); + return 0; +} + +/* Argument parser state for oci rebuild-cache. Mirrors prune_args_t in + * shape because both subcommands carry --store / --volume / --commit; the + * two parsers stay disjoint so a future option addition to either does not + * surprise the other. + */ +typedef struct { + const char *store_root; + const char *volume_root; + bool commit; +} rebuild_cache_args_t; + +static int parse_rebuild_cache_args(int argc, + char **argv, + rebuild_cache_args_t *out) +{ + int i = 1; + while (i < argc) { + const char *a = argv[i]; + if (a[0] != '-') + break; + if (!strcmp(a, "--")) { + i++; + break; + } + if (!strcmp(a, "-h") || !strcmp(a, "--help")) { + return 1; + } else if (!strcmp(a, "--commit")) { + out->commit = true; + } else if (!strcmp(a, "--store")) { + if (++i >= argc) { + fputs("error: --store needs an argument\n", stderr); + return -1; + } + out->store_root = argv[i]; + } else if (!strcmp(a, "--volume")) { + if (++i >= argc) { + fputs("error: --volume needs an argument\n", stderr); + return -1; + } + out->volume_root = argv[i]; + } else { + fprintf(stderr, "error: unknown rebuild-cache option: %s\n", a); + return -1; + } + i++; + } + if (i != argc) { + fputs("error: rebuild-cache takes no positional arguments\n", stderr); + return -1; + } + return 0; +} + +static int cmd_rebuild_cache(int argc, char **argv) +{ + rebuild_cache_args_t args = {0}; + int prc = parse_rebuild_cache_args(argc, argv, &args); + if (prc == 1) + return print_usage(stdout); + if (prc < 0) + return 2; + + char *default_root = NULL; + const char *store_root = args.store_root; + if 
(!store_root) { + default_root = oci_store_default_root(); + if (!default_root) { + fprintf(stderr, + "error: could not determine default store root " + "(HOME not set?)\n"); + return 1; + } + store_root = default_root; + } + + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: could not open store at %s: %s\n", store_root, + strerror(errno)); + free(default_root); + return 1; + } + + oci_rebuild_cache_options_t opts = { + .commit = args.commit, + }; + const char *err = NULL; + int rc = oci_rebuild_cache(store, args.volume_root, &opts, &err); + if (rc < 0) { + fprintf(stderr, "error: rebuild-cache failed: %s\n", + err ? err : strerror(errno)); + oci_store_close(store); + free(default_root); + return 1; + } + + size_t skipped_bad = opts.trees_skipped_no_origin + + opts.trees_skipped_bad_origin + + opts.trees_skipped_empty_diffids; + + if (args.commit) { + printf("rebuild-cache:\n"); + printf(" scanned: %zu unpacked trees\n", opts.trees_scanned); + printf(" rebuilt: %zu trees (%zu stack entries)\n", + opts.trees_rebuilt, opts.stack_entries_added); + printf(" already cached: %zu trees\n", opts.trees_skipped_cached); + if (skipped_bad > 0) + printf(" skipped (bad): %zu trees\n", skipped_bad); + if (opts.trees_failed > 0) + printf(" failed: %zu trees\n", opts.trees_failed); + } else { + printf("rebuild-cache (dry-run):\n"); + printf(" scanned: %zu unpacked trees\n", opts.trees_scanned); + printf(" would rebuild: %zu trees (%zu stack entries)\n", + opts.trees_rebuilt, opts.stack_entries_added); + printf(" already cached: %zu trees\n", opts.trees_skipped_cached); + if (skipped_bad > 0) + printf(" skipped (bad): %zu trees\n", skipped_bad); + if (opts.trees_failed > 0) + printf(" failed: %zu trees\n", opts.trees_failed); + printf("(dry-run; pass --commit to write)\n"); + } + + oci_store_close(store); + free(default_root); + return 0; +} + +/* Argument parser state for `oci status`. 
The flag set is intentionally + * small: store / volume mirrors prune / rebuild-cache, --json toggles the + * structured output, --no-disk-usage is the operator escape hatch for very + * large stores where the recursive size walk dominates wall time. + */ +typedef struct { + const char *store_root; + const char *volume_root; + bool json; + bool no_disk_usage; +} status_args_t; + +static int parse_status_args(int argc, char **argv, status_args_t *out) +{ + int i = 1; + while (i < argc) { + const char *a = argv[i]; + if (a[0] != '-') + break; + if (!strcmp(a, "--")) { + i++; + break; + } + if (!strcmp(a, "-h") || !strcmp(a, "--help")) { + return 1; + } else if (!strcmp(a, "--json")) { + out->json = true; + } else if (!strcmp(a, "--no-disk-usage")) { + out->no_disk_usage = true; + } else if (!strcmp(a, "--store")) { + if (++i >= argc) { + fputs("error: --store needs an argument\n", stderr); + return -1; + } + out->store_root = argv[i]; + } else if (!strcmp(a, "--volume")) { + if (++i >= argc) { + fputs("error: --volume needs an argument\n", stderr); + return -1; + } + out->volume_root = argv[i]; + } else { + fprintf(stderr, "error: unknown status option: %s\n", a); + return -1; + } + i++; + } + if (i != argc) { + fputs("error: status takes no positional arguments\n", stderr); + return -1; + } + return 0; +} + +/* Render a byte count compactly. Values >= 1 MiB use "~X.Y MiB", smaller + * non-zero values render as raw bytes, zero stays "0 B". Mirrors the inspect + * renderer's shared_bytes formatter so the two surfaces look consistent. + */ +static void format_bytes(uint64_t bytes, char *out, size_t cap) +{ + if (bytes == 0) { + snprintf(out, cap, "0 B"); + return; + } + if (bytes >= (uint64_t) 1024 * 1024) { + double mib = (double) bytes / (1024.0 * 1024.0); + snprintf(out, cap, "~%.1f MiB", mib); + return; + } + snprintf(out, cap, "%llu B", (unsigned long long) bytes); +} + +/* Render epoch seconds as a fixed-width "YYYY-MM-DD HH:MM" string in local + * time. 
Out buffer must hold at least 17 bytes. Negative or zero epochs
+ * render as "(unknown)", as does a localtime_r or strftime failure.
+ */
+static void format_mtime(int64_t epoch, char *out, size_t cap)
+{
+    if (epoch <= 0) {
+        snprintf(out, cap, "(unknown)");
+        return;
+    }
+    time_t t = (time_t) epoch;
+    struct tm lt;
+    if (!localtime_r(&t, &lt)) {
+        snprintf(out, cap, "(unknown)");
+        return;
+    }
+    /* strftime returns 0 and leaves the buffer indeterminate when the
+     * result does not fit, so fall back instead of printing garbage.
+     */
+    if (strftime(out, cap, "%Y-%m-%d %H:%M", &lt) == 0)
+        snprintf(out, cap, "(unknown)");
+}
+
+/* Truncate a digest to its first 19 characters followed by "..." (22
+ * characters in total) so wide manifest digests still align in the table;
+ * digests of 22 characters or fewer are copied unchanged and NULL renders
+ * as "(null)". Mirrors the short_digest helper in inspect.c without
+ * the dependency on that translation unit.
+ */
+static void short_digest(const char *full, char out[24])
+{
+    if (!full) {
+        snprintf(out, 24, "(null)");
+        return;
+    }
+    size_t len = strlen(full);
+    if (len <= 22) {
+        snprintf(out, 24, "%s", full);
+        return;
+    }
+    snprintf(out, 24, "%.19s...", full);
+}
+
+static const char *pin_status_label(oci_status_pin_code_t c)
+{
+    switch (c) {
+    case OCI_STATUS_PIN_OK:
+        return "ok";
+    case OCI_STATUS_PIN_MISSING_MANIFEST:
+        return "missing manifest";
+    case OCI_STATUS_PIN_CORRUPT_MANIFEST:
+        return "corrupt manifest";
+    case OCI_STATUS_PIN_CORRUPT_CONFIG:
+        return "corrupt config";
+    case OCI_STATUS_PIN_INDEX_NO_ARM64:
+        return "no linux/arm64 entry";
+    }
+    return "unknown";
+}
+
+static const char *unpacked_status_label(oci_status_unpacked_code_t c)
+{
+    switch (c) {
+    case OCI_STATUS_UNPACKED_OK:
+        return "ok";
+    case OCI_STATUS_UNPACKED_MISSING_ORIGIN:
+        return "missing origin";
+    case OCI_STATUS_UNPACKED_CORRUPT_ORIGIN:
+        return "corrupt origin";
+    }
+    return "unknown";
+}
+
+/* Emit one JSON-quoted token. Double quotes and backslashes are
+ * backslash-escaped and bytes below 0x20 are written as \u00XX, because
+ * JSON forbids raw control characters inside a string and a path may
+ * contain them. Bytes >= 0x80 pass through unchanged. A NULL string
+ * renders as "".
+ */
+static void emit_json_quoted(FILE *out, const char *s)
+{
+    fputc('"', out);
+    if (s) {
+        for (const char *p = s; *p; p++) {
+            /* Compare as unsigned so bytes >= 0x80 are not mistaken
+             * for control characters where plain char is signed.
+             */
+            unsigned char c = (unsigned char) *p;
+            if (c == '"' || c == '\\') {
+                fputc('\\', out);
+                fputc(c, out);
+            } else if (c < 0x20) {
+                fprintf(out, "\\u%04x", (unsigned) c);
+            } else {
+                fputc(c, out);
+            }
+        }
+    }
+    fputc('"', out);
+}
+
+static void render_status_human(FILE *out, const oci_status_t *st)
+{
+    /* Pins section. */
+    if (st->pin_count == 0) {
+        fprintf(out, "PINS (0): (none)\n\n");
+    } else {
+        fprintf(out, "PINS (%zu):\n", st->pin_count);
+        for (size_t i = 0; i < st->pin_count; i++) {
+            const oci_status_pin_entry_t *p = &st->pins[i];
+            char short_d[24];
+            short_digest(p->digest, short_d);
+            if (p->status != OCI_STATUS_PIN_OK) {
+                fprintf(out, " %-40s %-22s (%s)\n",
+                        p->name ? p->name : "(unknown)", short_d,
+                        pin_status_label(p->status));
+                continue;
+            }
+            char mtime_s[20];
+            format_mtime(p->last_seen_mtime, mtime_s, sizeof(mtime_s));
+            fprintf(out, " %-40s %-22s %2zu layers %s\n",
+                    p->name ? p->name : "(unknown)", short_d, p->layer_count,
+                    mtime_s);
+        }
+        fputc('\n', out);
+    }
+
+    /* Unpacked sysroots section. */
+    if (st->unpacked_count == 0) {
+        fprintf(out, "UNPACKED SYSROOTS (0): (none)\n\n");
+    } else {
+        fprintf(out, "UNPACKED SYSROOTS (%zu):\n", st->unpacked_count);
+        for (size_t i = 0; i < st->unpacked_count; i++) {
+            const oci_status_unpacked_entry_t *u = &st->unpacked[i];
+            if (u->status != OCI_STATUS_UNPACKED_OK) {
+                fprintf(out, " %s (%s)\n", u->path ? u->path : "(unknown)",
+                        unpacked_status_label(u->status));
+                continue;
+            }
+            char short_d[24];
+            short_digest(u->manifest_digest, short_d);
+            char bytes_s[24];
+            if (st->disk_usage_skipped)
+                snprintf(bytes_s, sizeof(bytes_s), "(skipped)");
+            else
+                format_bytes(u->tree_bytes, bytes_s, sizeof(bytes_s));
+            fprintf(out, " %s %-22s %s\n", u->path ? u->path : "(unknown)",
+                    short_d, bytes_s);
+        }
+        fputc('\n', out);
+    }
+
+    /* Store totals section.
*/ + char blob_b[24], layer_b[24], stack_b[24], total_b[24]; + if (st->disk_usage_skipped) { + snprintf(blob_b, sizeof(blob_b), "(skipped)"); + snprintf(layer_b, sizeof(layer_b), "(skipped)"); + snprintf(stack_b, sizeof(stack_b), "(skipped)"); + snprintf(total_b, sizeof(total_b), "(skipped)"); + } else { + format_bytes(st->blob_bytes_total, blob_b, sizeof(blob_b)); + format_bytes(st->layer_cache_bytes_total, layer_b, sizeof(layer_b)); + format_bytes(st->stack_cache_bytes_total, stack_b, sizeof(stack_b)); + uint64_t total = st->blob_bytes_total + st->layer_cache_bytes_total + + st->stack_cache_bytes_total; + format_bytes(total, total_b, sizeof(total_b)); + } + fprintf(out, "STORE TOTALS:\n"); + fprintf(out, " blobs: %zu (%s)\n", st->blob_count, blob_b); + fprintf(out, " layers raw: %zu of %zu reachable cached (%s)\n", + st->diff_ids_populated, st->diff_ids_reachable, layer_b); + fprintf(out, " layers stack: %zu of %zu reachable cached (%s)\n", + st->chain_ids_populated, st->chain_ids_reachable, stack_b); + fprintf(out, " total: %s\n", total_b); + if (st->disk_usage_skipped) + fprintf(out, " (disk usage skipped)\n"); +} + +static void render_status_json(FILE *out, const oci_status_t *st) +{ + fprintf(out, "{\"schemaVersion\":1,\"pins\":["); + for (size_t i = 0; i < st->pin_count; i++) { + const oci_status_pin_entry_t *p = &st->pins[i]; + if (i > 0) + fputc(',', out); + fprintf(out, "{\"name\":"); + emit_json_quoted(out, p->name); + fprintf(out, ",\"digest\":"); + emit_json_quoted(out, p->digest); + fprintf(out, + ",\"manifest_size\":%llu,\"config_size\":%llu,\"layer_count\":" + "%zu,\"last_seen_mtime\":%lld,\"status\":\"%s\"}", + (unsigned long long) p->manifest_size, + (unsigned long long) p->config_size, p->layer_count, + (long long) p->last_seen_mtime, pin_status_label(p->status)); + } + fprintf(out, "],\"unpacked\":["); + for (size_t i = 0; i < st->unpacked_count; i++) { + const oci_status_unpacked_entry_t *u = &st->unpacked[i]; + if (i > 0) + fputc(',', out); + 
fprintf(out, "{\"path\":"); + emit_json_quoted(out, u->path); + fprintf(out, ",\"manifest_digest\":"); + emit_json_quoted(out, u->manifest_digest); + fprintf(out, + ",\"layer_count\":%zu,\"tree_bytes\":%llu,\"status\":\"%s\"}", + u->layer_count, (unsigned long long) u->tree_bytes, + unpacked_status_label(u->status)); + } + fprintf(out, "],\"totals\":{"); + fprintf(out, "\"blob_count\":%zu,\"blob_bytes\":%llu,", st->blob_count, + (unsigned long long) st->blob_bytes_total); + fprintf(out, "\"layer_cache_count\":%zu,\"layer_cache_bytes\":%llu,", + st->layer_cache_count, + (unsigned long long) st->layer_cache_bytes_total); + fprintf(out, "\"stack_cache_count\":%zu,\"stack_cache_bytes\":%llu,", + st->stack_cache_count, + (unsigned long long) st->stack_cache_bytes_total); + fprintf(out, "\"diff_ids_reachable\":%zu,\"diff_ids_populated\":%zu,", + st->diff_ids_reachable, st->diff_ids_populated); + fprintf(out, "\"chain_ids_reachable\":%zu,\"chain_ids_populated\":%zu,", + st->chain_ids_reachable, st->chain_ids_populated); + fprintf(out, "\"disk_usage_skipped\":%s", + st->disk_usage_skipped ? 
"true" : "false"); + fprintf(out, "}}\n"); +} + +static int cmd_status(int argc, char **argv) +{ + status_args_t args = {0}; + int prc = parse_status_args(argc, argv, &args); + if (prc == 1) + return print_usage(stdout); + if (prc < 0) + return 2; + + char *default_root = NULL; + const char *store_root = args.store_root; + if (!store_root) { + default_root = oci_store_default_root(); + if (!default_root) { + fprintf(stderr, + "error: could not determine default store root " + "(HOME not set?)\n"); + return 1; + } + store_root = default_root; + } + + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: could not open store at %s: %s\n", store_root, + strerror(errno)); + free(default_root); + return 1; + } + + oci_status_options_t sopts = { + .volume_root = args.volume_root, + .skip_disk_usage = args.no_disk_usage, + }; + oci_status_t st = {0}; + const char *err = NULL; + if (oci_status_compute(store, &sopts, &st, &err) < 0) { + fprintf(stderr, "error: status failed: %s\n", + err ? 
err : strerror(errno));
+        oci_status_free(&st);
+        oci_store_close(store);
+        free(default_root);
+        return 1;
+    }
+
+    if (args.json)
+        render_status_json(stdout, &st);
+    else
+        render_status_human(stdout, &st);
+
+    oci_status_free(&st);
+    oci_store_close(store);
+    free(default_root);
+    return 0;
+}
+
+int oci_cli_main(int argc, char **argv)
+{
+    if (argc < 2)
+        return print_usage(stderr);
+
+    const char *sub = argv[1];
+    if (!strcmp(sub, "-h") || !strcmp(sub, "--help") || !strcmp(sub, "help"))
+        return print_usage(stdout);
+    if (!strcmp(sub, "inspect"))
+        return cmd_inspect(argc - 1, argv + 1);
+    if (!strcmp(sub, "pull"))
+        return cmd_pull(argc - 1, argv + 1);
+    if (!strcmp(sub, "unpack"))
+        return cmd_unpack(argc - 1, argv + 1);
+    if (!strcmp(sub, "clone"))
+        return cmd_clone(argc - 1, argv + 1);
+    if (!strcmp(sub, "run"))
+        return oci_cli_run(argc - 1, argv + 1);
+    if (!strcmp(sub, "prune"))
+        return cmd_prune(argc - 1, argv + 1);
+    if (!strcmp(sub, "rebuild-cache"))
+        return cmd_rebuild_cache(argc - 1, argv + 1);
+    if (!strcmp(sub, "status"))
+        return cmd_status(argc - 1, argv + 1);
+    if (!strcmp(sub, "list") || !strcmp(sub, "ls"))
+        return cmd_not_implemented("list");
+
+    fprintf(stderr, "error: unknown oci subcommand: %s\n", sub);
+    return print_usage(stderr);
+}
diff --git a/src/oci/cli.h b/src/oci/cli.h
new file mode 100644
index 0000000..781efd4
--- /dev/null
+++ b/src/oci/cli.h
@@ -0,0 +1,18 @@
+/* `elfuse oci` subcommand dispatch
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Sits on the side of the main argv parser: when argv[1] == "oci" the rest
+ * of the command line is forwarded here. Subcommands are inspect, pull,
+ * unpack, clone, run, prune, rebuild-cache, and status. list (alias ls) is
+ * recognized but returns a deterministic "not implemented yet" error so
+ * users can discover the surface without crashes.
+ */
+
+#pragma once
+
+/* argc/argv are the slice starting at "oci" (i.e. argv[0] == "oci").
Returns
+ * a process exit code suitable for main() to return directly.
+ */
+int oci_cli_main(int argc, char **argv);
diff --git a/src/oci/clone-rootfs.c b/src/oci/clone-rootfs.c
new file mode 100644
index 0000000..67ee976
--- /dev/null
+++ b/src/oci/clone-rootfs.c
@@ -0,0 +1,190 @@
+/* OCI per-run rootfs via clonefile(2)
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "oci/clone-rootfs.h"
+
+#define CR_PATH_MAX 4096
+
+/* Record msg in *err (when err is non-NULL), set errno to err_no and
+ * return -1 so callers can write `return set_err(...)`.
+ */
+static int set_err(const char **err, const char *msg, int err_no)
+{
+    if (err)
+        *err = msg;
+    errno = err_no;
+    return -1;
+}
+
+/* Fill out with n_hex random lowercase hex characters plus a NUL
+ * terminator; out must hold n_hex + 1 bytes. n_hex must be even and at
+ * most 64. Returns 0 on success, -1 with errno set on failure.
+ */
+static int rand_hex(char *out, size_t n_hex)
+{
+    /* n_hex is the number of hex chars in the output; the helper reads
+     * n_hex/2 random bytes from getentropy and prints them.
+     */
+    size_t need = n_hex / 2;
+    uint8_t buf[32];
+    /* An odd n_hex would leave the last output character unwritten, and
+     * an oversized request would overrun buf; neither sets errno on its
+     * own, so report EINVAL for the caller's errno-based diagnostic.
+     */
+    if ((n_hex & 1) != 0 || need > sizeof(buf)) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (getentropy(buf, need) < 0)
+        return -1;
+    static const char hex[] = "0123456789abcdef";
+    for (size_t i = 0; i < need; i++) {
+        out[i * 2] = hex[buf[i] >> 4];
+        out[i * 2 + 1] = hex[buf[i] & 0xf];
+    }
+    out[n_hex] = '\0';
+    return 0;
+}
+
+/* Create path and every missing parent directory with mode 0755.
+ * Components that already exist are accepted. Returns 0 on success, -1
+ * with errno set on failure.
+ */
+static int mkdir_p(const char *path)
+{
+    char buf[CR_PATH_MAX];
+    size_t n = strlen(path);
+    /* The walk below starts at buf + 1, which lies past the terminator
+     * of an empty string; reject it the way mkdir("") would.
+     */
+    if (n == 0) {
+        errno = ENOENT;
+        return -1;
+    }
+    if (n >= sizeof(buf)) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    memcpy(buf, path, n + 1);
+    /* Temporarily cut the copy at each '/' to create one prefix at a
+     * time; starting at buf + 1 skips a leading '/'.
+     */
+    for (char *p = buf + 1; *p; p++) {
+        if (*p != '/')
+            continue;
+        *p = '\0';
+        if (mkdir(buf, 0755) < 0 && errno != EEXIST)
+            return -1;
+        *p = '/';
+    }
+    if (mkdir(buf, 0755) < 0 && errno != EEXIST)
+        return -1;
+    return 0;
+}
+
+int oci_clone_rootfs(const char *src_image_dir,
+                     const char *volume_root,
+                     char **out_run_dir,
+                     const char **err)
+{
+    static const char *dummy_err;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (!src_image_dir || !volume_root || !out_run_dir)
+        return set_err(err, "clone: NULL argument", EINVAL);
+    *out_run_dir = NULL;
+
+    /* Image dir must exist and be a
directory. */ + struct stat sb; + if (stat(src_image_dir, &sb) < 0) + return set_err(err, "clone: src image stat failed", errno); + if (!S_ISDIR(sb.st_mode)) + return set_err(err, "clone: src image is not a directory", ENOTDIR); + + /* Provision volume_root/runs/. */ + char runs_dir[CR_PATH_MAX]; + if ((size_t) snprintf(runs_dir, sizeof(runs_dir), "%s/runs", volume_root) >= + sizeof(runs_dir)) + return set_err(err, "clone: runs path overflow", ENAMETOOLONG); + if (mkdir_p(runs_dir) < 0) + return set_err(err, "clone: cannot create runs dir", errno); + + /* Pick a fresh run id. 12 hex chars = 48 bits of randomness; ample + * for elfuse process lifetimes. + */ + char id[13]; + if (rand_hex(id, 12) < 0) + return set_err(err, "clone: getentropy failed", errno); + + char run_dir[CR_PATH_MAX]; + if ((size_t) snprintf(run_dir, sizeof(run_dir), "%s/%s", runs_dir, id) >= + sizeof(run_dir)) + return set_err(err, "clone: run path overflow", ENAMETOOLONG); + + /* clonefile creates the destination atomically: pass dst that does + * NOT exist (the call itself creates the directory). Use + * CLONE_NOFOLLOW so a symlink at the src root is not followed off + * the immutable image. + */ + if (clonefile(src_image_dir, run_dir, CLONE_NOFOLLOW) < 0) + return set_err(err, "clone: clonefile failed", errno); + + char *dup = strdup(run_dir); + if (!dup) { + /* Roll back the clone so the caller sees no half-state. 
*/
+        (void) oci_clone_rootfs_remove(run_dir, NULL);
+        return set_err(err, "clone: strdup failed", ENOMEM);
+    }
+    *out_run_dir = dup;
+    return 0;
+}
+
+/* Recursively delete run_dir. A path that does not exist counts as
+ * success; a non-directory is unlinked; a directory is emptied entry by
+ * entry and then removed. Returns 0 on success. On failure returns -1
+ * with errno set and, when err is non-NULL, *err pointing at a static
+ * message; removal stops at the first entry that cannot be deleted.
+ */
+int oci_clone_rootfs_remove(const char *run_dir, const char **err)
+{
+    static const char *dummy_err;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (!run_dir)
+        return set_err(err, "clone remove: NULL path", EINVAL);
+
+    struct stat st;
+    /* lstat so a symlink is unlinked itself rather than followed. */
+    if (lstat(run_dir, &st) < 0) {
+        if (errno == ENOENT)
+            return 0;
+        return set_err(err, "clone remove: lstat failed", errno);
+    }
+    if (!S_ISDIR(st.st_mode)) {
+        if (unlink(run_dir) < 0)
+            return set_err(err, "clone remove: unlink failed", errno);
+        return 0;
+    }
+    DIR *d = opendir(run_dir);
+    if (!d)
+        return set_err(err, "clone remove: opendir failed", errno);
+    struct dirent *de;
+    const char *fail_msg = NULL;
+    int fail_errno = 0;
+    while ((de = readdir(d))) {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        char child[CR_PATH_MAX];
+        if ((size_t) snprintf(child, sizeof(child), "%s/%s", run_dir,
+                              de->d_name) >= sizeof(child)) {
+            fail_msg = "clone remove: child path overflow";
+            fail_errno = ENAMETOOLONG;
+            break;
+        }
+        if (oci_clone_rootfs_remove(child, NULL) < 0) {
+            /* Capture errno now: closedir below may overwrite it. */
+            fail_msg = "clone remove: child removal failed";
+            fail_errno = errno;
+            break;
+        }
+    }
+    closedir(d);
+    if (fail_msg)
+        return set_err(err, fail_msg, fail_errno);
+    if (rmdir(run_dir) < 0)
+        return set_err(err, "clone remove: rmdir failed", errno);
+    return 0;
+}
+
+int oci_clone_rootfs_gc(const char *volume_root,
+                        time_t older_than,
+                        const char **err)
+{
+    (void) volume_root;
+    (void) older_than;
+    if (err)
+        *err = NULL;
+    /* Phase 2 stub: `elfuse oci prune` does not yet sweep stale runs.
+     * Phase 3 will walk volume_root/runs/ and unlink entries older
+     * than older_than. The stub returns 0 so the CLI can call it
+     * unconditionally without a feature gate.
+ */ + return 0; +} diff --git a/src/oci/clone-rootfs.h b/src/oci/clone-rootfs.h new file mode 100644 index 0000000..a4b1918 --- /dev/null +++ b/src/oci/clone-rootfs.h @@ -0,0 +1,54 @@ +/* OCI per-run rootfs via clonefile(2) + * + * Phase 2 commits Q2 of oci-roadmap.md to APFS clonefile-based + * copy-up: each `elfuse oci clone` invocation gets a fresh directory + * tree cloned from the immutable image sysroot. APFS file-level CoW + * makes the clone nearly O(1) at start, and only modified files + * allocate new blocks. + * + * Apple's clonefile(2) is recursive across directories since macOS + * 10.12. Hardlinks inside the source tree survive the clone metadata + * pass; cross-tree hardlinks back to the immutable image are NOT + * created, so the layer applier in commit 5 must materialize intra- + * image hardlinks via link(2) rather than relying on the clone. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +/* Clone the immutable image directory at src_image_dir into a fresh + * run directory under volume_root/runs/. On success *out_run_dir + * receives a heap-allocated absolute path (caller frees) and returns + * 0. On failure returns -1 with errno set and *err populated. + * + * Errors: + * ENOTSUP the volume does not support APFS clones (non-APFS + * scratch or a future fs without clonefile) + * ENOSPC sparsebundle full + * EACCES volume read-only or run_dir creation denied + * ENAMETOOLONG generated run_dir path overflows the buffer + */ +int oci_clone_rootfs(const char *src_image_dir, + const char *volume_root, + char **out_run_dir, + const char **err); + +/* Recursively remove a run directory previously returned by + * oci_clone_rootfs. Best effort: returns 0 on full removal, -1 on + * the first irrecoverable error. + */ +int oci_clone_rootfs_remove(const char *run_dir, const char **err); + +/* Sweep volume_root/runs/ for stale clones older than older_than + * (epoch seconds). 
Phase 2 ships this as a stub returning 0 because + * `elfuse oci prune` is not extended in this PR; Phase 3 will + * actually walk the directory. + */ +int oci_clone_rootfs_gc(const char *volume_root, + time_t older_than, + const char **err); diff --git a/src/oci/decompress.c b/src/oci/decompress.c new file mode 100644 index 0000000..20d3134 --- /dev/null +++ b/src/oci/decompress.c @@ -0,0 +1,326 @@ +/* OCI decompression dispatch (gzip + zstd + passthrough) + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* Opt into the libzstd static-only interface to access ZSTD_ErrorCode + * and the named constants (ZSTD_error_frameParameter_windowTooLarge in + * particular). The decoder still goes through the public symbols; only + * the error classification consults the static-only enum. + */ +#define ZSTD_STATIC_LINKING_ONLY +#include +#include + +#include "oci/decompress.h" + +/* 128 MiB; rejects pathological zstd headers without hurting real layers. */ +#define OCI_ZSTD_MAX_WINDOW_LOG 27 + +/* Decoder-side input buffer. Sized to one host page so libcurl-style + * pipelines do not pay extra syscalls per read. + */ +#define OCI_DECOMPRESS_IBUF 65536 + +typedef enum { + OCI_STREAM_NONE, + OCI_STREAM_GZIP, + OCI_STREAM_ZSTD, +} oci_stream_kind_t; + +struct oci_stream { + oci_stream_kind_t kind; + int fd; + bool eof; + const char *last_err; + + /* zlib backend */ + z_stream zs; + bool zs_inited; + + /* zstd backend */ + ZSTD_DCtx *zd; + + /* Shared input buffer; both backends pull from it. The position + * advances as the decoder consumes input, and refill_input pulls + * from fd when it is exhausted. 
+ */ + uint8_t *ibuf; + size_t ibuf_len; + size_t ibuf_pos; +}; + +static ssize_t read_some(int fd, void *buf, size_t cap) +{ + while (1) { + ssize_t n = read(fd, buf, cap); + if (n < 0 && errno == EINTR) + continue; + return n; + } +} + +static int refill_input(oci_stream_t *s) +{ + if (s->ibuf_pos < s->ibuf_len) + return 0; + ssize_t n = read_some(s->fd, s->ibuf, OCI_DECOMPRESS_IBUF); + if (n < 0) { + s->last_err = "decompress: input read failed"; + return -1; + } + s->ibuf_pos = 0; + s->ibuf_len = (size_t) n; + return 0; +} + +oci_stream_t *oci_decompress_open(int fd, + oci_compression_t alg, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + + if (fd < 0) { + errno = EBADF; + *err = "decompress: invalid fd"; + return NULL; + } + + oci_stream_t *s = calloc(1, sizeof(*s)); + if (!s) { + errno = ENOMEM; + *err = "decompress: state allocation failed"; + return NULL; + } + s->fd = fd; + + s->ibuf = malloc(OCI_DECOMPRESS_IBUF); + if (!s->ibuf) { + free(s); + errno = ENOMEM; + *err = "decompress: input buffer allocation failed"; + return NULL; + } + + switch (alg) { + case OCI_COMPRESSION_NONE: + s->kind = OCI_STREAM_NONE; + return s; + case OCI_COMPRESSION_GZIP: { + s->kind = OCI_STREAM_GZIP; + /* windowBits=15 + 32 enables gzip + zlib header auto-detection; + * raw deflate without a header is NOT accepted because real OCI + * gzip layers always carry the standard gzip wrapper. 
+ */ + int zrc = inflateInit2(&s->zs, 15 + 32); + if (zrc != Z_OK) { + free(s->ibuf); + free(s); + errno = EINVAL; + *err = "decompress: zlib inflateInit2 failed"; + return NULL; + } + s->zs_inited = true; + return s; + } + case OCI_COMPRESSION_ZSTD: { + s->kind = OCI_STREAM_ZSTD; + s->zd = ZSTD_createDCtx(); + if (!s->zd) { + free(s->ibuf); + free(s); + errno = ENOMEM; + *err = "decompress: ZSTD_createDCtx failed"; + return NULL; + } + size_t prc = ZSTD_DCtx_setParameter(s->zd, ZSTD_d_windowLogMax, + OCI_ZSTD_MAX_WINDOW_LOG); + if (ZSTD_isError(prc)) { + ZSTD_freeDCtx(s->zd); + free(s->ibuf); + free(s); + errno = EINVAL; + *err = "decompress: ZSTD_d_windowLogMax rejected"; + return NULL; + } + return s; + } + default: + free(s->ibuf); + free(s); + errno = EINVAL; + *err = "decompress: unsupported compression"; + return NULL; + } +} + +static ssize_t read_passthrough(oci_stream_t *s, void *buf, size_t cap) +{ + if (s->ibuf_pos < s->ibuf_len) { + size_t left = s->ibuf_len - s->ibuf_pos; + size_t take = left < cap ? left : cap; + memcpy(buf, s->ibuf + s->ibuf_pos, take); + s->ibuf_pos += take; + return (ssize_t) take; + } + /* The passthrough does not buffer beyond what was pre-read; once + * exhausted, hand the caller's buf directly to read(2) so a tar + * driver can stream large payloads without an extra copy. + */ + return read_some(s->fd, buf, cap); +} + +static ssize_t read_gzip(oci_stream_t *s, void *buf, size_t cap) +{ + if (s->eof) + return 0; + s->zs.next_out = buf; + s->zs.avail_out = (uInt) (cap > UINT32_MAX ? UINT32_MAX : cap); + + while (s->zs.avail_out > 0) { + if (s->ibuf_pos == s->ibuf_len) { + if (refill_input(s) < 0) { + errno = EIO; + return -1; + } + if (s->ibuf_len == 0) { + /* Source EOF before zlib reported Z_STREAM_END means + * the gzip frame was truncated. 
+ */ + if (s->zs.avail_out == cap) { + s->eof = true; + return 0; + } + s->last_err = "decompress: gzip stream truncated"; + errno = EIO; + return -1; + } + } + s->zs.next_in = s->ibuf + s->ibuf_pos; + s->zs.avail_in = (uInt) (s->ibuf_len - s->ibuf_pos); + + int zrc = inflate(&s->zs, Z_NO_FLUSH); + size_t consumed = (s->ibuf_len - s->ibuf_pos) - s->zs.avail_in; + s->ibuf_pos += consumed; + if (zrc == Z_STREAM_END) { + s->eof = true; + break; + } + if (zrc == Z_OK || zrc == Z_BUF_ERROR) { + /* Z_BUF_ERROR with no progress just means the decoder wants + * more input next call; loop and refill. + */ + continue; + } + s->last_err = s->zs.msg ? s->zs.msg : "decompress: zlib inflate failed"; + errno = EIO; + return -1; + } + return (ssize_t) (cap - s->zs.avail_out); +} + +static ssize_t read_zstd(oci_stream_t *s, void *buf, size_t cap) +{ + if (s->eof) + return 0; + ZSTD_outBuffer out = {.dst = buf, .size = cap, .pos = 0}; + + while (out.pos < out.size) { + if (s->ibuf_pos == s->ibuf_len) { + if (refill_input(s) < 0) { + errno = EIO; + return -1; + } + if (s->ibuf_len == 0) { + /* libzstd returns 0 from decompressStream when it + * finishes a frame; if the caller sees source EOF here + * with no output produced yet, it is a clean end. 
+ */ + if (out.pos == 0) { + s->eof = true; + return 0; + } + s->last_err = "decompress: zstd stream truncated"; + errno = EIO; + return -1; + } + } + ZSTD_inBuffer in = { + .src = s->ibuf + s->ibuf_pos, + .size = s->ibuf_len - s->ibuf_pos, + .pos = 0, + }; + size_t rrc = ZSTD_decompressStream(s->zd, &out, &in); + s->ibuf_pos += in.pos; + if (ZSTD_isError(rrc)) { + ZSTD_ErrorCode ec = ZSTD_getErrorCode(rrc); + if (ec == ZSTD_error_frameParameter_windowTooLarge) { + s->last_err = "decompress: zstd window exceeds cap"; + errno = EINVAL; + } else { + s->last_err = ZSTD_getErrorName(rrc); + errno = EIO; + } + return -1; + } + if (rrc == 0) { + /* Frame complete; libzstd may still accept more frames, but + * OCI layers ship single-frame so the reader treats this as + * EOF. + */ + s->eof = true; + break; + } + } + return (ssize_t) out.pos; +} + +ssize_t oci_stream_read(oci_stream_t *s, void *buf, size_t cap) +{ + if (!s || !buf) { + errno = EINVAL; + return -1; + } + if (cap == 0) + return 0; + switch (s->kind) { + case OCI_STREAM_NONE: + return read_passthrough(s, buf, cap); + case OCI_STREAM_GZIP: + return read_gzip(s, buf, cap); + case OCI_STREAM_ZSTD: + return read_zstd(s, buf, cap); + } + errno = EINVAL; + return -1; +} + +void oci_stream_close(oci_stream_t *s) +{ + if (!s) + return; + if (s->zs_inited) + inflateEnd(&s->zs); + if (s->zd) + ZSTD_freeDCtx(s->zd); + free(s->ibuf); + free(s); +} + +const char *oci_stream_last_error(const oci_stream_t *s) +{ + return s ? s->last_err : NULL; +} diff --git a/src/oci/decompress.h b/src/oci/decompress.h new file mode 100644 index 0000000..c097e2c --- /dev/null +++ b/src/oci/decompress.h @@ -0,0 +1,55 @@ +/* OCI layer-blob decompression dispatch + * + * Wraps zlib (gzip), the system libzstd, and a passthrough stream behind + * one read-only oci_stream_t. 
The tar reader is compression-agnostic; + * oci/decompress.c is the only translation unit in the project that + * includes <zstd.h>, so any future swap-out of the compression backend + * is local to this module. + * + * The zstd backend caps the decoder window log at 27 (128 MiB) so a + * hostile or unintentionally fat layer cannot exhaust host memory. + * Real-world OCI registry layers stay well below this; the regression + * test pins the boundary at 28-bit windows rejecting with EINVAL. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include "oci/media-type.h" + +typedef struct oci_stream oci_stream_t; + +/* Open a decompression stream over fd. The caller retains fd ownership; + * oci_stream_close does NOT close it. On error returns NULL and sets + * *err to a static description string plus errno. + * + * For OCI_COMPRESSION_NONE this is a thin passthrough wrapper; for GZIP + * the implementation uses zlib's inflate (auto-detecting zlib vs gzip + * headers via inflateInit2 with windowBits=47; raw deflate without a + * header is not accepted); for ZSTD it uses libzstd's + * streaming ZSTD_DCtx capped to a 128 MiB decoder window. + */ +oci_stream_t *oci_decompress_open(int fd, + oci_compression_t alg, + const char **err); + +/* Read up to cap bytes. Returns: + * >0 bytes copied into buf (may be a short read; caller loops) + * 0 end of compressed stream (clean) + * -1 decompression or I/O error; errno set, callers may surface + * EIO with the decoder error string + */ +ssize_t oci_stream_read(oci_stream_t *s, void *buf, size_t cap); + +/* Release decoder state. Does NOT close the underlying fd. Safe on NULL. */ +void oci_stream_close(oci_stream_t *s); + +/* For diagnostics only: last static error string the decoder produced. + * Returns NULL if the stream has not failed. The string is owned by + * the stream and remains valid until oci_stream_close.
+ */ +const char *oci_stream_last_error(const oci_stream_t *s); diff --git a/src/oci/dedup-metrics.c b/src/oci/dedup-metrics.c new file mode 100644 index 0000000..6b73102 --- /dev/null +++ b/src/oci/dedup-metrics.c @@ -0,0 +1,565 @@ +/* OCI cross-image layer dedup metrics + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * One walker per source: + * + * - Pins from index.json. For each pin's manifest digest, the manifest blob + * is parsed; an image-manifest contributes its config blob's diff_ids + * directly, an image-index contributes the picked linux/arm64 sub- + * manifest's diff_ids. Anything that fails along the way is a per-pin + * skip, not a hard error (see header for rationale: dedup is + * informational, not GC). + * + * - Unpacked sysroots under volume_root/images/sha256-/. The origin + * sidecar already records the manifest digest, the config digest, and + * the diff_id list, so this walker never has to read a blob. The walker + * dedupes against the pin walk by manifest_digest so a pin pointing at + * the same manifest as an unpacked tree does not count twice. + * + * Once both walkers finish, the result sets are intersected against the + * target's diff_ids and its per-layer ChainID chain. shared_bytes accumulates + * the on-disk tree size of layers/sha256// entries that exist and + * fall in the intersection; entries absent from the raw cache still register + * as shared layers (cross-image overlap is independent of cache populate). + * + * Memory ownership: + * - oci_dedup_metrics_compute zeroes *out on entry and on failure, so the + * caller can pass a stack-resident struct safely. + * - Internal scratch state (oci_digest_set_t accumulators, the target's + * ChainID chain heap, manifest parses) is owned and freed by this file. + * + * Path budget: DEDUP_PATH_MAX = 4096 matches src/oci/store.c::STORE_PATH_MAX + * so a layers/// tree walk has the same headroom store.c assumes. 
+ */ + +#include "dedup-metrics.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blob-store.h" +#include "digest-set.h" +#include "manifest.h" +#include "origin-meta.h" +#include "volume.h" + +#define DEDUP_PATH_MAX 4096 + +/* Largest blob this helper will memory-map. The manifest renderer in + * src/oci/inspect.c uses 64 MiB; the cross-image walker reads many blobs + * back to back so the cap is the same to keep parser failure modes + * uniform. + */ +#define DEDUP_BLOB_MAX ((size_t) 64 * 1024 * 1024) + +/* Slurp a blob from the store into a fresh heap buffer. The buffer is + * NUL-terminated so the caller can pass the byte range as either a + * length-bounded array or a C string. Returns 0 on success, -1 with + * errno preserved on failure. + */ +static int slurp_blob(oci_blob_store_t *blobs, + const char *digest_str, + char **out_body, + size_t *out_len) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) { + errno = EINVAL; + return -1; + } + char path[DEDUP_PATH_MAX]; + int n = oci_blob_store_path(blobs, algo, hex, path, sizeof(path)); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -1; + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + errno = saved; + return -1; + } + if (st.st_size < 0 || (uintmax_t) st.st_size > DEDUP_BLOB_MAX) { + close(fd); + errno = EFBIG; + return -1; + } + size_t want = (size_t) st.st_size; + char *buf = malloc(want + 1); + if (!buf) { + close(fd); + errno = ENOMEM; + return -1; + } + size_t off = 0; + while (off < want) { + ssize_t r = read(fd, buf + off, want - off); + if (r < 0) { + if (errno == EINTR) + continue; + int saved = errno; + free(buf); + close(fd); + errno = saved; + return -1; + } + if (r == 0) + break; + off += (size_t) r; + } + close(fd); + if (off != 
want) { + free(buf); + errno = EIO; + return -1; + } + buf[want] = '\0'; + *out_body = buf; + *out_len = want; + return 0; +} + +/* Free a NULL-terminated char ** array allocated as { strdup, strdup, NULL }. + */ +static void free_strv(char **v) +{ + if (!v) + return; + for (size_t i = 0; v[i]; i++) + free(v[i]); + free((void *) v); +} + +/* Walk a path tree summing the st_size of every regular file. Returns the + * accumulated total on success or 0 when the entry is absent / unreadable + * (the caller treats absence as "cache entry not populated", which is + * shared_bytes == 0 for that diff_id). Symlinks are skipped (lstat) so a + * stray symlink can never inflate the count by following it into the + * blob store. + */ +static uint64_t sum_tree_size(const char *path) +{ + struct stat st; + if (lstat(path, &st) < 0) + return 0; + if (S_ISREG(st.st_mode)) + return (uint64_t) st.st_size; + if (!S_ISDIR(st.st_mode)) + return 0; + DIR *d = opendir(path); + if (!d) + return 0; + uint64_t total = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[DEDUP_PATH_MAX]; + int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name); + if (n < 0 || (size_t) n >= sizeof(child)) + continue; + total += sum_tree_size(child); + } + closedir(d); + return total; +} + +/* Add every diff_id from cfg.rootfs_diff_ids and every ChainID in the chain + * built from those diff_ids into the accumulators. Returns 0 on success, + * -1 with errno set on allocation failure inside oci_digest_set_add or + * oci_chainid_compute (caller treats this as a per-image skip). 
+ */ +static int accumulate_chain(char *const *diff_ids, + oci_digest_set_t *diff_acc, + oci_digest_set_t *chain_acc) +{ + char prev[OCI_DIGEST_HEX_MAX + 16] = ""; + for (size_t i = 0; diff_ids[i]; i++) { + if (oci_digest_set_add(diff_acc, diff_ids[i]) < 0) + return -1; + char chain[OCI_DIGEST_HEX_MAX + 16]; + const char *prev_arg = (i == 0) ? NULL : prev; + if (oci_chainid_compute(prev_arg, diff_ids[i], chain, sizeof(chain)) < + 0) + return -1; + memcpy(prev, chain, strlen(chain) + 1); + if (oci_digest_set_add(chain_acc, chain) < 0) + return -1; + } + return 0; +} + +/* For an arbitrary manifest blob digest, locate the per-arch image-config + * digest. Walks one level of image-index indirection (linux/arm64 pick). + * Returns a heap-allocated config digest string on success (caller frees), + * or NULL with errno set when the manifest is missing, malformed, or the + * index has no linux/arm64 entry whose sub-manifest blob is on disk. + */ +static char *resolve_config_digest(oci_store_t *store, + const char *manifest_digest) +{ + oci_blob_store_t *blobs = oci_store_blobs(store); + char *body = NULL; + size_t body_len = 0; + if (slurp_blob(blobs, manifest_digest, &body, &body_len) < 0) + return NULL; + + oci_manifest_t mf = {0}; + if (oci_manifest_parse(body, body_len, &mf, NULL) == 0) { + char *cfg = strdup(mf.config.digest_str); + oci_manifest_free(&mf); + free(body); + if (!cfg) { + errno = ENOMEM; + return NULL; + } + return cfg; + } + + oci_index_t idx = {0}; + if (oci_index_parse(body, body_len, &idx, NULL) < 0) { + free(body); + errno = EINVAL; + return NULL; + } + free(body); + + const oci_index_entry_t *picked = oci_index_pick_linux_arm64(&idx); + if (!picked) { + oci_index_free(&idx); + errno = ENOENT; + return NULL; + } + char *sub_digest = strdup(picked->desc.digest_str); + oci_index_free(&idx); + if (!sub_digest) { + errno = ENOMEM; + return NULL; + } + /* Recurse on the picked sub-manifest. 
Only one level deep is expected, + * but resolve_config_digest handles arbitrary nesting safely because + * the recursion terminates when the body parses as an image-manifest. + */ + char *cfg = resolve_config_digest(store, sub_digest); + free(sub_digest); + return cfg; +} + +/* Read the image-config at config_digest and return a freshly allocated + * NULL-terminated copy of its rootfs_diff_ids array. Returns NULL with + * errno set on any failure; the caller treats this as a per-image skip. + * The returned strv must be released via free_strv. + */ +static char **load_diff_ids(oci_store_t *store, const char *config_digest) +{ + oci_blob_store_t *blobs = oci_store_blobs(store); + char *body = NULL; + size_t body_len = 0; + if (slurp_blob(blobs, config_digest, &body, &body_len) < 0) + return NULL; + + oci_image_config_t cfg = {0}; + if (oci_image_config_parse(body, body_len, &cfg, NULL) < 0) { + free(body); + errno = EINVAL; + return NULL; + } + free(body); + + /* Count and copy. rootfs_diff_ids is guaranteed non-NULL by the parser + * contract; an image with zero layers gives back a one-element array + * containing only the NULL terminator. + */ + size_t n = 0; + while (cfg.rootfs_diff_ids[n]) + n++; + char **copy = (char **) calloc(n + 1, sizeof(*copy)); + if (!copy) { + oci_image_config_free(&cfg); + errno = ENOMEM; + return NULL; + } + for (size_t i = 0; i < n; i++) { + copy[i] = strdup(cfg.rootfs_diff_ids[i]); + if (!copy[i]) { + free_strv(copy); + oci_image_config_free(&cfg); + errno = ENOMEM; + return NULL; + } + } + oci_image_config_free(&cfg); + return copy; +} + +/* Accumulate a pin's contribution. The pin's manifest digest is checked + * against target_manifest_digest (self-skip) and against compared_manifests + * (dedup with unpacked-tree walk). A pin whose blob is missing, malformed, + * or whose image-config cannot be parsed contributes nothing and is NOT + * counted in compared_images; the walker continues with the next pin. 
+ * + * The accumulator pair is updated atomically per pin: if a mid-pin failure + * leaves partial entries in the sets, the affected pin still does not bump + * compared_images, which means compared_images reads "images that fully + * contributed". Partial sets only inflate shared_layers / shared_bytes; the + * overcount is bounded by the smaller of {pin's diff_id list length, target + * diff_id list length} and is preferable to dropping the pin's first N-1 + * diff_ids when entry N hits ENOMEM. + */ +static void walk_pin(oci_store_t *store, + const char *pin_manifest_digest, + const char *target_manifest_digest, + oci_digest_set_t *diff_acc, + oci_digest_set_t *chain_acc, + oci_digest_set_t *compared_manifests, + size_t *compared_images) +{ + if (strcmp(pin_manifest_digest, target_manifest_digest) == 0) + return; + + /* For an image-index pin, the picked linux/arm64 sub-manifest is the + * representative entry. Mark BOTH the pin's manifest digest and the + * resolved sub-manifest digest as compared so a later unpacked-tree + * lookup (which keys on whichever digest the operator unpacked from) + * does not double-count. + */ + char *config_digest = resolve_config_digest(store, pin_manifest_digest); + if (!config_digest) + return; + + char **diff_ids = load_diff_ids(store, config_digest); + free(config_digest); + if (!diff_ids) + return; + + if (accumulate_chain(diff_ids, diff_acc, chain_acc) < 0) { + free_strv(diff_ids); + return; + } + free_strv(diff_ids); + + (void) oci_digest_set_add(compared_manifests, pin_manifest_digest); + (*compared_images)++; +} + +/* Walk every unpacked image tree under volume_root/images/. The origin + * sidecar yields the manifest_digest + diff_ids directly, so no blob read + * is required. Trees whose origin sidecar matches a manifest already + * counted via the pin walk are skipped silently. 
+ */ +static void walk_unpacked(oci_store_t *store, + const char *volume_root, + const char *target_manifest_digest, + oci_digest_set_t *diff_acc, + oci_digest_set_t *chain_acc, + oci_digest_set_t *compared_manifests, + size_t *compared_images) +{ + (void) store; + if (!volume_root) + return; + oci_volume_list_t trees = {0}; + if (oci_volume_list_unpacked(volume_root, &trees, NULL) < 0) + return; + for (size_t i = 0; i < trees.count; i++) { + oci_origin_t origin = {0}; + if (oci_origin_read(trees.items[i], &origin, NULL) < 0) + continue; + if (origin.manifest_digest && + strcmp(origin.manifest_digest, target_manifest_digest) == 0) { + oci_origin_free(&origin); + continue; + } + if (origin.manifest_digest && + oci_digest_set_contains(compared_manifests, + origin.manifest_digest)) { + oci_origin_free(&origin); + continue; + } + if (!origin.layer_diffids) { + oci_origin_free(&origin); + continue; + } + if (accumulate_chain(origin.layer_diffids, diff_acc, chain_acc) < 0) { + oci_origin_free(&origin); + continue; + } + if (origin.manifest_digest) + (void) oci_digest_set_add(compared_manifests, + origin.manifest_digest); + (*compared_images)++; + oci_origin_free(&origin); + } + oci_volume_list_free(&trees); +} + +int oci_dedup_metrics_compute(oci_store_t *store, + const char *target_manifest_digest, + const char *volume_root, + oci_dedup_metrics_t *out, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + + if (!store || !target_manifest_digest || !out) { + *err = "dedup_metrics: NULL argument"; + errno = EINVAL; + return -1; + } + memset(out, 0, sizeof(*out)); + + /* Resolve the target's image-config and diff_ids. Hard failure here is + * surfaced to the caller: there is no useful "layer reuse" line when + * the target's own layer list is unknown. 
+ */ + char *target_config = resolve_config_digest(store, target_manifest_digest); + if (!target_config) { + int saved = errno; + *err = "dedup_metrics: target manifest blob missing or unparseable"; + errno = saved; + return -1; + } + char **target_diff_ids = load_diff_ids(store, target_config); + free(target_config); + if (!target_diff_ids) { + int saved = errno; + *err = "dedup_metrics: target image-config missing or unparseable"; + errno = saved; + return -1; + } + + size_t n_target = 0; + while (target_diff_ids[n_target]) + n_target++; + out->total_layers = n_target; + + /* Precompute the target's ChainID chain so the longest-shared-prefix + * lookup is a per-layer set membership check rather than a recompute. + * chain_strs[i] holds ChainID(target_diff_ids[0..i]). + */ + char **chain_strs = NULL; + if (n_target > 0) { + chain_strs = (char **) calloc(n_target, sizeof(*chain_strs)); + if (!chain_strs) { + free_strv(target_diff_ids); + *err = "dedup_metrics: chain alloc failed"; + errno = ENOMEM; + return -1; + } + char prev[OCI_DIGEST_HEX_MAX + 16] = ""; + for (size_t i = 0; i < n_target; i++) { + char chain[OCI_DIGEST_HEX_MAX + 16]; + const char *prev_arg = (i == 0) ? NULL : prev; + if (oci_chainid_compute(prev_arg, target_diff_ids[i], chain, + sizeof(chain)) < 0) { + int saved = errno; + for (size_t j = 0; j < i; j++) + free(chain_strs[j]); + free((void *) chain_strs); + free_strv(target_diff_ids); + *err = "dedup_metrics: chainid compute failed for target"; + errno = saved; + return -1; + } + chain_strs[i] = strdup(chain); + if (!chain_strs[i]) { + for (size_t j = 0; j < i; j++) + free(chain_strs[j]); + free((void *) chain_strs); + free_strv(target_diff_ids); + *err = "dedup_metrics: chainid strdup failed"; + errno = ENOMEM; + return -1; + } + memcpy(prev, chain, strlen(chain) + 1); + } + } + + /* Walk pins and unpacked sysroots into the two accumulators. 
*/ + oci_digest_set_t diff_acc = {0}; + oci_digest_set_t chain_acc = {0}; + oci_digest_set_t compared_manifests = {0}; + oci_digest_set_init(&diff_acc); + oci_digest_set_init(&chain_acc); + oci_digest_set_init(&compared_manifests); + + oci_pin_list_t pins = {0}; + if (oci_store_list_refs(store, &pins, NULL) == 0) { + for (size_t i = 0; i < pins.count; i++) { + walk_pin(store, pins.items[i].digest, target_manifest_digest, + &diff_acc, &chain_acc, &compared_manifests, + &out->compared_images); + } + oci_pin_list_free(&pins); + } + walk_unpacked(store, volume_root, target_manifest_digest, &diff_acc, + &chain_acc, &compared_manifests, &out->compared_images); + + /* Intersect target's diff_ids against the accumulated set. For every + * shared diff_id, walk the raw cache entry tree (if present) and add + * its on-disk size to shared_bytes. + */ + const char *store_root = oci_store_root(store); + for (size_t i = 0; i < n_target; i++) { + const char *diff_id = target_diff_ids[i]; + if (!oci_digest_set_contains(&diff_acc, diff_id)) + continue; + out->shared_layers++; + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(diff_id, &algo, hex)) + continue; + const char *algo_name = oci_digest_algo_name(algo); + if (!algo_name) + continue; + char layer_path[DEDUP_PATH_MAX]; + int n = snprintf(layer_path, sizeof(layer_path), "%s/layers/%s/%s", + store_root, algo_name, hex); + if (n < 0 || (size_t) n >= sizeof(layer_path)) + continue; + out->shared_bytes += sum_tree_size(layer_path); + } + + /* Longest k such that ChainID(target[0..k-1]) is also reachable from + * some other image. Walk from the deepest layer backwards so the first + * hit terminates the search. 
+ */ + for (size_t k = n_target; k > 0; k--) { + if (oci_digest_set_contains(&chain_acc, chain_strs[k - 1])) { + out->deepest_shared_prefix = k; + snprintf(out->deepest_shared_chainid, + sizeof(out->deepest_shared_chainid), "%s", + chain_strs[k - 1]); + break; + } + } + + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + oci_digest_set_free(&compared_manifests); + if (chain_strs) { + for (size_t i = 0; i < n_target; i++) + free(chain_strs[i]); + free((void *) chain_strs); + } + free_strv(target_diff_ids); + return 0; +} diff --git a/src/oci/dedup-metrics.h b/src/oci/dedup-metrics.h new file mode 100644 index 0000000..0c3993f --- /dev/null +++ b/src/oci/dedup-metrics.h @@ -0,0 +1,106 @@ +/* OCI cross-image layer dedup metrics + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Computes how much of one image's layer set is shared with every other image + * recorded in the local store. Two dedup angles are reported because the two + * Plan 3 layer caches dedup along different axes: + * + * - raw cache (layers/sha256//) keys per-layer payloads by their + * uncompressed-tar digest. Two images that share a layer with the same + * diff_id share that cache entry regardless of where the layer sits in + * either image's ordered layer list. shared_layers counts individual + * layers; shared_bytes accumulates the on-disk size of the raw entries + * covering those shared diff_ids (entries the raw cache has actually + * populated; absent entries still count as shared layers, just with zero + * bytes). + * + * - ChainID stack cache (layers/stacks/sha256//) keys cumulative + * stage_dir snapshots by the OCI ChainID of the terminating layer in an + * ordered prefix. Two images can share a stack prefix only when their + * first K layers (in order) have identical diff_ids; ChainID composition + * captures that. deepest_shared_prefix reports the longest such prefix + * length the target shares with at least one other image. 
This is the + * same shape the C3.3c-ii orchestrator short-circuits on during unpack. + * + * Output is informational, not a GC keep-set: missing or malformed image- + * config blobs on the OTHER images are skipped silently rather than failing + * the whole compute. Failure semantics are stricter for the TARGET image + * (missing config or missing manifest blob is surfaced as -1) so the caller + * can render a graceful-degrade notice instead of bogus numbers. + * + * Plan 4 oci status reuses this helper by calling it once per pin and + * aggregating the results client-side. + */ + +#pragma once + +#include +#include + +#include "digest.h" +#include "store.h" + +typedef struct { + /* Number of layers in target's image-config rootfs.diff_ids. */ + size_t total_layers; + /* |target.diff_ids ∩ union(other_image.diff_ids)|. A diff_id counted + * here may or may not have a raw cache entry on disk: dedup is about + * cross-image overlap, not cache populate state. + */ + size_t shared_layers; + /* Sum of <store_root>/layers/sha256/<hex>/ tree sizes for diff_ids in the + * intersection that have an extant raw cache entry. Reports logical + * bytes (sum of file st_size) rather than physical disk usage; APFS + * clonefile means the on-disk footprint is typically much smaller. + */ + uint64_t shared_bytes; + /* Number of OTHER images that contributed at least one diff_id, deduped + * by manifest digest so a pin and its unpacked sysroot for the same + * manifest do not double-count. Excludes the target itself. + */ + size_t compared_images; + /* Longest K such that ChainID(target.diff_ids[0..K-1]) is also a + * ChainID reached by some other image's diff_id chain. 0 means no + * shared prefix. + */ + size_t deepest_shared_prefix; + /* The ChainID at depth deepest_shared_prefix, in "<algo>:<hex>" form. + * Empty string when deepest_shared_prefix == 0. Sized for sha512-prefixed + * output so the same buffer fits both digest algos plus the ":" + * separator.
+ */ + char deepest_shared_chainid[OCI_DIGEST_HEX_MAX + 16]; +} oci_dedup_metrics_t; + +/* Compute cross-image dedup metrics for one target image. + * + * target_manifest_digest: the manifest the caller is inspecting, in canonical + * ":" form. Must already be present under /blobs/. For + * an image-index pin, the caller is responsible for resolving the per- + * platform sub-manifest first; this helper does not pick arm64 itself. + * + * volume_root: optional path to the unpacked-sysroot volume (the directory + * holding images/sha256-/). NULL skips the unpacked-tree walk and + * the compute uses only pins recorded in index.json. Missing volume_root + * or empty images/ subtree is treated as the empty case, not an error. + * + * out: receives the computed metrics on success; left zeroed on failure. + * + * Failure model (-1 with errno preserved and *err populated): + * - target manifest blob missing or unparseable + * - target image-config blob missing or unparseable + * - target image-config has no rootfs.diff_ids (spec violation) + * - allocation failure during walk + * + * Other-image failures (malformed config, missing blob, malformed origin + * sidecar) are NOT surfaced: the offending image is skipped and the walk + * continues. compared_images reflects only images that contributed at + * least one usable diff_id list. + */ +int oci_dedup_metrics_compute(oci_store_t *store, + const char *target_manifest_digest, + const char *volume_root, + oci_dedup_metrics_t *out, + const char **err); diff --git a/src/oci/digest-set.c b/src/oci/digest-set.c new file mode 100644 index 0000000..0e6dc9f --- /dev/null +++ b/src/oci/digest-set.c @@ -0,0 +1,121 @@ +/* OCI digest set implementation + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Sorted-array set: add() performs a lower-bound scan with strcmp and + * keeps the array ordered so contains() can use bsearch. 
The capacity + * grows geometrically (x2) because Plan 1 collects up to one entry per + * manifest + config + layer across every pinned image plus every + * unpacked sysroot, and an O(n) realloc per add would dominate that. + */ + +#include "digest-set.h" + +#include +#include +#include +#include + +/* Reset every field to the empty state. A NULL set is ignored. Nothing is + * released here, so calling this on a populated set drops the owned + * strings without freeing them. + */ +void oci_digest_set_init(oci_digest_set_t *s) +{ + if (!s) + return; + s->items = NULL; + s->count = 0; + s->cap = 0; +} + +/* Free each owned string and then the pointer array, and return the set + * to the empty state so a second free (or further adds) is safe. + */ +void oci_digest_set_free(oci_digest_set_t *s) +{ + if (!s) + return; + if (s->items) { + for (size_t i = 0; i < s->count; i++) + free(s->items[i]); + free((void *) s->items); + } + s->items = NULL; + s->count = 0; + s->cap = 0; +} + +/* Locate the lower bound of digest in the sorted items[] range. Returns + * an index in [0, count]; *found is true when items[idx] equals digest. + * s, digest and found are dereferenced unchecked; both callers validate + * s and digest first. + */ +static size_t lower_bound(const oci_digest_set_t *s, + const char *digest, + bool *found) +{ + size_t lo = 0, hi = s->count; + while (lo < hi) { + size_t mid = lo + (hi - lo) / 2; + int cmp = strcmp(s->items[mid], digest); + if (cmp < 0) + lo = mid + 1; + else + hi = mid; + } + *found = lo < s->count && strcmp(s->items[lo], digest) == 0; + return lo; +} + +/* Insert a private copy of digest at its sorted position. Returns 0 when + * the digest was inserted or was already present; -1 with errno = EINVAL + * for a NULL argument, or errno = ENOMEM when growing the array or + * copying the string fails. + */ +int oci_digest_set_add(oci_digest_set_t *s, const char *digest) +{ + if (!s || !digest) { + errno = EINVAL; + return -1; + } + bool found = false; + size_t pos = lower_bound(s, digest, &found); + if (found) + return 0; + + /* Grow before copying the string so a failed strdup below leaves count + * and every existing entry untouched. + */ + if (s->count == s->cap) { + size_t newcap = s->cap ? s->cap * 2 : 16; + void *raw = realloc((void *) s->items, newcap * sizeof(*s->items)); + if (!raw) { + errno = ENOMEM; + return -1; + } + s->items = (char **) raw; + s->cap = newcap; + } + + char *copy = strdup(digest); + if (!copy) { + errno = ENOMEM; + return -1; + } + /* Shift the tail right one slot so the new entry slides into its + * sorted position. memmove handles the overlap; with pos == count + * the move length is zero.
+ */ + if (pos < s->count) + memmove((void *) &s->items[pos + 1], (const void *) &s->items[pos], + (s->count - pos) * sizeof(*s->items)); + s->items[pos] = copy; + s->count++; + return 0; +} + +/* Membership test built on the same binary search add() uses. A NULL set, + * a NULL digest, and an empty set all report false without searching. + */ +bool oci_digest_set_contains(const oci_digest_set_t *s, const char *digest) +{ + if (!s || !digest || s->count == 0) + return false; + bool found = false; + (void) lower_bound(s, digest, &found); + return found; +} + +/* Number of digests currently stored; a NULL set counts as empty. */ +size_t oci_digest_set_size(const oci_digest_set_t *s) +{ + return s ? s->count : 0; +} + +/* Borrow the i-th digest in sorted order. Returns NULL when s is NULL or i + * is out of range; the pointer aliases storage owned by the set. + */ +const char *oci_digest_set_at(const oci_digest_set_t *s, size_t i) +{ + if (!s || i >= s->count) + return NULL; + return s->items[i]; +} diff --git a/src/oci/digest-set.h b/src/oci/digest-set.h new file mode 100644 index 0000000..ba2f568 --- /dev/null +++ b/src/oci/digest-set.h @@ -0,0 +1,63 @@ +/* OCI digest set + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Append-only ordered set of ":" digest strings, used by the + * Plan 1 garbage collector to accumulate the reachable blob set across + * pin walks, image-config blobs, and unpacked-sysroot origin sidecars. + * + * Storage is a sorted, strdup-owned C array. Membership is checked via + * bsearch and insertion uses lower-bound + memmove. The expected + * working size is in the low hundreds (one entry per manifest, config, + * and layer blob across all pinned images), so O(n) per insertion is + * cheap. The C1.3 sweep walks blobs// and calls contains() + * once per blob, which bsearch makes O(log n). If profiling later + * proves the set hot enough to matter, swap to a hash table without + * touching the public API. + * + * The set treats digest strings as opaque bytes: the caller is + * expected to feed only validated ":" forms produced by + * oci_digest_parse so a hand-edited index.json cannot smuggle in an + * uppercase variant that defeats the dedup.
+ */ + +#pragma once + +#include +#include + +typedef struct { + char **items; + size_t count; + size_t cap; +} oci_digest_set_t; + +/* Zero-initialise a set in place. Safe to call on a struct that was + * declared with {0}; provided for readability at allocation sites. + */ +void oci_digest_set_init(oci_digest_set_t *s); + +/* Release every owned digest string and zero the struct. Safe on a + * zero-initialised set and on NULL. + */ +void oci_digest_set_free(oci_digest_set_t *s); + +/* Insert digest into the set. Returns 0 on success or when the digest + * is already present (idempotent), -1 with errno set on failure. A + * NULL digest is rejected with EINVAL so a caller bug does not leak + * into the sweep phase later. + */ +int oci_digest_set_add(oci_digest_set_t *s, const char *digest); + +/* True when digest is in the set. NULL inputs return false. */ +bool oci_digest_set_contains(const oci_digest_set_t *s, const char *digest); + +/* Current cardinality of the set. */ +size_t oci_digest_set_size(const oci_digest_set_t *s); + +/* Borrow the digest string at index i (lexicographically ordered). The + * returned pointer is valid until the next mutating call or free. + * Returns NULL when i is out of range. + */ +const char *oci_digest_set_at(const oci_digest_set_t *s, size_t i); diff --git a/src/oci/digest.c b/src/oci/digest.c new file mode 100644 index 0000000..9b9a060 --- /dev/null +++ b/src/oci/digest.c @@ -0,0 +1,281 @@ +/* Content digests for OCI image blobs + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "digest.h" + +#include +#include +#include +#include +#include + +/* CC_LONG is 32-bit; clamp every update call so multi-gigabyte layers cannot + * overflow the CommonCrypto length argument silently. 1 GiB is well below the + * limit and large enough that the per-call overhead is negligible. 
+ */ +#define DIGESTER_CHUNK_MAX ((size_t) (1u << 30)) + +/* Streaming hash state. algo records which member of ctx was initialised + * and therefore which one every later update / final call must use. + */ +struct oci_digester { + oci_digest_algo_t algo; + union { + CC_SHA256_CTX sha256; + CC_SHA512_CTX sha512; + } ctx; +}; + +static const char HEX_LOWER[] = "0123456789abcdef"; + +/* Encode bin_len bytes as lowercase hex into out and NUL-terminate it. out + * must have room for bin_len * 2 + 1 bytes. + */ +static void bin_to_hex_lower(const uint8_t *bin, size_t bin_len, char *out) +{ + for (size_t i = 0; i < bin_len; i++) { + out[i * 2] = HEX_LOWER[(bin[i] >> 4) & 0xf]; + out[i * 2 + 1] = HEX_LOWER[bin[i] & 0xf]; + } + out[bin_len * 2] = '\0'; +} + +const char *oci_digest_algo_name(oci_digest_algo_t algo) +{ + switch (algo) { + case OCI_DIGEST_SHA256: + return "sha256"; + case OCI_DIGEST_SHA512: + return "sha512"; + } + return NULL; +} + +size_t oci_digest_hex_len(oci_digest_algo_t algo) +{ + switch (algo) { + case OCI_DIGEST_SHA256: + return OCI_DIGEST_SHA256_HEX_LEN; + case OCI_DIGEST_SHA512: + return OCI_DIGEST_SHA512_HEX_LEN; + } + return 0; +} + +bool oci_digest_algo_from_name(const char *name, oci_digest_algo_t *algo) +{ + if (!name || !algo) + return false; + if (!strcmp(name, "sha256")) { + *algo = OCI_DIGEST_SHA256; + return true; + } + if (!strcmp(name, "sha512")) { + *algo = OCI_DIGEST_SHA512; + return true; + } + return false; +} + +bool oci_digest_hex_valid(oci_digest_algo_t algo, const char *hex) +{ + if (!hex) + return false; + size_t want = oci_digest_hex_len(algo); + if (want == 0) + return false; + if (strlen(hex) != want) + return false; + for (size_t i = 0; i < want; i++) { + char c = hex[i]; + bool ok = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); + if (!ok) + return false; + } + return true; +} + +bool oci_digest_parse(const char *colon_form, + oci_digest_algo_t *out_algo, + char *out_hex) +{ + if (!colon_form || !out_algo || !out_hex) + return false; + + out_hex[0] = '\0'; + const char *colon = strchr(colon_form, ':'); + if (!colon || colon == colon_form) + return false; + + /* "sha256" and "sha512" are six characters; a prefix that does not fit + * in name[] together with its NUL cannot be a supported algorithm. + */ + char name[8]; + size_t name_len = (size_t) (colon - colon_form); + if (name_len >= sizeof(name)) + return false; + memcpy(name,
colon_form, name_len); + name[name_len] = '\0'; + + oci_digest_algo_t algo; + if (!oci_digest_algo_from_name(name, &algo)) + return false; + + const char *hex = colon + 1; + if (!oci_digest_hex_valid(algo, hex)) + return false; + + /* hex was validated to be exactly hex_len characters long, so the extra + * byte copied below is its terminating NUL. + */ + *out_algo = algo; + memcpy(out_hex, hex, oci_digest_hex_len(algo) + 1); + return true; +} + +/* Allocate a zeroed digester and initialise the context matching algo. An + * unrecognised algo frees the allocation and returns NULL. + */ +oci_digester_t *oci_digester_new(oci_digest_algo_t algo) +{ + oci_digester_t *d = calloc(1, sizeof(*d)); + if (!d) + return NULL; + d->algo = algo; + switch (algo) { + case OCI_DIGEST_SHA256: + (void) CC_SHA256_Init(&d->ctx.sha256); + break; + case OCI_DIGEST_SHA512: + (void) CC_SHA512_Init(&d->ctx.sha512); + break; + default: + free(d); + return NULL; + } + return d; +} + +void oci_digester_free(oci_digester_t *d) +{ + free(d); +} + +/* Feed len bytes into the running hash in slices of at most + * DIGESTER_CHUNK_MAX. A NULL digester, a NULL buffer, or a zero length is + * a silent no-op. + */ +void oci_digester_update(oci_digester_t *d, const void *buf, size_t len) +{ + if (!d || !buf || len == 0) + return; + const uint8_t *p = buf; + while (len > 0) { + size_t chunk = len > DIGESTER_CHUNK_MAX ? DIGESTER_CHUNK_MAX : len; + switch (d->algo) { + case OCI_DIGEST_SHA256: + (void) CC_SHA256_Update(&d->ctx.sha256, p, (CC_LONG) chunk); + break; + case OCI_DIGEST_SHA512: + (void) CC_SHA512_Update(&d->ctx.sha512, p, (CC_LONG) chunk); + break; + } + p += chunk; + len -= chunk; + } +} + +/* Finalise the hash and write it as NUL-terminated lowercase hex. md is + * sized for SHA-512, the larger of the two digests. Returns the number of + * hex characters written, or 0 when d or out_hex is NULL or d->algo is + * not recognised. + */ +size_t oci_digester_finish_hex(oci_digester_t *d, char *out_hex) +{ + if (!d || !out_hex) + return 0; + uint8_t md[CC_SHA512_DIGEST_LENGTH]; + size_t bin_len = 0; + switch (d->algo) { + case OCI_DIGEST_SHA256: + (void) CC_SHA256_Final(md, &d->ctx.sha256); + bin_len = CC_SHA256_DIGEST_LENGTH; + break; + case OCI_DIGEST_SHA512: + (void) CC_SHA512_Final(md, &d->ctx.sha512); + bin_len = CC_SHA512_DIGEST_LENGTH; + break; + default: + return 0; + } + bin_to_hex_lower(md, bin_len, out_hex); + return bin_len * 2; +} + +size_t oci_digest_bytes(oci_digest_algo_t algo, + const void *buf, + size_t len, + char *out_hex) +{ + if (!out_hex) + return 0; + oci_digester_t *d = oci_digester_new(algo); + if (!d) + return 0; + oci_digester_update(d,
buf, len); + size_t n = oci_digester_finish_hex(d, out_hex); + oci_digester_free(d); + return n; +} + +int oci_chainid_compute(const char *prev_chain, + const char *diff_id, + char *out, + size_t cap) +{ + if (!diff_id || !out || cap == 0) { + errno = EINVAL; + return -1; + } + + /* Validate the diff_id is well-formed before any hashing so the L0 + * passthrough path and the Li hash path agree on input validation. + */ + oci_digest_algo_t diff_algo; + char diff_hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(diff_id, &diff_algo, diff_hex)) { + errno = EINVAL; + return -1; + } + + if (!prev_chain) { + /* L0 case: ChainID(L0) == DiffID(L0). Copy verbatim so a sha512 + * diff_id round-trips unchanged through the layer-0 slot. + */ + size_t diff_len = strlen(diff_id); + if (diff_len + 1 > cap) { + errno = ENAMETOOLONG; + return -1; + } + memcpy(out, diff_id, diff_len + 1); + return 0; + } + + /* Li case: validate prev_chain shape too. The spec allows any + * : digest string in the textual concatenation; in practice + * every ChainID this helper produces is sha256-prefixed, but the + * parser accepts both sha256 and sha512 so a future L0 sha512 diff_id + * still composes correctly with subsequent layers. + */ + oci_digest_algo_t prev_algo; + char prev_hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(prev_chain, &prev_algo, prev_hex)) { + errno = EINVAL; + return -1; + } + + /* Output is always sha256-prefixed: ChainID composition is defined + * with SHA-256 in the OCI image-spec, regardless of the algorithm + * used for the constituent digests. 
+ */ + static const char OUT_PREFIX[] = "sha256:"; + size_t out_need = sizeof(OUT_PREFIX) - 1 + OCI_DIGEST_SHA256_HEX_LEN + 1; + if (cap < out_need) { + errno = ENAMETOOLONG; + return -1; + } + + oci_digester_t *d = oci_digester_new(OCI_DIGEST_SHA256); + if (!d) { + errno = ENOMEM; + return -1; + } + oci_digester_update(d, prev_chain, strlen(prev_chain)); + static const char SP = ' '; + oci_digester_update(d, &SP, 1); + oci_digester_update(d, diff_id, strlen(diff_id)); + + memcpy(out, OUT_PREFIX, sizeof(OUT_PREFIX) - 1); + oci_digester_finish_hex(d, out + sizeof(OUT_PREFIX) - 1); + oci_digester_free(d); + return 0; +} diff --git a/src/oci/digest.h b/src/oci/digest.h new file mode 100644 index 0000000..06b96c5 --- /dev/null +++ b/src/oci/digest.h @@ -0,0 +1,128 @@ +/* Content digests for OCI image blobs + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Wraps macOS CommonCrypto SHA-256 and SHA-512 in a streaming API so the + * blob store and registry client can hash gigabyte-class layer downloads + * without ever buffering the full payload in memory. + * + * Hex output is always lowercase; the OCI image reference parser already + * rejects uppercase digest hex (see src/oci/ref.c), so every digest hex that + * flows between the parser, the manifest fetcher, and the local store must + * stay in the same canonical encoding to avoid silent dedup misses. + */ + +#pragma once + +#include +#include + +typedef enum { + OCI_DIGEST_SHA256, + OCI_DIGEST_SHA512, +} oci_digest_algo_t; + +/* Hex length per algorithm, excluding the trailing NUL. */ +#define OCI_DIGEST_SHA256_HEX_LEN 64 +#define OCI_DIGEST_SHA512_HEX_LEN 128 +#define OCI_DIGEST_HEX_MAX OCI_DIGEST_SHA512_HEX_LEN + +/* Opaque streaming digest. Allocated on the heap because the underlying + * CommonCrypto context is moderately sized (SHA-512 keeps an 80-word state) + * and callers tend to thread a digester pointer through several modules. 
+ */ +typedef struct oci_digester oci_digester_t; + +/* Allocate a streaming digester for algo. Returns NULL on bad enum or oom. */ +oci_digester_t *oci_digester_new(oci_digest_algo_t algo); + +/* Release a digester. Safe on NULL. */ +void oci_digester_free(oci_digester_t *d); + +/* Append data. Splits large buffers into chunks of at most 1 GiB internally + * because CommonCrypto's update takes a 32-bit CC_LONG length and OCI layers + * can exceed 4 GiB. A NULL d, a NULL buf, or a zero len is a no-op. + */ +void oci_digester_update(oci_digester_t *d, const void *buf, size_t len); + +/* Finalize and write the lowercase hex string to out_hex. out_hex must hold + * at least OCI_DIGEST_HEX_MAX + 1 bytes. Returns the hex length on success + * (without trailing NUL) or 0 if d or out_hex is NULL. The digester is + * consumed by this call: the only valid next operation is + * oci_digester_free. + */ +size_t oci_digester_finish_hex(oci_digester_t *d, char *out_hex); + +/* Lookup the algorithm name string ("sha256" / "sha512"). Returns NULL when + * algo is out of range. The returned pointer is to static storage. + */ +const char *oci_digest_algo_name(oci_digest_algo_t algo); + +/* Expected hex length for an algorithm (without trailing NUL). Returns 0 on + * bad enum. + */ +size_t oci_digest_hex_len(oci_digest_algo_t algo); + +/* Parse an algorithm name. Returns true and writes algo on match; false on + * unknown name or a NULL argument. + */ +bool oci_digest_algo_from_name(const char *name, oci_digest_algo_t *algo); + +/* Validate that hex is exactly oci_digest_hex_len(algo) characters and that + * every character is a lowercase hex digit. Rejects NULL. + */ +bool oci_digest_hex_valid(oci_digest_algo_t algo, const char *hex); + +/* Parse ":" into algo and a canonical lowercase hex copy. The + * input hex must already be lowercase; mixed case is rejected to match the + * reference parser. out_hex must hold OCI_DIGEST_HEX_MAX + 1 bytes. On + * success returns true; otherwise returns false and out_hex is left zeroed.
+ */ +bool oci_digest_parse(const char *colon_form, + oci_digest_algo_t *out_algo, + char *out_hex); + +/* One-shot helper: compute algo over buf/len and emit lowercase hex into + * out_hex (which must hold OCI_DIGEST_HEX_MAX + 1 bytes). Returns the hex + * length on success or 0 on bad enum / NULL output. + */ +size_t oci_digest_bytes(oci_digest_algo_t algo, + const void *buf, + size_t len, + char *out_hex); + +/* Compute the OCI image-spec ChainID for one layer in canonical + * ":" form. ChainID is the cumulative content key used by the + * Plan 3 C3.3 stack cache: ChainID(L0) == DiffID(L0), and for any later + * layer ChainID(Li) == sha256(" ") where the input is + * the previous chain string, an ASCII space (0x20), and the current layer's + * diff_id string, both in their canonical ":" form. See OCI + * image-spec v1.0.2 section 3.4 "Layer ChainID" for the reference text. + * + * The output is always sha256-prefixed regardless of diff_id's algorithm: + * ChainID composition is defined over the textual digest representation, so + * a sha512 diff_id contributes its full "sha512:" string but the result + * is hashed with SHA-256 (the only ChainID algorithm the spec defines). + * + * Parameters: + * prev_chain NULL signals the L0 case; the helper copies diff_id into + * out verbatim and returns 0. Non-NULL must be a valid + * ":" string in canonical lowercase form. + * diff_id This layer's diff_id; must be non-NULL and ":". + * out Receives the new ChainID string (NUL-terminated, always + * "sha256:<64-hex>" when prev_chain != NULL, or a copy of + * diff_id when prev_chain == NULL). + * cap Capacity of out in bytes. Must be at least + * OCI_DIGEST_HEX_MAX + 16 so a sha512 diff_id fits in the L0 + * passthrough path. 
+ * + * Returns 0 on success, -1 with errno set on failure: + * EINVAL diff_id NULL, prev_chain non-NULL but malformed, or + * diff_id malformed + * ENAMETOOLONG cap too small to hold the result + * ENOMEM internal SHA-256 digester allocation failed + */ +int oci_chainid_compute(const char *prev_chain, + const char *diff_id, + char *out, + size_t cap); diff --git a/src/oci/fetch.c b/src/oci/fetch.c new file mode 100644 index 0000000..753c137 --- /dev/null +++ b/src/oci/fetch.c @@ -0,0 +1,1671 @@ +/* OCI registry HTTPS client + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Implements anonymous and bearer-challenge HTTPS pulls against the OCI + * distribution-spec /v2/ endpoints. Manifest fetches return body bytes plus a + * captured Content-Type and Docker-Content-Digest so the slice-3 parser and + * future tag-to-digest pinning can consume them directly. Blob fetches stream + * the response body into the slice-2 blob store, capping the running byte + * count at the descriptor's declared size and letting the writer's digest + * check reject any payload that hashes to anything other than the descriptor + * hex. + * + * The 401 retry path is "try anonymous first, then parse Www-Authenticate, + * fetch a token, retry once". A second 401 propagates as a fetch failure; the + * caller decides whether to surface authorization-failed or treat it as a + * transient network error. The cached bearer token is invalidated by any 401 + * but otherwise reused across requests on the same fetcher, so a pull of an + * image with N layers makes one token call rather than N+1. + */ + +#include "fetch.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "policy.h" + +/* Hard ceiling on a single manifest / index / config response. Real-world + * documents are well under 1 MiB; the limit is here so a misbehaving registry + * cannot fill memory with an unbounded body. 
Blob responses do not flow + * through this buffer; they stream into the blob store. + */ +#define FETCH_BODY_MAX ((size_t) 16 * 1024 * 1024) + +typedef struct { + char *realm; + char *service; + char *scope; +} bearer_challenge_t; + +struct oci_fetcher { + CURL *easy; + char *base_url_override; + char *bearer_token; + bearer_challenge_t challenge; + /* Pre-built "user:pass" string for CURLOPT_USERPWD. NULL when CLI basic + * auth is disabled. The fetcher attaches it to every easy-handle reset + * (manifest GET, blob GET, token GET) so a registry that bridges basic + * and bearer sees the basic credentials on both the manifest probe and + * the token exchange. + */ + char *user_pass; + /* PEM bundle path passed through to CURLOPT_CAINFO. NULL leaves libcurl on + * its compiled-in trust store. + */ + char *ca_file; + bool allow_insecure; + /* Caller-owned policy. NULL when the caller has not loaded a policy.json. + * Consulted by resolve_effective on every manifest/blob entry; the + * fetcher does not take a copy and does not free it. + */ + const oci_policy_t *policy; +}; + +/* Per-request merge of CLI-supplied options and the policy lookup for the + * current ref->registry. resolve_effective produces one of these and the + * request paths read it instead of f->{user_pass,ca_file,allow_insecure}. + * Strings are borrowed (point into f->* or into the policy_t entry) except + * user_pass_loaded, which holds a heap "user:pass" built from a policy + * auth_file. effective_free releases that one allocation. 
+ */ +typedef struct { + const char *user_pass; + const char *ca_file; + bool allow_insecure; + char *user_pass_loaded; +} effective_opts_t; + +static void effective_free(effective_opts_t *eff) +{ + if (!eff) + return; + free(eff->user_pass_loaded); + eff->user_pass_loaded = NULL; + eff->user_pass = NULL; + eff->ca_file = NULL; + eff->allow_insecure = false; +} + +/* Build the per-request effective options from the fetcher's CLI defaults and + * any policy entry matching ref->registry. CLI flags win: a CLI-supplied + * user_pass / ca_file / allow_insecure shadows the policy value for the same + * field. A policy auth_file is loaded via oci_policy_load_auth, which + * enforces 0600 mode and the {username,password} JSON shape. Returns 0 on + * success, -1 with errno + *err_msg on auth_file load failure. The caller + * always invokes effective_free, including on rc != 0. + */ +static int resolve_effective(const oci_fetcher_t *f, + const oci_ref_t *ref, + effective_opts_t *eff, + const char **err_msg) +{ + memset(eff, 0, sizeof(*eff)); + eff->user_pass = f->user_pass; + eff->ca_file = f->ca_file; + eff->allow_insecure = f->allow_insecure; + + if (!f->policy || !ref || !ref->registry) + return 0; + + oci_policy_effective_t pol; + oci_policy_lookup(f->policy, ref->registry, &pol); + + if (!eff->allow_insecure && pol.insecure) + eff->allow_insecure = true; + if (!eff->ca_file && pol.ca_bundle) + eff->ca_file = pol.ca_bundle; + + /* Only consult policy auth_file when the caller did not supply CLI + * credentials. The load happens per-request; auth files are small and + * mode-checked each time, which avoids any cache-vs-disk consistency + * worry at the cost of re-parsing a sub-kilobyte JSON document. + */ + if (!eff->user_pass && pol.auth_file) { + char *user = NULL; + char *pass = NULL; + const char *aerr = NULL; + if (oci_policy_load_auth(pol.auth_file, &user, &pass, &aerr) < 0) { + int e = errno; + free(user); + free(pass); + if (err_msg) + *err_msg = aerr ? 
aerr : "policy auth file load failed"; + errno = e ? e : EINVAL; + return -1; + } + size_t ul = strlen(user); + size_t pl = strlen(pass); + char *up = malloc(ul + 1 + pl + 1); + if (!up) { + free(user); + free(pass); + if (err_msg) + *err_msg = "out of memory composing policy credentials"; + errno = ENOMEM; + return -1; + } + memcpy(up, user, ul); + up[ul] = ':'; + memcpy(up + ul + 1, pass, pl); + up[ul + 1 + pl] = '\0'; + free(user); + free(pass); + eff->user_pass_loaded = up; + eff->user_pass = up; + } + return 0; +} + +/* One-time libcurl global initialisation state. g_curl_init_rc stays -1 + * until curl_global_once records a successful curl_global_init. + */ +static pthread_once_t g_curl_init_once = PTHREAD_ONCE_INIT; +static int g_curl_init_rc = -1; + +static void curl_global_once(void) +{ + g_curl_init_rc = curl_global_init(CURL_GLOBAL_DEFAULT) == CURLE_OK ? 0 : -1; +} + +/* Run curl_global_init at most once per process via pthread_once. Returns + * 0 on success, or -1 with errno = EIO when the initialisation failed; the + * stored result is reported again on every later call. + */ +int oci_fetch_global_init(void) +{ + pthread_once(&g_curl_init_once, curl_global_once); + if (g_curl_init_rc < 0) + errno = EIO; + return g_curl_init_rc; +} + +void oci_fetch_global_cleanup(void) +{ + /* Intentionally empty: curl_global_cleanup is not safe under threading, + * and the elfuse process lives for the duration of one pull, so leaving + * libcurl initialized is fine. + */ +} + +/* Release the three challenge strings and NULL the fields so the struct can + * be reused. The struct itself is not freed; NULL is ignored. + */ +static void bearer_challenge_free(bearer_challenge_t *c) +{ + if (!c) + return; + free(c->realm); + free(c->service); + free(c->scope); + c->realm = NULL; + c->service = NULL; + c->scope = NULL; +} + +/* Compose a heap-allocated "user:pass" string. A NULL pass yields "user:". + * Returns NULL when user is NULL or the allocation fails; the caller frees + * the result. + */ +static char *build_user_pass(const char *user, const char *pass) +{ + if (!user) + return NULL; + size_t ul = strlen(user); + size_t pl = pass ?
strlen(pass) : 0; + char *out = malloc(ul + 1 + pl + 1); + if (!out) + return NULL; + memcpy(out, user, ul); + out[ul] = ':'; + if (pl) + memcpy(out + ul + 1, pass, pl); + out[ul + 1 + pl] = '\0'; + return out; +} + +/* Create a fetcher. opts may be NULL, which leaves every option at its + * zeroed default. base_url_override and ca_file are copied, username and + * password are combined into an owned "user:pass" string, and policy is + * stored as a borrowed pointer. Returns NULL with errno = EIO when libcurl + * cannot be initialised or no easy handle can be created, or with + * errno = ENOMEM on allocation failure; every partial allocation is + * released before returning. + */ +oci_fetcher_t *oci_fetcher_new(const oci_fetcher_options_t *opts) +{ + if (oci_fetch_global_init() < 0) + return NULL; + oci_fetcher_t *f = calloc(1, sizeof(*f)); + if (!f) { + errno = ENOMEM; + return NULL; + } + f->easy = curl_easy_init(); + if (!f->easy) { + free(f); + errno = EIO; + return NULL; + } + if (opts && opts->base_url_override) { + f->base_url_override = strdup(opts->base_url_override); + if (!f->base_url_override) { + curl_easy_cleanup(f->easy); + free(f); + errno = ENOMEM; + return NULL; + } + } + if (opts && opts->username) { + f->user_pass = build_user_pass(opts->username, opts->password); + if (!f->user_pass) { + curl_easy_cleanup(f->easy); + free(f->base_url_override); + free(f); + errno = ENOMEM; + return NULL; + } + } + if (opts && opts->ca_file) { + f->ca_file = strdup(opts->ca_file); + if (!f->ca_file) { + curl_easy_cleanup(f->easy); + free(f->base_url_override); + free(f->user_pass); + free(f); + errno = ENOMEM; + return NULL; + } + } + if (opts) + f->allow_insecure = opts->allow_insecure; + if (opts) + f->policy = opts->policy; + return f; +} + +/* Destroy a fetcher: the easy handle, every owned string, and the cached + * bearer challenge. The policy pointer is not freed here. Safe on NULL. + */ +void oci_fetcher_free(oci_fetcher_t *f) +{ + if (!f) + return; + if (f->easy) + curl_easy_cleanup(f->easy); + free(f->base_url_override); + free(f->bearer_token); + bearer_challenge_free(&f->challenge); + free(f->user_pass); + free(f->ca_file); + free(f); +} + +/* Release the heap fields of r and reset its length and status to zero. r + * itself is not freed; NULL is ignored. + */ +void oci_fetch_response_free(oci_fetch_response_t *r) +{ + if (!r) + return; + free(r->body); + free(r->content_type); + free(r->docker_content_digest); + free(r->etag); + r->body = NULL; + r->content_type = NULL; + r->docker_content_digest = NULL; + r->etag = NULL; + r->body_len = 0; + r->http_status = 0; +} + +/* Strip the [bracketed] form of an IPv6 literal and any trailing :port from a + * registry-shaped string
("127.0.0.1:fake", "ghcr.io", "[::1]:5000", + * "registry.example.com"). Writes the bare host into out and returns true on + * success; returns false when out is too small to fit the result. + * + * Bracketed IPv6 forms have a colon inside the address, so port-stripping + * keys off the closing ']'; for non-bracketed registries the rightmost ':' + * is the port delimiter. + */ +static bool extract_host_from_registry(const char *reg, char *out, size_t cap) +{ + /* NULL arguments and a zero-capacity buffer also yield false. */ + if (!reg || !out || cap == 0) + return false; + if (reg[0] == '[') { + /* An opening '[' with no matching ']' is rejected as malformed. */ + const char *close = strchr(reg, ']'); + if (!close) + return false; + size_t n = (size_t) (close - reg - 1); + if (n + 1 > cap) + return false; + memcpy(out, reg + 1, n); + out[n] = '\0'; + return true; + } + /* An unbracketed IPv6 literal would be cut at its last ':' here, so such + * a registry has to be written in the [addr] form to survive intact. + */ + const char *colon = strrchr(reg, ':'); + size_t n = colon ? (size_t) (colon - reg) : strlen(reg); + if (n + 1 > cap) + return false; + memcpy(out, reg, n); + out[n] = '\0'; + return true; +} + +/* True when host is one of the three accepted loopback spellings + * (127.0.0.1, localhost, ::1), compared case-insensitively. Any other + * address, including other 127.x.y.z forms, is not treated as loopback. + */ +static bool is_loopback_host(const char *host) +{ + if (!host) + return false; + if (!strcasecmp(host, "127.0.0.1")) + return true; + if (!strcasecmp(host, "localhost")) + return true; + if (!strcasecmp(host, "::1")) + return true; + return false; +} + +/* Reject allow_insecure when the registry host is not on the loopback + * whitelist. Honors ref->registry as the authoritative target even when a + * test passes base_url_override, so that policy reflects the production + * surface ("which host am I pulling from?") rather than where the bytes + * happen to flow during a unit test. The decision is made on the effective + * opts (CLI || policy), so a policy insecure=true on a non-loopback host + * fails the same way a CLI --insecure on a non-loopback host fails.
+ */ +static int check_insecure_policy(const effective_opts_t *eff, + const oci_ref_t *ref, + const char **err_msg) +{ + if (!eff->allow_insecure) + return 0; + char host[256]; + if (!extract_host_from_registry(ref->registry, host, sizeof(host))) { + if (err_msg) + *err_msg = "registry host is malformed"; + errno = EINVAL; + return -1; + } + if (!is_loopback_host(host)) { + if (err_msg) + *err_msg = "allow_insecure is restricted to loopback registries"; + errno = EPERM; + return -1; + } + return 0; +} + +/* Apply the per-request effective security options to the easy handle in its + * post-reset state. Called from every GET path (manifest, blob, token) after + * curl_easy_reset so the option set survives the reset. + */ +static void apply_security_opts(CURL *easy, const effective_opts_t *eff) +{ + if (eff->user_pass) { + curl_easy_setopt(easy, CURLOPT_USERPWD, eff->user_pass); + curl_easy_setopt(easy, CURLOPT_HTTPAUTH, (long) CURLAUTH_BASIC); + } + if (eff->ca_file) + curl_easy_setopt(easy, CURLOPT_CAINFO, eff->ca_file); + if (eff->allow_insecure) { + curl_easy_setopt(easy, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(easy, CURLOPT_SSL_VERIFYHOST, 0L); + } +} + +/* docker.io is the canonical registry name from the reference parser; the + * actual API host is registry-1.docker.io. Every other registry (ghcr.io, + * quay.io, public.ecr.aws, mirrors) uses its own host directly. 
+ */ +static const char *api_host_for_registry(const char *reg) +{ + if (reg && !strcmp(reg, "docker.io")) + return "registry-1.docker.io"; + return reg; +} + +/* Return a heap copy of the URL prefix for ref: the fetcher's + * base_url_override verbatim when one is set, otherwise the https scheme + * followed by the API host for ref->registry. Returns NULL on allocation + * failure or when ref->registry is NULL; the caller frees the result. + */ +static char *build_base_url(const oci_fetcher_t *f, const oci_ref_t *ref) +{ + if (f->base_url_override) + return strdup(f->base_url_override); + const char *host = api_host_for_registry(ref->registry); + if (!host) + return NULL; + size_t n = strlen(host) + sizeof("https://"); + char *url = malloc(n); + if (!url) + return NULL; + snprintf(url, n, "https://%s", host); + return url; +} + +/* Return a heap string of the form base/v2/repository/manifests/selector, + * or NULL on allocation failure. selector is inserted verbatim. The caller + * frees the result. + */ +static char *build_manifest_url(const oci_fetcher_t *f, + const oci_ref_t *ref, + const char *selector) +{ + char *base = build_base_url(f, ref); + if (!base) + return NULL; + size_t n = strlen(base) + strlen(ref->repository) + strlen(selector) + + sizeof("/v2//manifests/"); + char *url = malloc(n); + if (!url) { + free(base); + return NULL; + } + snprintf(url, n, "%s/v2/%s/manifests/%s", base, ref->repository, selector); + free(base); + return url; +} + +/* Return a heap string of the form base/v2/repository/blobs/digest_str, or + * NULL on allocation failure. digest_str is inserted verbatim. The caller + * frees the result. + */ +static char *build_blob_url(const oci_fetcher_t *f, + const oci_ref_t *ref, + const char *digest_str) +{ + char *base = build_base_url(f, ref); + if (!base) + return NULL; + size_t n = strlen(base) + strlen(ref->repository) + strlen(digest_str) + + sizeof("/v2//blobs/"); + char *url = malloc(n); + if (!url) { + free(base); + return NULL; + } + snprintf(url, n, "%s/v2/%s/blobs/%s", base, ref->repository, digest_str); + free(base); + return url; +} + +/* Growable, NUL-terminated response buffer. max bounds len + 1 (payload + * plus terminator); overflow latches once that bound would be exceeded or + * a realloc fails, and every later write is then refused. + */ +typedef struct { + char *buf; + size_t len; + size_t cap; + size_t max; + bool overflow; +} body_buf_t; + +/* Write callback that accumulates the response body into a body_buf_t. + * Returns n when the bytes were stored, or 0 once the buffer has + * overflowed or could not grow; under the libcurl write-callback contract + * a short count aborts the transfer. + */ +static size_t body_write_cb(char *ptr, + size_t size, + size_t nmemb, + void *userdata) +{ + body_buf_t *b = userdata; + size_t n = size * nmemb; + if (b->overflow) + return 0; + if (b->len + n + 1 > b->max) { + b->overflow = true; + return 0; + } + if (b->len + n + 1 > b->cap) { + size_t newcap = b->cap ?
b->cap : 4096; + while (newcap < b->len + n + 1) + newcap *= 2; + /* Doubling may overshoot the ceiling. Clamping to max + 1 is still + * enough because the check above guaranteed len + n + 1 <= max. + */ + if (newcap > b->max + 1) + newcap = b->max + 1; + char *r = realloc(b->buf, newcap); + if (!r) { + b->overflow = true; + return 0; + } + b->buf = r; + b->cap = newcap; + } + memcpy(b->buf + b->len, ptr, n); + b->len += n; + b->buf[b->len] = '\0'; + return n; +} + +/* Trim isspace() whitespace from both ends of s. Leading whitespace is + * skipped by advancing the returned pointer (the buffer start does not + * move); trailing whitespace is overwritten with NULs. Returns NULL for a + * NULL input. + */ +static char *trim_inplace(char *s) +{ + if (!s) + return NULL; + while (*s && isspace((unsigned char) *s)) + s++; + size_t n = strlen(s); + while (n > 0 && isspace((unsigned char) s[n - 1])) { + s[n - 1] = '\0'; + n--; + } + return s; +} + +/* If line starts with key (compared case-insensitively) immediately + * followed by ':', return a pointer to the value after the colon and any + * spaces or tabs; otherwise return NULL. The result aliases line. + */ +static char *match_header(char *line, const char *key) +{ + size_t klen = strlen(key); + if (strncasecmp(line, key, klen) != 0) + return NULL; + if (line[klen] != ':') + return NULL; + char *v = line + klen + 1; + while (*v == ' ' || *v == '\t') + v++; + return v; +} + +/* Heap-copy the bytes in [s, end) as a NUL-terminated string. Returns NULL + * on allocation failure. end must not precede s, since the length is + * computed as an unsigned difference. + */ +static char *strdup_range(const char *s, const char *end) +{ + size_t n = (size_t) (end - s); + char *r = malloc(n + 1); + if (!r) + return NULL; + memcpy(r, s, n); + r[n] = '\0'; + return r; +} + +/* Parse a Bearer challenge value into realm/service/scope. Accepts unquoted + * values too (some test fixtures and a few private registries skip the + * quotes). Returns 0 on success or -1 on malformed input. On success *out is + * fully owned by the caller; any prior contents are freed.
+ */ +static int parse_bearer_challenge(const char *value, bearer_challenge_t *out) +{ + bearer_challenge_t tmp = {0}; + const char *p = value; + while (*p == ' ' || *p == '\t') + p++; + if (strncasecmp(p, "Bearer", 6) != 0) + return -1; + p += 6; + while (*p == ' ' || *p == '\t') + p++; + while (*p) { + const char *key_start = p; + while (*p && *p != '=' && *p != ',') + p++; + if (*p != '=') { + bearer_challenge_free(&tmp); + return -1; + } + const char *key_end = p; + p++; + char *value_str; + /* A quoted value runs to the next double quote; backslash escapes are + * not interpreted. An unquoted value runs to the next comma. + */ + if (*p == '"') { + p++; + const char *vstart = p; + while (*p && *p != '"') + p++; + if (*p != '"') { + bearer_challenge_free(&tmp); + return -1; + } + value_str = strdup_range(vstart, p); + p++; + } else { + const char *vstart = p; + while (*p && *p != ',') + p++; + value_str = strdup_range(vstart, p); + } + if (!value_str) { + bearer_challenge_free(&tmp); + return -1; + } + /* Unknown keys are parsed and discarded; a repeated known key keeps + * the last value seen. + */ + size_t klen = (size_t) (key_end - key_start); + char **target = NULL; + if (klen == 5 && !strncasecmp(key_start, "realm", 5)) + target = &tmp.realm; + else if (klen == 7 && !strncasecmp(key_start, "service", 7)) + target = &tmp.service; + else if (klen == 5 && !strncasecmp(key_start, "scope", 5)) + target = &tmp.scope; + if (target) { + free(*target); + *target = value_str; + } else { + free(value_str); + } + while (*p == ',' || *p == ' ' || *p == '\t') + p++; + } + /* realm is the only mandatory parameter. */ + if (!tmp.realm) { + bearer_challenge_free(&tmp); + return -1; + } + bearer_challenge_free(out); + *out = tmp; + return 0; +} + +/* Response headers captured by header_cb for one request. The three + * strings are heap copies that the owner of the struct must free. + * challenge_out, when non-NULL, receives a parsed Www-Authenticate Bearer + * challenge. + */ +typedef struct { + char *content_type; + char *docker_content_digest; + char *etag; + bearer_challenge_t *challenge_out; +} headers_ctx_t; + +/* Header callback: copies one header line into a local buffer, strips the + * trailing CR/LF, and records the headers of interest in the + * headers_ctx_t. Empty lines and lines of 4096 bytes or more are ignored. + * The full byte count is always returned, so an unrecognised or oversized + * header is never reported as a short write. + */ +static size_t header_cb(char *buffer, + size_t size, + size_t nitems, + void *userdata) +{ + headers_ctx_t *ctx = userdata; + size_t n = size * nitems; + size_t total = n; + if (n == 0 || n >= 4096) + return total; + char line[4096]; + memcpy(line, buffer, n); + line[n] = '\0'; + while (n > 0 && (line[n - 1] == '\r' || line[n - 1] == '\n')) + line[--n] = '\0'; + if (n == 0) +
return total; + + char *v = match_header(line, "Content-Type"); + if (v) { + v = trim_inplace(v); + char *semi = strchr(v, ';'); + if (semi) + *semi = '\0'; + v = trim_inplace(v); + free(ctx->content_type); + ctx->content_type = strdup(v); + return total; + } + v = match_header(line, "Docker-Content-Digest"); + if (v) { + v = trim_inplace(v); + free(ctx->docker_content_digest); + ctx->docker_content_digest = strdup(v); + return total; + } + v = match_header(line, "ETag"); + if (v) { + v = trim_inplace(v); + free(ctx->etag); + ctx->etag = strdup(v); + return total; + } + if (ctx->challenge_out) { + v = match_header(line, "Www-Authenticate"); + if (v) { + v = trim_inplace(v); + (void) parse_bearer_challenge(v, ctx->challenge_out); + } + } + return total; +} + +static struct curl_slist *build_request_headers(const oci_fetcher_t *f, + const char *const *accept_types, + const char *if_none_match) +{ + struct curl_slist *hdrs = NULL; + if (accept_types) { + for (const char *const *p = accept_types; *p; p++) { + char hdr[256]; + snprintf(hdr, sizeof(hdr), "Accept: %s", *p); + hdrs = curl_slist_append(hdrs, hdr); + } + } + if (f->bearer_token) { + size_t n = strlen(f->bearer_token) + sizeof("Authorization: Bearer "); + char *hdr = malloc(n); + if (hdr) { + snprintf(hdr, n, "Authorization: Bearer %s", f->bearer_token); + hdrs = curl_slist_append(hdrs, hdr); + free(hdr); + } + } + if (if_none_match) { + size_t n = strlen(if_none_match) + sizeof("If-None-Match: "); + char *hdr = malloc(n); + if (hdr) { + snprintf(hdr, n, "If-None-Match: %s", if_none_match); + hdrs = curl_slist_append(hdrs, hdr); + free(hdr); + } + } + return hdrs; +} + +static int fetch_token(oci_fetcher_t *f, + const effective_opts_t *eff, + const char **err_msg) +{ + if (!f->challenge.realm) { + if (err_msg) + *err_msg = "no bearer realm to fetch token from"; + errno = EINVAL; + return -1; + } + + char *enc_service = f->challenge.service + ? 
curl_easy_escape(f->easy, f->challenge.service, 0) + : NULL; + char *enc_scope = f->challenge.scope + ? curl_easy_escape(f->easy, f->challenge.scope, 0) + : NULL; + size_t n = strlen(f->challenge.realm) + + (enc_service ? strlen(enc_service) + 16 : 0) + + (enc_scope ? strlen(enc_scope) + 16 : 0) + 2; + char *url = malloc(n); + if (!url) { + curl_free(enc_service); + curl_free(enc_scope); + if (err_msg) + *err_msg = "out of memory"; + errno = ENOMEM; + return -1; + } + int len = snprintf(url, n, "%s", f->challenge.realm); + char sep = strchr(f->challenge.realm, '?') ? '&' : '?'; + if (enc_service) { + len += snprintf(url + len, n - (size_t) len, "%cservice=%s", sep, + enc_service); + sep = '&'; + } + if (enc_scope) { + snprintf(url + len, n - (size_t) len, "%cscope=%s", sep, enc_scope); + } + curl_free(enc_service); + curl_free(enc_scope); + + body_buf_t body = {.max = FETCH_BODY_MAX}; + headers_ctx_t hctx = {0}; + curl_easy_reset(f->easy); + apply_security_opts(f->easy, eff); + curl_easy_setopt(f->easy, CURLOPT_URL, url); + curl_easy_setopt(f->easy, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(f->easy, CURLOPT_MAXREDIRS, 5L); + curl_easy_setopt(f->easy, CURLOPT_USERAGENT, "elfuse-oci/1"); + curl_easy_setopt(f->easy, CURLOPT_WRITEFUNCTION, body_write_cb); + curl_easy_setopt(f->easy, CURLOPT_WRITEDATA, &body); + curl_easy_setopt(f->easy, CURLOPT_HEADERFUNCTION, header_cb); + curl_easy_setopt(f->easy, CURLOPT_HEADERDATA, &hctx); + + CURLcode rc = curl_easy_perform(f->easy); + long status = 0; + curl_easy_getinfo(f->easy, CURLINFO_RESPONSE_CODE, &status); + free(url); + free(hctx.content_type); + free(hctx.docker_content_digest); + + if (rc != CURLE_OK) { + free(body.buf); + if (err_msg) + *err_msg = curl_easy_strerror(rc); + errno = EIO; + return -1; + } + if (status < 200 || status >= 300) { + free(body.buf); + if (err_msg) + *err_msg = "token endpoint returned non-2xx status"; + errno = EPROTO; + return -1; + } + if (!body.buf || body.len == 0) { + 
free(body.buf); + if (err_msg) + *err_msg = "token endpoint returned empty body"; + errno = EPROTO; + return -1; + } + + cJSON *json = cJSON_ParseWithLength(body.buf, body.len); + free(body.buf); + if (!json) { + if (err_msg) + *err_msg = "token endpoint returned invalid JSON"; + errno = EPROTO; + return -1; + } + cJSON *t = cJSON_GetObjectItemCaseSensitive(json, "token"); + if (!cJSON_IsString(t) || !t->valuestring) + t = cJSON_GetObjectItemCaseSensitive(json, "access_token"); + if (!cJSON_IsString(t) || !t->valuestring) { + cJSON_Delete(json); + if (err_msg) + *err_msg = "token endpoint response lacks 'token' field"; + errno = EPROTO; + return -1; + } + free(f->bearer_token); + f->bearer_token = strdup(t->valuestring); + cJSON_Delete(json); + if (!f->bearer_token) { + if (err_msg) + *err_msg = "out of memory caching token"; + errno = ENOMEM; + return -1; + } + return 0; +} + +static int perform_manifest_get(oci_fetcher_t *f, + const effective_opts_t *eff, + const char *url, + const char *const *accept_types, + const char *if_none_match, + oci_fetch_response_t *out, + bearer_challenge_t *challenge_out, + const char **err_msg) +{ + body_buf_t body = {.max = FETCH_BODY_MAX}; + headers_ctx_t hctx = {.challenge_out = challenge_out}; + if (challenge_out) + bearer_challenge_free(challenge_out); + + curl_easy_reset(f->easy); + apply_security_opts(f->easy, eff); + curl_easy_setopt(f->easy, CURLOPT_URL, url); + curl_easy_setopt(f->easy, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(f->easy, CURLOPT_MAXREDIRS, 5L); + curl_easy_setopt(f->easy, CURLOPT_USERAGENT, "elfuse-oci/1"); + curl_easy_setopt(f->easy, CURLOPT_WRITEFUNCTION, body_write_cb); + curl_easy_setopt(f->easy, CURLOPT_WRITEDATA, &body); + curl_easy_setopt(f->easy, CURLOPT_HEADERFUNCTION, header_cb); + curl_easy_setopt(f->easy, CURLOPT_HEADERDATA, &hctx); + struct curl_slist *hdrs = + build_request_headers(f, accept_types, if_none_match); + if (hdrs) + curl_easy_setopt(f->easy, CURLOPT_HTTPHEADER, hdrs); + + 
CURLcode rc = curl_easy_perform(f->easy); + long status = 0; + curl_easy_getinfo(f->easy, CURLINFO_RESPONSE_CODE, &status); + if (hdrs) + curl_slist_free_all(hdrs); + + out->http_status = status; + if (rc != CURLE_OK) { + free(body.buf); + free(hctx.content_type); + free(hctx.docker_content_digest); + free(hctx.etag); + if (err_msg) + *err_msg = curl_easy_strerror(rc); + errno = EIO; + return -1; + } + if (body.overflow) { + free(body.buf); + free(hctx.content_type); + free(hctx.docker_content_digest); + free(hctx.etag); + if (err_msg) + *err_msg = "response body exceeded max size"; + errno = EFBIG; + return -1; + } + out->body = body.buf; + out->body_len = body.len; + out->content_type = hctx.content_type; + out->docker_content_digest = hctx.docker_content_digest; + out->etag = hctx.etag; + return 0; +} + +int oci_fetch_manifest(oci_fetcher_t *f, + const oci_ref_t *ref, + const char *digest_or_tag, + const char *const *accept_types, + const char *if_none_match, + oci_fetch_response_t *out, + const char **err_msg) +{ + if (!f || !ref || !out) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + memset(out, 0, sizeof(*out)); + effective_opts_t eff; + if (resolve_effective(f, ref, &eff, err_msg) < 0) + return -1; + if (check_insecure_policy(&eff, ref, err_msg) < 0) { + effective_free(&eff); + return -1; + } + const char *selector = digest_or_tag; + if (!selector) + selector = ref->digest; + if (!selector) + selector = ref->tag; + if (!selector) { + if (err_msg) + *err_msg = "reference has no tag or digest"; + errno = EINVAL; + effective_free(&eff); + return -1; + } + char *url = build_manifest_url(f, ref, selector); + if (!url) { + if (err_msg) + *err_msg = "out of memory"; + errno = ENOMEM; + effective_free(&eff); + return -1; + } + + bearer_challenge_t challenge = {0}; + /* Always capture the Bearer challenge, even when a token is already + * cached: registry tokens are short-lived (Docker Hub expires them in + * ~300s), so a long 
multi-blob pull can outlive the cached token. If the + * stale token 401s, the captured challenge lets the block below refresh + * it and retry instead of failing the pull. + */ + int rc = perform_manifest_get(f, &eff, url, accept_types, if_none_match, + out, &challenge, err_msg); + if (rc < 0) { + free(url); + bearer_challenge_free(&challenge); + effective_free(&eff); + return -1; + } + + if (out->http_status == 401 && challenge.realm) { + bearer_challenge_free(&f->challenge); + f->challenge = challenge; + memset(&challenge, 0, sizeof(challenge)); + oci_fetch_response_free(out); + memset(out, 0, sizeof(*out)); + if (fetch_token(f, &eff, err_msg) < 0) { + free(url); + effective_free(&eff); + return -1; + } + rc = perform_manifest_get(f, &eff, url, accept_types, if_none_match, + out, NULL, err_msg); + if (rc < 0) { + free(url); + effective_free(&eff); + return -1; + } + } else { + bearer_challenge_free(&challenge); + } + + free(url); + effective_free(&eff); + + /* 304 Not Modified is a success path for conditional revalidation: the + * caller asked the registry whether the pinned digest still matches and + * the answer is yes. The body is intentionally empty; the etag (when the + * server emitted one) stays attached for caller diagnostics. + */ + if (out->http_status == 304) + return 0; + if (out->http_status < 200 || out->http_status >= 300) { + if (err_msg) + *err_msg = "manifest fetch returned non-2xx status"; + errno = EPROTO; + return -1; + } + return 0; +} + +typedef struct { + oci_blob_writer_t *w; + /* The easy handle this stream feeds. Needed so the body callback can + * peek CURLINFO_RESPONSE_CODE on the first chunk and notice when a + * server ignored the Range header (200 instead of 206) before any + * bytes get committed to the writer. + */ + CURL *easy; + int64_t bytes_seen; + int64_t bytes_expected; + /* Bytes already present on disk in the writer's partial. Zero on a + * fresh fetch. Drives the body-callback's status peek. 
+ */ + int64_t resume_offset; + bool overflow; + bool write_failed; + /* Set when the body callback observes a non-206 status while the + * request carried a Range header. Triggers BH_NEEDS_RESTART in the + * score path; the writer's polluted digester state is discarded + * along with the partial when the restart re-arms a fresh writer. + */ + bool range_rejected; +} blob_stream_ctx_t; + +static size_t blob_stream_cb(char *ptr, + size_t size, + size_t nmemb, + void *userdata) +{ + blob_stream_ctx_t *ctx = userdata; + size_t n = size * nmemb; + if (ctx->overflow || ctx->write_failed || ctx->range_rejected) + return 0; + /* First chunk on a resumed transfer: if the server replied with + * anything other than 206 Partial Content, the Range header was + * ignored or rejected. Surface the restart signal here rather than + * letting the size cap trip on the full-body retransmission. + */ + if (ctx->resume_offset > 0 && ctx->bytes_seen == ctx->resume_offset && + ctx->easy) { + long status = 0; + curl_easy_getinfo(ctx->easy, CURLINFO_RESPONSE_CODE, &status); + if (status != 206) { + ctx->range_rejected = true; + return 0; + } + } + int64_t projected = ctx->bytes_seen + (int64_t) n; + if (projected > ctx->bytes_expected) { + ctx->overflow = true; + return 0; + } + if (!oci_blob_writer_write(ctx->w, ptr, n)) { + ctx->write_failed = true; + return 0; + } + ctx->bytes_seen = projected; + return n; +} + +/* Per-handle state for a batch transfer. The handle owns its easy handle, + * staging writer, URL string, request-header slist, and any captured bearer + * challenge / response headers. batch_handle_free is safe to call on a + * zero-initialised slot, and safe to call multiple times. 
 */
typedef enum {
    BH_ACTIVE,        /* enqueueable: not yet completed this round */
    BH_NEEDS_RETRY,   /* first round hit 401 + Bearer challenge */
    BH_NEEDS_RESTART, /* server ignored Range or replied 416; refetch fresh */
    BH_DONE_OK,       /* transfer completed; writer holds verified bytes */
    BH_FAILED,        /* transport / status / size error; err_msg populated */
} batch_state_t;

typedef struct {
    /* Descriptor of the blob this slot fetches; borrowed from the caller's
     * descs array.
     */
    const oci_descriptor_t *desc;
    /* Staging writer; NULL once committed or aborted. */
    oci_blob_writer_t *w;
    /* Heap-allocated blob URL from build_blob_url. */
    char *url;
    /* Easy handle created by batch_prepare_handle for this slot. */
    CURL *easy;
    /* Body-callback context, wired as CURLOPT_WRITEDATA. */
    blob_stream_ctx_t bctx;
    /* Bearer challenge parsed off the response when capture is enabled. */
    bearer_challenge_t challenge;
    /* Header-callback context, wired as CURLOPT_HEADERDATA. */
    headers_ctx_t hctx;
    /* Request headers from build_request_headers; freed with the slot. */
    struct curl_slist *hdrs;
    /* HTTP status and curl result recorded by batch_score_done. */
    long http_status;
    CURLcode last_curl_rc;
    batch_state_t state;
    /* True while the easy handle is attached to the multi handle. */
    bool added;
    /* Bytes already present on disk from a prior interrupted fetch. Zero on
     * a fresh start; positive when oci_blob_writer_resume_named picked up a
     * partial. Drives the per-handle Range header and the score-side detection
     * of a server that ignored the Range request.
     */
    int64_t resume_offset;
    /* Per-blob progress callback. Borrowed from the batch entry's argument;
     * NULL when the caller did not request progress. The xferinfo wrapper
     * forwards into this callback with bytes_dl adjusted to total-blob
     * progress (libcurl's dlnow + resume_offset) so the renderer can pair
     * bytes_dl with desc->size as a true completion ratio.
     */
    oci_fetch_blob_batch_progress_cb_t progress_cb;
    void *progress_user;
    /* Static diagnostic string set when the slot fails; never freed. */
    const char *err_msg;
} batch_handle_t;

/* libcurl xferinfo wrapper. clientp is the owning batch_handle_t so the
 * callback can look up the descriptor and the resume offset without a
 * separate context struct. dltotal is ignored because resumed transfers
 * report dltotal == remaining bytes, not the full blob size -- desc->size
 * is the authoritative total. The return value propagates from the
 * caller's progress_cb so a future renderer can abort a transfer by
 * returning non-zero, matching libcurl's xferinfo contract.
+ */ +static int batch_xferinfo_cb(void *clientp, + curl_off_t dltotal, + curl_off_t dlnow, + curl_off_t ultotal, + curl_off_t ulnow) +{ + (void) dltotal; + (void) ultotal; + (void) ulnow; + batch_handle_t *h = clientp; + if (!h || !h->progress_cb) + return 0; + int64_t bytes_dl = (int64_t) dlnow + h->resume_offset; + return h->progress_cb(h->desc, bytes_dl, h->desc->size, h->progress_user); +} + +static int batch_max_concurrent(void) +{ + const char *e = getenv("OCI_FETCH_MAX_CONCURRENT"); + if (!e || !*e) + return 4; + long n = strtol(e, NULL, 10); + if (n < 1) + n = 1; + if (n > 16) + n = 16; + return (int) n; +} + +static void batch_handle_free(batch_handle_t *h) +{ + if (h->w) { + oci_blob_writer_abort(h->w); + h->w = NULL; + } + if (h->easy) { + curl_easy_cleanup(h->easy); + h->easy = NULL; + } + if (h->hdrs) { + curl_slist_free_all(h->hdrs); + h->hdrs = NULL; + } + free(h->url); + h->url = NULL; + bearer_challenge_free(&h->challenge); + free(h->hctx.content_type); + h->hctx.content_type = NULL; + free(h->hctx.docker_content_digest); + h->hctx.docker_content_digest = NULL; + free(h->hctx.etag); + h->hctx.etag = NULL; +} + +/* Configure an easy handle for a blob fetch. Used both at initial prepare + * time and (after a writer + slist reset) during the post-401 retry round. + * The challenge capture slot is wired only on round 0 since the existing + * single-blob path only attempts one refresh. + */ +static void batch_configure_easy(oci_fetcher_t *f, + const effective_opts_t *eff, + batch_handle_t *h, + bool capture_challenge) +{ + h->bctx.w = h->w; + h->bctx.easy = h->easy; + /* Seed bytes_seen with the partial bytes the writer already absorbed so + * the streaming overflow gate measures total-blob progress against + * desc->size, not just the bytes the server returned on this leg. 
+ */ + h->bctx.bytes_seen = h->resume_offset; + h->bctx.bytes_expected = h->desc->size; + h->bctx.resume_offset = h->resume_offset; + h->bctx.overflow = false; + h->bctx.write_failed = false; + h->bctx.range_rejected = false; + h->hctx.challenge_out = capture_challenge ? &h->challenge : NULL; + + apply_security_opts(h->easy, eff); + curl_easy_setopt(h->easy, CURLOPT_URL, h->url); + curl_easy_setopt(h->easy, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(h->easy, CURLOPT_MAXREDIRS, 5L); + curl_easy_setopt(h->easy, CURLOPT_USERAGENT, "elfuse-oci/1"); + curl_easy_setopt(h->easy, CURLOPT_WRITEFUNCTION, blob_stream_cb); + curl_easy_setopt(h->easy, CURLOPT_WRITEDATA, &h->bctx); + curl_easy_setopt(h->easy, CURLOPT_HEADERFUNCTION, header_cb); + curl_easy_setopt(h->easy, CURLOPT_HEADERDATA, &h->hctx); + if (h->resume_offset > 0) { + char range[64]; + snprintf(range, sizeof(range), "%lld-", (long long) h->resume_offset); + curl_easy_setopt(h->easy, CURLOPT_RANGE, range); + } + if (h->progress_cb) { + curl_easy_setopt(h->easy, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(h->easy, CURLOPT_XFERINFOFUNCTION, batch_xferinfo_cb); + curl_easy_setopt(h->easy, CURLOPT_XFERINFODATA, h); + } + h->hdrs = build_request_headers(f, NULL, NULL); + if (h->hdrs) + curl_easy_setopt(h->easy, CURLOPT_HTTPHEADER, h->hdrs); +} + +static int batch_prepare_handle(oci_fetcher_t *f, + const effective_opts_t *eff, + const oci_ref_t *ref, + batch_handle_t *h, + oci_blob_store_t *store, + const char **err_msg) +{ + h->w = oci_blob_writer_resume_named(store, h->desc->algo, h->desc->hex, + h->desc->size, &h->resume_offset); + if (!h->w) { + if (err_msg) + *err_msg = "failed to start blob writer"; + return -1; + } + h->url = build_blob_url(f, ref, h->desc->digest_str); + if (!h->url) { + if (err_msg) + *err_msg = "out of memory"; + errno = ENOMEM; + return -1; + } + h->easy = curl_easy_init(); + if (!h->easy) { + if (err_msg) + *err_msg = "curl_easy_init failed"; + errno = EIO; + return -1; + } + h->state = 
BH_ACTIVE; + h->added = false; + h->http_status = 0; + h->last_curl_rc = CURLE_OK; + h->err_msg = NULL; + /* Capture the Bearer challenge on round 0 even with a cached token: a + * short-lived token can expire mid-pull, and the round-0 401 retry can + * only refresh it if the challenge was parsed off the response. + */ + batch_configure_easy(f, eff, h, true); + return 0; +} + +/* Re-arm a handle for a fresh transfer attempt: token-refresh retry after a + * 401 + Bearer challenge, or restart-from-zero after a server ignored the + * Range header (200 instead of 206) or replied 416. The original writer is + * aborted (its staging file gets unlinked) and a brand-new one starts at + * byte zero, with resume_offset reset so batch_configure_easy emits no + * Range header on the next attempt. The easy handle is reset and re-wired + * with the current bearer token. Challenge capture is disabled so any + * second-round 401 falls straight through to FAILED, and a second-round + * 200-after-Range cannot reoccur because resume_offset is now zero. + */ +static int batch_reset_handle_fresh(oci_fetcher_t *f, + const effective_opts_t *eff, + batch_handle_t *h, + oci_blob_store_t *store) +{ + oci_blob_writer_abort(h->w); + h->w = NULL; + if (h->hdrs) { + curl_slist_free_all(h->hdrs); + h->hdrs = NULL; + } + free(h->hctx.content_type); + h->hctx.content_type = NULL; + free(h->hctx.docker_content_digest); + h->hctx.docker_content_digest = NULL; + free(h->hctx.etag); + h->hctx.etag = NULL; + bearer_challenge_free(&h->challenge); + + h->w = oci_blob_writer_begin_named(store, h->desc->algo, h->desc->hex); + if (!h->w) + return -1; + h->resume_offset = 0; + curl_easy_reset(h->easy); + h->state = BH_ACTIVE; + h->added = false; + h->http_status = 0; + h->last_curl_rc = CURLE_OK; + h->err_msg = NULL; + batch_configure_easy(f, eff, h, false); + return 0; +} + +/* Score a completed CURLMSG_DONE entry. 
Translates a curl + HTTP status pair + * into a batch_state_t transition, mirroring the diagnostic strings the + * single-blob path historically produced so the test suite's err_msg + * assertions stay byte-identical. + */ +static void batch_score_done(batch_handle_t *h, + CURLcode crc, + long status, + int round) +{ + h->http_status = status; + h->last_curl_rc = crc; + /* The body callback flagged a non-206 response to a Range request. + * Surface the restart intent before any size / digest / curl-error + * diagnostics: the discarded bytes would otherwise look like an + * overflow, and a 416 with a small error body would look like a + * payload write failure. + */ + if (h->bctx.range_rejected) { + h->state = BH_NEEDS_RESTART; + return; + } + if (crc != CURLE_OK) { + if (h->bctx.overflow) { + h->err_msg = "blob exceeded declared size"; + errno = EPROTO; + } else if (h->bctx.write_failed) { + h->err_msg = "blob writer rejected payload"; + errno = EIO; + } else { + h->err_msg = curl_easy_strerror(crc); + errno = EIO; + } + h->state = BH_FAILED; + return; + } + if (status == 401 && h->challenge.realm && round == 0) { + h->state = BH_NEEDS_RETRY; + return; + } + /* A 416 with an empty body never reached blob_stream_cb, so the + * range_rejected flag above did not fire. Catch that path here. + * status == 200 with resume_offset > 0 also belongs to this restart + * arm, though in practice the body callback catches it first. 
+ */ + if (h->resume_offset > 0 && (status == 200 || status == 416)) { + h->state = BH_NEEDS_RESTART; + return; + } + if (status < 200 || status >= 300) { + h->err_msg = "blob fetch returned non-2xx status"; + errno = EPROTO; + h->state = BH_FAILED; + return; + } + if (h->bctx.bytes_seen != h->desc->size) { + h->err_msg = "blob size mismatch"; + errno = EPROTO; + h->state = BH_FAILED; + return; + } + h->state = BH_DONE_OK; + /* libcurl's xferinfo does not guarantee a final dlnow == dltotal tick, + * so the renderer would otherwise stall one update short of "done". + * One explicit invocation at the score boundary normalises the + * sequence the user-side callback sees, regardless of socket pacing. + */ + if (h->progress_cb) + (void) h->progress_cb(h->desc, h->desc->size, h->desc->size, + h->progress_user); +} + +int oci_fetch_blob_batch(oci_fetcher_t *f, + const oci_ref_t *ref, + const oci_descriptor_t *const *descs, + size_t n_descs, + oci_blob_store_t *store, + oci_fetch_blob_batch_progress_cb_t progress_cb, + void *cb_user_data, + const char **err_msg) +{ + if (!f || !ref || !descs || !store) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + if (n_descs == 0) + return 0; + + effective_opts_t eff; + if (resolve_effective(f, ref, &eff, err_msg) < 0) + return -1; + if (check_insecure_policy(&eff, ref, err_msg) < 0) { + effective_free(&eff); + return -1; + } + + /* Drop stale tmp partials that no surviving batch can resume from. A + * week is long enough to let an interrupted multi-day pull finish on + * the next attempt while still keeping the staging area bounded for + * caches that see frequent unique blobs. The blob-store guards the + * tmp/ namespace, so the wide blob-* prefix cannot touch unrelated + * files. 
+ */ + oci_blob_store_sweep_partials(store, 7L * 86400); + + int rc = -1; + int max_concurrent = batch_max_concurrent(); + bool any_failed = false; + CURLM *multi = NULL; + + batch_handle_t *handles = calloc(n_descs, sizeof(*handles)); + if (!handles) { + if (err_msg) + *err_msg = "out of memory"; + errno = ENOMEM; + goto cleanup; + } + + /* Dedup pass: drop blobs already in the store, collapse same-digest + * entries (the layers array can repeat a digest legitimately). nh is + * the number of handles that actually need a transfer. + */ + size_t nh = 0; + for (size_t i = 0; i < n_descs; i++) { + const oci_descriptor_t *d = descs[i]; + if (!d || d->size < 0) { + if (err_msg) + *err_msg = "descriptor size is negative"; + errno = EINVAL; + goto cleanup; + } + if (oci_blob_store_has(store, d->algo, d->hex)) + continue; + bool dup = false; + for (size_t j = 0; j < nh; j++) { + if (handles[j].desc->algo == d->algo && + strcmp(handles[j].desc->hex, d->hex) == 0) { + dup = true; + break; + } + } + if (dup) + continue; + handles[nh].desc = d; + handles[nh].progress_cb = progress_cb; + handles[nh].progress_user = cb_user_data; + nh++; + } + if (nh == 0) { + rc = 0; + goto cleanup; + } + + for (size_t i = 0; i < nh; i++) { + if (batch_prepare_handle(f, &eff, ref, &handles[i], store, err_msg) < 0) + goto cleanup; + } + + multi = curl_multi_init(); + if (!multi) { + if (err_msg) + *err_msg = "curl_multi_init failed"; + errno = EIO; + goto cleanup; + } + + int round = 0; + /* Outer loop: each iteration tops up the multi up to max_concurrent + * ACTIVE handles and drains them until still_running hits zero. When no + * ACTIVE remain the loop checks for NEEDS_RETRY (single token refresh + * per batch) and either restarts those handles or exits. 
+ */ + while (1) { + size_t added_count = 0; + for (size_t i = 0; i < nh; i++) { + if (handles[i].added) + added_count++; + } + for (size_t i = 0; i < nh && added_count < (size_t) max_concurrent; + i++) { + if (handles[i].state == BH_ACTIVE && !handles[i].added) { + if (curl_multi_add_handle(multi, handles[i].easy) == CURLM_OK) { + handles[i].added = true; + added_count++; + } + } + } + if (added_count == 0) + break; + + int still_running = 0; + do { + int num_fds = 0; + CURLMcode mrc = curl_multi_poll(multi, NULL, 0, 1000, &num_fds); + if (mrc != CURLM_OK) { + if (err_msg) + *err_msg = curl_multi_strerror(mrc); + errno = EIO; + any_failed = true; + goto drained; + } + curl_multi_perform(multi, &still_running); + CURLMsg *msg; + int n_msgs = 0; + while ((msg = curl_multi_info_read(multi, &n_msgs)) != NULL) { + if (msg->msg != CURLMSG_DONE) + continue; + batch_handle_t *h = NULL; + for (size_t i = 0; i < nh; i++) { + if (handles[i].easy == msg->easy_handle) { + h = &handles[i]; + break; + } + } + if (!h) + continue; + CURLcode crc = msg->data.result; + long status = 0; + curl_easy_getinfo(h->easy, CURLINFO_RESPONSE_CODE, &status); + curl_multi_remove_handle(multi, h->easy); + h->added = false; + batch_score_done(h, crc, status, round); + } + } while (still_running > 0); + drained:; + /* If there are still ACTIVE slots not yet enqueued, fall back into + * the outer loop to add them; otherwise check for retries. + */ + bool more_active = false; + for (size_t i = 0; i < nh; i++) + if (handles[i].state == BH_ACTIVE) { + more_active = true; + break; + } + if (more_active) + continue; + + bool any_retry = false; + bool any_restart = false; + for (size_t i = 0; i < nh; i++) { + if (handles[i].state == BH_NEEDS_RETRY) + any_retry = true; + else if (handles[i].state == BH_NEEDS_RESTART) + any_restart = true; + } + if (!any_retry && !any_restart) + break; + if (any_failed) + break; + + if (any_retry) { + /* Single token refresh per batch. 
Steal one retry handle's + * challenge onto f->challenge so fetch_token sees the + * realm/service/scope, then re-arm every NEEDS_RETRY handle + * with the new bearer. + */ + for (size_t i = 0; i < nh; i++) { + if (handles[i].state == BH_NEEDS_RETRY) { + bearer_challenge_free(&f->challenge); + f->challenge = handles[i].challenge; + memset(&handles[i].challenge, 0, + sizeof(handles[i].challenge)); + break; + } + } + if (fetch_token(f, &eff, err_msg) < 0) { + for (size_t i = 0; i < nh; i++) { + if (handles[i].state == BH_NEEDS_RETRY) { + handles[i].state = BH_FAILED; + handles[i].err_msg = "token refresh failed"; + } + } + any_failed = true; + break; + } + round++; + for (size_t i = 0; i < nh; i++) { + if (handles[i].state == BH_NEEDS_RETRY) { + if (batch_reset_handle_fresh(f, &eff, &handles[i], store) < + 0) { + handles[i].state = BH_FAILED; + handles[i].err_msg = "failed to reset writer for retry"; + any_failed = true; + } + } + } + if (any_failed) + break; + } + + if (any_restart) { + /* Range-resume retry: no token refresh, just a fresh writer + * with resume_offset cleared so the next attempt fetches the + * full blob without a Range header. The reset itself zeroes + * resume_offset, so a second 200-after-Range / 416 cannot + * pick the restart branch again -- the handle either + * succeeds or falls into BH_FAILED on the next score. 
+ */ + for (size_t i = 0; i < nh; i++) { + if (handles[i].state == BH_NEEDS_RESTART) { + if (batch_reset_handle_fresh(f, &eff, &handles[i], store) < + 0) { + handles[i].state = BH_FAILED; + handles[i].err_msg = + "failed to reset writer for restart"; + any_failed = true; + } + } + } + if (any_failed) + break; + } + } + + for (size_t i = 0; i < nh; i++) { + if (handles[i].state == BH_FAILED) { + any_failed = true; + if (err_msg && !*err_msg && handles[i].err_msg) + *err_msg = handles[i].err_msg; + } else if (handles[i].state == BH_ACTIVE || + handles[i].state == BH_NEEDS_RETRY || + handles[i].state == BH_NEEDS_RESTART) { + /* Should be unreachable: the loop only exits when nothing is + * still queued. Defensive: treat as failure rather than + * silently dropping the slot. + */ + any_failed = true; + handles[i].state = BH_FAILED; + if (err_msg && !*err_msg) + *err_msg = "batch left a handle in a non-terminal state"; + } + } + if (any_failed) { + if (err_msg && !*err_msg) + *err_msg = "batch blob fetch failed"; + goto cleanup; + } + + /* Commit only after every transfer succeeded. Commit consumes the writer + * (frees on success), so clear h->w to suppress the batch_handle_free + * abort. A digest mismatch here aborts any remaining unflushed writers + * and surfaces the historical "blob digest mismatch on commit" string. 
+ */ + for (size_t i = 0; i < nh; i++) { + if (handles[i].state != BH_DONE_OK) + continue; + if (oci_blob_writer_commit(handles[i].w) < 0) { + handles[i].w = NULL; + if (err_msg) + *err_msg = "blob digest mismatch on commit"; + for (size_t j = i + 1; j < nh; j++) { + if (handles[j].state == BH_DONE_OK && handles[j].w) { + oci_blob_writer_abort(handles[j].w); + handles[j].w = NULL; + } + } + goto cleanup; + } + handles[i].w = NULL; + } + rc = 0; + +cleanup: + if (multi) + curl_multi_cleanup(multi); + if (handles) { + for (size_t i = 0; i < n_descs; i++) + batch_handle_free(&handles[i]); + free(handles); + } + effective_free(&eff); + return rc; +} + +int oci_fetch_blob(oci_fetcher_t *f, + const oci_ref_t *ref, + const oci_descriptor_t *desc, + oci_blob_store_t *store, + const char **err_msg) +{ + if (!desc) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + return oci_fetch_blob_batch(f, ref, &desc, 1, store, NULL, NULL, err_msg); +} diff --git a/src/oci/fetch.h b/src/oci/fetch.h new file mode 100644 index 0000000..c91009f --- /dev/null +++ b/src/oci/fetch.h @@ -0,0 +1,228 @@ +/* OCI registry HTTPS client + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Wraps libcurl for the subset of the OCI distribution-spec that elfuse needs + * to pull an image: + * + * - Anonymous GET against /v2//manifests/ and + * /v2//blobs/ + * - 401 + Www-Authenticate: Bearer challenge: fetch a token from the realm + * advertised by the registry, then retry the original request with + * Authorization: Bearer + * - Blob streaming: pipe the response body into the slice-2 blob store with + * digest and declared-size verification, so a hostile or truncated layer + * never produces a visible-complete blob + * + * Future slices extend the options struct with basic auth credentials, + * custom CA bundle, and a loopback-gated TLS verify-off path + * (oci-roadmap.md Q7 ship list). The public entry points stay stable. 
+ * + * Thread safety: oci_fetch_global_init must run once before any fetcher is + * created. Each oci_fetcher_t holds its own libcurl easy handle and is not + * safe to share across threads; create one per worker. + */ + +#pragma once + +#include +#include + +#include "blob-store.h" +#include "manifest.h" +#include "ref.h" + +typedef struct oci_policy oci_policy_t; + +typedef struct { + /* Optional override of the registry base URL. When non-NULL, the fetcher + * uses this prefix for every /v2/... request instead of computing one + * from ref->registry. Test scaffolding sets this to a local mock + * (https://127.0.0.1:); production callers leave it NULL. + */ + const char *base_url_override; + + /* HTTP Basic authentication. When username is non-NULL, libcurl produces + * Authorization: Basic on every request the fetcher + * issues, including the token endpoint when the registry also requires a + * Bearer flow. password may be NULL for an empty secret. CLI-supplied + * credentials override anything a policy auth_file points at for the + * same registry. + */ + const char *username; + const char *password; + + /* Path to a PEM-encoded CA bundle. When non-NULL the fetcher passes it to + * libcurl as CURLOPT_CAINFO, replacing the system trust store for that + * connection. Effective only with an OpenSSL-style SSL backend (the + * default macOS Secure Transport backend ignores CAINFO). CLI-supplied + * ca_file overrides any policy ca_bundle for the same registry. + */ + const char *ca_file; + + /* Disable TLS verification. Honored only when the resolved registry host + * is on the loopback whitelist (127.0.0.1, localhost, ::1). Any other + * host with allow_insecure=true causes oci_fetch_manifest / + * oci_fetch_blob to fail with errno=EPERM before a single byte is sent. 
+ * A policy insecure=true for the resolved host has the same effect and + * goes through the same loopback gate; CLI allow_insecure=true is an + * override that wins when the policy declares insecure=false. + */ + bool allow_insecure; + + /* Optional reference to a loaded oci_policy_t. When non-NULL the fetcher + * consults the policy on every manifest/blob request using ref->registry + * as the lookup key and merges the per-host effective view with the CLI + * options above (CLI wins). Lifetime is caller-owned; the policy must + * outlive the fetcher. + */ + const oci_policy_t *policy; +} oci_fetcher_options_t; + +typedef struct oci_fetcher oci_fetcher_t; + +/* Per-process libcurl global init. Safe to call multiple times; only the + * first call performs work. Returns 0 on success or -1 with errno=EIO if + * libcurl rejects the initialization. + */ +int oci_fetch_global_init(void); + +/* Counterpart of oci_fetch_global_init. The caller may invoke it on shutdown + * but elfuse runs short enough that leaving libcurl initialized until process + * exit is acceptable. + */ +void oci_fetch_global_cleanup(void); + +/* Allocate a fetcher. opts may be NULL for defaults. Returns NULL on + * allocation failure with errno preserved. + */ +oci_fetcher_t *oci_fetcher_new(const oci_fetcher_options_t *opts); + +/* Release the fetcher. Safe on NULL. */ +void oci_fetcher_free(oci_fetcher_t *f); + +typedef struct { + /* Heap-allocated response body. NUL-terminated so callers can pass it + * directly to JSON parsers that expect a C string, while body_len is the + * authoritative byte count. + */ + char *body; + size_t body_len; + /* Content-Type header value with parameters stripped (everything before + * the first ';'). NULL if the server omitted the header. + */ + char *content_type; + /* Docker-Content-Digest header value verbatim, e.g. "sha256:abc...". + * NULL if the server omitted it. Useful for tag-to-digest pinning. 
+ */ + char *docker_content_digest; + /* ETag header verbatim, including any surrounding quotes or weak prefix + * (e.g. "sha256:abc..." or W/"..."). NULL if the server omitted it. + * Captured so conditional-GET callers can echo it back without parsing. + */ + char *etag; + long http_status; +} oci_fetch_response_t; + +/* Release any heap fields. Safe on a zero-initialised struct. */ +void oci_fetch_response_free(oci_fetch_response_t *r); + +/* Fetch a manifest, image index, or image config blob by reference. + * + * ref registry/repository, plus optional default tag/digest + * digest_or_tag the actual GET selector ("sha256:..." or a tag string). + * NULL means: use ref->digest if set, otherwise ref->tag. + * accept_types NULL-terminated list of media types to advertise in the + * Accept header. Pass NULL to suppress the Accept header. + * if_none_match optional If-None-Match value sent verbatim. Pass the + * registry-style strong quoted form ("sha256:...") to ask + * the registry for 304 Not Modified when the upstream + * manifest still hashes to the pinned digest. NULL skips + * the conditional header entirely. + * + * On success returns 0 and fills *out (caller frees via + * oci_fetch_response_free). A 304 response is success: out->http_status is + * 304, out->body is NULL, out->body_len is 0, and out->etag may still be + * populated. On HTTP error (other non-2xx) returns -1 with out->http_status + * populated and errno=EPROTO; the body may still be present for + * diagnostics. On transport / auth failure returns -1 with errno preserved + * and *err_msg (when non-NULL) pointing at a static description. + */ +int oci_fetch_manifest(oci_fetcher_t *f, + const oci_ref_t *ref, + const char *digest_or_tag, + const char *const *accept_types, + const char *if_none_match, + oci_fetch_response_t *out, + const char **err_msg); + +/* Fetch a blob into the local store. 
The descriptor's algo, hex, and size + * fields drive verification: incoming bytes feed an oci_blob_writer keyed by + * the digest, the running byte count is capped at desc->size so a hostile + * server cannot stream forever, and the writer's own digest check at commit + * rejects any payload that hashes to anything other than desc->hex. + * + * Returns 0 on success, -1 with errno set on failure. err_msg points at a + * static description for the common diagnostic modes (digest mismatch, + * size mismatch, transport error, HTTP status). + * + * Already-present blobs are an immediate success (store-side has() check) + * with no network call. + * + * Implementation is a one-element forwarder onto oci_fetch_blob_batch; the + * separate entry point exists so callers that only need a single blob need + * not allocate a descriptor array. + */ +int oci_fetch_blob(oci_fetcher_t *f, + const oci_ref_t *ref, + const oci_descriptor_t *desc, + oci_blob_store_t *store, + const char **err_msg); + +/* Per-blob progress callback. Invoked (potentially repeatedly) by the batch + * fetcher as bytes accrue. bytes_dl is the number of bytes already streamed + * for desc; bytes_total mirrors desc->size. Return value is reserved (C5.3 + * may use a non-zero return to signal abort); C5.1 callers should return 0. + * + * Reserved typedef -- the callback parameter to oci_fetch_blob_batch is + * accepted but not yet invoked. C5.3 wires it through CURLOPT_XFERINFOFUNCTION + * for in-flight progress; the typedef is committed at C5.1 so the batch API + * surface does not move between Plan 5 commits. + */ +typedef int (*oci_fetch_blob_batch_progress_cb_t)(const oci_descriptor_t *desc, + int64_t bytes_dl, + int64_t bytes_total, + void *user_data); + +/* Fetch multiple blobs in parallel via libcurl's multi interface. + * + * descs / n_descs array of descriptor pointers to fetch. Already-present + * blobs (store-side has() hit) are skipped before any + * handle is allocated. 
Duplicate digests within the batch + * are collapsed to one transfer. + * progress_cb optional per-blob progress callback (C5.3); pass NULL + * cb_user_data opaque pointer forwarded to progress_cb + * + * Concurrency cap reads OCI_FETCH_MAX_CONCURRENT (default 4, clamped to + * [1, 16]). The batch is atomic: any single-blob failure (network error, + * HTTP non-2xx after one auth retry, size or digest mismatch) aborts every + * in-flight writer and the function returns -1 with err_msg set. On success + * every blob is committed to the store before the function returns; commits + * happen sequentially after all transfers succeed. + * + * A single shared effective_opts_t is resolved at entry (policy + CLI merge + * for ref->registry). Token refresh is handled inline: when any first-round + * handle returns 401 with a Bearer challenge, the batch drains all + * in-flight handles, refreshes the fetcher's bearer token once, and restarts + * every 401 handle with the new bearer header. A second 401 fails the + * batch. + */ +int oci_fetch_blob_batch(oci_fetcher_t *f, + const oci_ref_t *ref, + const oci_descriptor_t *const *descs, + size_t n_descs, + oci_blob_store_t *store, + oci_fetch_blob_batch_progress_cb_t progress_cb, + void *cb_user_data, + const char **err_msg); diff --git a/src/oci/inspect.c b/src/oci/inspect.c new file mode 100644 index 0000000..f2a4391 --- /dev/null +++ b/src/oci/inspect.c @@ -0,0 +1,573 @@ +/* Offline manifest tree renderer for elfuse oci inspect + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Reads the blob the local pin points at, classifies it as an image index or + * image manifest, and prints a tree. No network, no fetcher. The manifest + * model from slice 3 enforces every digest is lowercase and every descriptor + * size is non-negative, so the renderer can trust its inputs once the parse + * returns 0. 
+ * + * Detection between index and manifest is structural: oci_index_parse refuses + * a body that has no "manifests" array, oci_manifest_parse refuses a body + * that has no "config" + "layers" pair. The two parsers therefore reject + * disjoint shapes, and trying one then the other is unambiguous. Image + * configs never reach this code path because pins point at manifest-shaped + * blobs (slice 5a stores the manifest body it received from the registry). + */ + +#include "inspect.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blob-store.h" +#include "dedup-metrics.h" +#include "digest.h" +#include "manifest.h" +#include "media-type.h" + +/* Upper bound on a manifest/index body. Real manifests are well under 1 MiB; + * a 64 MiB cap is generous and prevents a corrupted store from forcing a + * pathological malloc. + */ +#define INSPECT_BODY_MAX ((size_t) 64 * 1024 * 1024) + +/* Render a digest in two compact forms: + * + * - short_digest("sha256:abcdef0123456789...") + * -> "sha256:abcdef012345..." (first 19 chars + "...") + * + * Matches the slice 5a pull progress line so the two surfaces stay visually + * consistent. The caller-supplied buffer keeps the function reentrant; using + * one static buffer would clobber on the second %s in a single printf. + */ +static void short_digest(const char *full, char out[24]) +{ + if (!full) { + snprintf(out, 24, "(null)"); + return; + } + size_t len = strlen(full); + if (len <= 22) { + snprintf(out, 24, "%s", full); + return; + } + snprintf(out, 24, "%.19s...", full); +} + +/* Compose a "linux/arm64/v8" string from a parsed platform descriptor. The + * variant suffix is omitted when the variant field is empty so a platform + * with no variant prints as "linux/amd64" rather than "linux/amd64/". + */ +static void render_platform(const oci_platform_t *p, char out[64]) +{ + const char *os = p->os && *p->os ? 
p->os : "?"; + const char *arch = + p->architecture && *p->architecture ? p->architecture : "?"; + if (p->variant && *p->variant) { + snprintf(out, 64, "%s/%s/%s", os, arch, p->variant); + } else { + snprintf(out, 64, "%s/%s", os, arch); + } +} + +/* Open /blobs// and slurp the contents into a fresh + * heap buffer. NUL-terminates the buffer so the slice 3 parsers (which accept + * exact-length bytes) can also be fed as C strings if a caller wants. On + * miss returns -1 with errno=ENOENT; on read failure returns -1 with errno + * preserved or set to EIO. + */ +static int read_blob_file(oci_blob_store_t *blobs, + oci_digest_algo_t algo, + const char *hex, + char **out_body, + size_t *out_len) +{ + char path[4096]; + int n = oci_blob_store_path(blobs, algo, hex, path, sizeof(path)); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + int fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + errno = saved; + return -1; + } + if (st.st_size < 0 || (uintmax_t) st.st_size > INSPECT_BODY_MAX) { + close(fd); + errno = EFBIG; + return -1; + } + size_t want = (size_t) st.st_size; + char *buf = malloc(want + 1); + if (!buf) { + close(fd); + errno = ENOMEM; + return -1; + } + size_t off = 0; + while (off < want) { + ssize_t r = read(fd, buf + off, want - off); + if (r < 0) { + int saved = errno; + free(buf); + close(fd); + errno = saved; + return -1; + } + if (r == 0) + break; + off += (size_t) r; + } + close(fd); + if (off != want) { + free(buf); + errno = EIO; + return -1; + } + buf[want] = '\0'; + *out_body = buf; + *out_len = want; + return 0; +} + +/* Emit one entry of a JSON-style string array with backslash and double-quote + * escaping. 
Control characters pass through verbatim: image-config Entrypoint + * and Cmd entries are container argv strings, which in practice never carry + * raw control bytes, and a partial JSON escape table would mislead a reader + * who expects strict RFC 8259 conformance. Callers downstream of inspect + * (commit 2 onward) reparse the array via the cJSON-backed image-config + * loader, not by scanning the human-readable inspect output. + */ +static void print_quoted_token(FILE *out, const char *s) +{ + fputc('"', out); + for (const char *p = s; *p; p++) { + if (*p == '"' || *p == '\\') + fputc('\\', out); + fputc(*p, out); + } + fputc('"', out); +} + +/* Render a NULL-terminated string array as ["a", "b", "c"]. Empty array + * (arr[0] == NULL) renders as []. Caller has already pre-checked arr != NULL. + */ +static void print_json_string_array(FILE *out, char *const *arr) +{ + fputc('[', out); + for (size_t i = 0; arr[i] != NULL; i++) { + if (i > 0) + fputs(", ", out); + print_quoted_token(out, arr[i]); + } + fputc(']', out); +} + +/* Render the image-config runtime block (User, WorkingDir, Entrypoint, Cmd, + * Env). Absent fields (NULL pointer in the parsed model) skip the bullet + * entirely; empty arrays still print "[]" because that is the + * spec-defined "explicit empty" shape and silently hiding it would + * misrepresent the image. Label column width is 13 so values align with the + * preceding " config: " column from render_manifest. + * + * Multi-line Env: the first var sits on the "env:" line; remaining vars + * indent to the value column on continuation lines. Output stays grep-friendly + * (each VAR=value on its own line) without sacrificing the leading section + * header. 
+ */ +static void render_runtime(FILE *out, const oci_image_runtime_t *rt) +{ + fprintf(out, "runtime:\n"); + if (rt->user) + fprintf(out, " user: %s\n", rt->user); + if (rt->working_dir) + fprintf(out, " workingdir: %s\n", rt->working_dir); + if (rt->entrypoint) { + fprintf(out, " entrypoint: "); + print_json_string_array(out, rt->entrypoint); + fputc('\n', out); + } + if (rt->cmd) { + fprintf(out, " cmd: "); + print_json_string_array(out, rt->cmd); + fputc('\n', out); + } + if (rt->env) { + if (rt->env[0] == NULL) { + fprintf(out, " env: []\n"); + } else { + for (size_t i = 0; rt->env[i] != NULL; i++) { + fprintf(out, "%s%s\n", + i == 0 ? " env: " : " ", + rt->env[i]); + } + } + } +} + +/* Best-effort read+parse of the image-config blob referenced by a manifest's + * config descriptor; on success, emits the runtime block. Failure (blob + * missing, parse rejects the body) is silent: the surrounding inspect output + * already names the config digest in the layer table, so a reader can chase + * it via 'elfuse oci pull' or by inspecting the store directly. Inspect's + * primary contract is to render the manifest tree, not to fail when a + * pulled image is missing the auxiliary config blob. + */ +static void try_render_runtime(FILE *out, + oci_blob_store_t *blobs, + const oci_descriptor_t *config_desc) +{ + char *body = NULL; + size_t body_len = 0; + if (read_blob_file(blobs, config_desc->algo, config_desc->hex, &body, + &body_len) < 0) + return; + oci_image_config_t cfg = {0}; + if (oci_image_config_parse(body, body_len, &cfg, NULL) == 0) { + render_runtime(out, &cfg.config); + oci_image_config_free(&cfg); + } + free(body); +} + +/* Print the config + layer table for a parsed manifest. When manifest_digest + * is non-NULL, a "manifest: ()" header line goes + * first; the direct-manifest path passes NULL so it does not duplicate the + * already-printed pin line. 
After the layer table, attempt to render the + * image-config runtime block (User, Env, Entrypoint, Cmd, WorkingDir) when + * the config blob can be loaded; absent/unreadable config blobs leave the + * runtime section out, since the manifest tree itself is the primary signal. + */ +static void render_manifest(FILE *out, + oci_blob_store_t *blobs, + const oci_manifest_t *mf, + const char *manifest_digest) +{ + if (manifest_digest) { + const char *mt = oci_media_type_name(mf->media_type); + fprintf(out, "manifest: %s (%s)\n", manifest_digest, + mt ? mt : "unknown"); + } + char buf[24]; + short_digest(mf->config.digest_str, buf); + const char *config_mt = oci_media_type_name(mf->config.media_type); + fprintf(out, " config: %-22s %12" PRId64 "B %s\n", buf, mf->config.size, + config_mt ? config_mt : "unknown"); + fprintf(out, " layers:\n"); + for (size_t i = 0; i < mf->nlayers; i++) { + const oci_descriptor_t *l = &mf->layers[i]; + short_digest(l->digest_str, buf); + const char *lmt = oci_media_type_name(l->media_type); + fprintf(out, " [%zu] %-22s %12" PRId64 "B %s\n", i, buf, + l->size, lmt ? lmt : "unknown"); + } + try_render_runtime(out, blobs, &mf->config); +} + +/* Render the C3.4 "layer reuse:" section. Compares the target manifest's + * diff_id list and ChainID chain against every other image recorded in the + * store (pins plus, when volume_root is set, unpacked sysroots) and prints + * a two-line summary: + * + * layer reuse: + * raw cache: N/M layers shared with K other image(s)[, X on cache] + * stack cache: deepest shared prefix reaches layer P/M (sha256:...) + * + * Failure modes are intentionally soft: a missing or malformed image-config + * for the target prints "layer reuse: (image-config unavailable)" without + * disturbing the surrounding manifest tree output. 
An empty store (no other + * images to compare against) prints "(no other images to compare)" so the + * operator can tell "0 shared because nothing to share with" apart from + * "0 shared because nothing overlaps". + * + * Bytes formatting: values >= 1 MiB render as "~X.Y MiB on cache"; smaller + * non-zero values render in bytes; zero bytes are omitted (still print the + * layer count, just without a bytes clause) because a 0 B clause would imply + * the raw cache is populated when it isn't. + */ +static void render_layer_reuse(FILE *out, + oci_store_t *store, + const char *manifest_digest, + const char *volume_root) +{ + oci_dedup_metrics_t m = {0}; + const char *err = NULL; + if (oci_dedup_metrics_compute(store, manifest_digest, volume_root, &m, + &err) < 0) { + fprintf(out, "layer reuse: (image-config unavailable)\n"); + return; + } + if (m.compared_images == 0) { + fprintf(out, "layer reuse: (no other images to compare)\n"); + return; + } + fprintf(out, "layer reuse:\n"); + fprintf(out, " raw cache: %zu/%zu layers shared with %zu other image(s)", + m.shared_layers, m.total_layers, m.compared_images); + if (m.shared_bytes >= (uint64_t) 1024 * 1024) { + double mib = (double) m.shared_bytes / (1024.0 * 1024.0); + fprintf(out, ", ~%.1f MiB on cache", mib); + } else if (m.shared_bytes > 0) { + fprintf(out, ", %" PRIu64 " B on cache", m.shared_bytes); + } + fputc('\n', out); + if (m.deepest_shared_prefix > 0) { + char short_chain[24]; + short_digest(m.deepest_shared_chainid, short_chain); + fprintf(out, + " stack cache: deepest shared prefix reaches layer %zu/%zu" + " (%s)\n", + m.deepest_shared_prefix, m.total_layers, short_chain); + } else { + fprintf(out, " stack cache: no shared prefix\n"); + } +} + +/* Render the index entry table. Default mode prints only the picked + * linux/arm64 entry (with a "[arm64]" tag); --all-platforms prints every + * entry, tagging the picked one so users still see which one elfuse will + * resolve. 
+ */ +static void render_index_platforms(FILE *out, + const oci_index_t *idx, + const oci_index_entry_t *picked, + bool show_all) +{ + fprintf(out, "platforms:\n"); + for (size_t i = 0; i < idx->nentries; i++) { + const oci_index_entry_t *e = &idx->entries[i]; + bool is_picked = (e == picked); + if (!show_all && !is_picked) + continue; + char digest_buf[24]; + short_digest(e->desc.digest_str, digest_buf); + char platform_buf[64]; + render_platform(&e->platform, platform_buf); + const char *mt = oci_media_type_name(e->desc.media_type); + fprintf(out, " %-9s %-22s %-22s %12" PRId64 "B %s\n", + is_picked ? "[arm64]" : "", platform_buf, digest_buf, + e->desc.size, mt ? mt : "unknown"); + } + fprintf(out, "\n"); +} + +int oci_inspect(oci_store_t *store, + const oci_ref_t *ref, + const oci_inspect_options_t *opts, + const char **err_msg) +{ + if (!store || !ref || !ref->registry || !ref->repository) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + FILE *out = opts && opts->out ? opts->out : stdout; + bool show_all = opts && opts->show_all_platforms; + + /* 1. Resolve manifest digest from ref. */ + char *pinned = NULL; + bool from_pin = false; + if (ref->digest) { + pinned = strdup(ref->digest); + if (!pinned) { + errno = ENOMEM; + if (err_msg) + *err_msg = "out of memory"; + return -1; + } + } else if (ref->tag) { + const char *get_err = NULL; + int gr = oci_store_get_ref(store, ref, &pinned, &get_err); + if (gr < 0) { + if (errno == ENOENT) { + fprintf(out, + "pinned: (no local manifest; run 'elfuse oci " + "pull' first)\n"); + return 0; + } + if (err_msg) + *err_msg = get_err ? get_err : "failed to read pin"; + return -1; + } + from_pin = true; + } else { + /* The slice 1 ref parser defaults tag to "latest" when no digest is + * given, so this branch is structurally unreachable through the CLI. + * Guard it anyway so a hand-constructed ref does not segfault. 
+ */ + if (err_msg) + *err_msg = "ref has neither tag nor digest"; + errno = EINVAL; + return -1; + } + + /* 2. Print the pin line. The digest reference annotation tells the user + * this came from ref->digest rather than the local pin file. + */ + if (from_pin) { + fprintf(out, "pinned: %s\n", pinned); + } else { + fprintf(out, "pinned: %s (digest reference)\n", pinned); + } + + /* 3. Validate the digest and read the blob. */ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(pinned, &algo, hex)) { + if (err_msg) + *err_msg = "pinned digest is malformed"; + errno = EINVAL; + free(pinned); + return -1; + } + + char *body = NULL; + size_t body_len = 0; + if (read_blob_file(oci_store_blobs(store), algo, hex, &body, &body_len) < + 0) { + if (errno == ENOENT) { + fprintf(out, "error: manifest blob %s not found in local store\n", + pinned); + if (err_msg) + *err_msg = "manifest blob missing from local store"; + free(pinned); + errno = ENOENT; + return -1; + } + int saved = errno; + if (err_msg) + *err_msg = "failed to read manifest blob"; + free(pinned); + errno = saved; + return -1; + } + + /* 4. Classify: try index first, then manifest. The two parsers reject + * disjoint shapes (one requires "manifests", the other requires "config" + * + "layers"), so a successful parse is unambiguous. + */ + oci_index_t idx = {0}; + oci_manifest_t mf = {0}; + bool is_index = false; + bool is_manifest = false; + if (oci_index_parse(body, body_len, &idx, NULL) == 0) { + is_index = true; + } else if (oci_manifest_parse(body, body_len, &mf, NULL) == 0) { + is_manifest = true; + } else { + if (err_msg) + *err_msg = "manifest blob is neither a valid index nor manifest"; + errno = EPROTO; + free(body); + free(pinned); + return -1; + } + + /* 5. Render. */ + int rc = 0; + if (is_index) { + const char *imt = oci_media_type_name(idx.media_type); + fprintf(out, "type: image index (%s)\n\n", imt ? 
imt : "unknown"); + + const oci_index_entry_t *picked = oci_index_pick_linux_arm64(&idx); + render_index_platforms(out, &idx, picked, show_all); + + /* Default mode drills into the picked linux/arm64 sub-manifest. The + * --all-platforms request is "show me the cover", not "drill"; skip + * the sub-manifest read entirely. + */ + if (!show_all) { + if (!picked) { + fprintf(out, "error: index has no linux/arm64 entry\n"); + if (err_msg) + *err_msg = "index has no linux/arm64 entry"; + errno = ENOENT; + rc = -1; + } else { + char *sub_body = NULL; + size_t sub_len = 0; + if (read_blob_file(oci_store_blobs(store), picked->desc.algo, + picked->desc.hex, &sub_body, &sub_len) < 0) { + if (errno == ENOENT) { + fprintf(stderr, + "warning: linux/arm64 manifest blob %s not " + "in local store\n", + picked->desc.digest_str); + if (err_msg) + *err_msg = + "indexed manifest blob missing from local " + "store"; + errno = ENOENT; + rc = -1; + } else { + int saved = errno; + if (err_msg) + *err_msg = "failed to read sub-manifest blob"; + errno = saved; + rc = -1; + } + } else { + oci_manifest_t sub_mf = {0}; + if (oci_manifest_parse(sub_body, sub_len, &sub_mf, NULL) == + 0) { + render_manifest(out, oci_store_blobs(store), &sub_mf, + picked->desc.digest_str); + if (!opts || !opts->suppress_layer_reuse) { + render_layer_reuse(out, store, + picked->desc.digest_str, + opts ? opts->volume_root : NULL); + } + oci_manifest_free(&sub_mf); + } else { + fprintf(out, + "error: sub-manifest blob %s is malformed\n", + picked->desc.digest_str); + if (err_msg) + *err_msg = "sub-manifest is malformed"; + errno = EPROTO; + rc = -1; + } + free(sub_body); + } + } + } + } else if (is_manifest) { + const char *mmt = oci_media_type_name(mf.media_type); + fprintf(out, "type: image manifest (%s)\n\n", + mmt ? mmt : "unknown"); + render_manifest(out, oci_store_blobs(store), &mf, NULL); + if (!opts || !opts->suppress_layer_reuse) { + render_layer_reuse(out, store, pinned, + opts ? 
opts->volume_root : NULL); + } + } + + /* errno preserved across cleanup, like slice 5a oci_pull. */ + int saved_errno = errno; + oci_index_free(&idx); + oci_manifest_free(&mf); + free(body); + free(pinned); + if (rc != 0) + errno = saved_errno; + return rc; +} diff --git a/src/oci/inspect.h b/src/oci/inspect.h new file mode 100644 index 0000000..d282920 --- /dev/null +++ b/src/oci/inspect.h @@ -0,0 +1,74 @@ +/* Offline manifest tree renderer for elfuse oci inspect + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Reads the local store the slice 5a pull pipeline populated and prints the + * resolved manifest graph without touching the network. The function does not + * print the canonical reference header (registry / repository / tag / digest); + * that piece is owned by src/oci/cli.c so the slice-1 inspect smoke output + * stays exactly the same when the store has no record for a ref. + * + * Manifest digest resolution order: + * 1. ref->digest, when set (digest-pinned reference) + * 2. Pin descriptor in <store>/index.json whose ref.name annotation matches + * the canonical "<registry>/<repository>:<tag>" form + * 3. Neither: print "(no local manifest...)" and return 0 (informational) + * + * Render policy: + * - Classification is structural: index parse first, then manifest parse. + * A body that neither parser accepts aborts with EPROTO. + * - For an image index: prints a platform table. Default mode shows only + * the linux/arm64 entry and then drills into its sub-manifest to print + * the config descriptor and layer table. --all-platforms (opts-> + * show_all_platforms) lists every entry and skips the drill -- it is + * "what platforms does this image cover", not "what is inside the arm64 + * variant". + * - For an image manifest: prints config + layers directly. 
+ * + * Failure mode for partial stores: when the index loads but the linux/arm64 + * sub-manifest blob is missing from the store, the platform table is still + * printed (stdout), a warning lands on stderr, and the call returns -1 with + * errno=ENOENT. That preserves the informational view while letting scripts + * detect the inconsistency through the exit code. + */ + +#pragma once + +#include +#include + +#include "ref.h" +#include "store.h" + +typedef struct { + /* Destination for the rendered tree. NULL defaults to stdout. */ + FILE *out; + /* List every platform entry of an image index instead of only the picked + * linux/arm64 entry. In this mode oci_inspect does not drill into any + * sub-manifest and skips the layer reuse section. + */ + bool show_all_platforms; + /* Optional volume root for the unpacked-sysroot walk in the layer reuse + * section. NULL means pin-only dedup, matching the C1.2 GC walker + * convention. Pure information: dedup metrics never write to disk. + */ + const char *volume_root; + /* When true, suppress the "layer reuse:" section that is otherwise + * rendered after the manifest layer table. The default (false, which is + * also the NULL-opts case) renders the section; tests that only want to + * verify the renderer baseline without the dedup compute side-effects set + * this true to skip it. The CLI never sets this true. + */ + bool suppress_layer_reuse; +} oci_inspect_options_t; + +/* Render the manifest tree the store holds for ref. opts may be NULL for the + * defaults (out=stdout, show_all_platforms=false). Returns 0 on success or + * pin miss; -1 with errno preserved and *err_msg (when non-NULL) pointing at + * a static description on failure (malformed blob, blob missing, IO error). 
+ */ +int oci_inspect(oci_store_t *store, + const oci_ref_t *ref, + const oci_inspect_options_t *opts, + const char **err_msg); diff --git a/src/oci/layer-apply.c b/src/oci/layer-apply.c new file mode 100644 index 0000000..7fc66eb --- /dev/null +++ b/src/oci/layer-apply.c @@ -0,0 +1,616 @@ +/* OCI layer applier implementation + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "oci/layer-apply.h" + +#define LA_PATH_MAX 4096 +#define LA_CHUNK 65536 + +/* strchrnul is gated behind macOS 15.4 deployment target; emulate it + * inline so the applier builds against older SDKs without an + * availability check. + */ +static const char *la_strchrnul(const char *s, int c) +{ + const char *p = strchr(s, c); + return p ? p : s + strlen(s); +} + +static int set_err(const char **err, const char *msg, int err_no) +{ + if (err) + *err = msg; + errno = err_no; + return -1; +} + +static const char *basename_of(const char *p) +{ + const char *slash = strrchr(p, '/'); + return slash ? slash + 1 : p; +} + +/* Normalize the parent prefix of guest_path into parent_out (without + * trailing slash). parent_out may be the same buffer as guest_path's + * substring source, but for safety the caller passes a fresh buffer. 
+ */ +static void parent_of(const char *guest_path, char *parent_out, size_t cap) +{ + const char *slash = strrchr(guest_path, '/'); + if (!slash) { + parent_out[0] = '\0'; + return; + } + size_t n = (size_t) (slash - guest_path); + if (n >= cap) + n = cap - 1; + memcpy(parent_out, guest_path, n); + parent_out[n] = '\0'; +} + +int oci_path_join_safe(const char *root_dir, + const char *guest_path, + char *out, + size_t cap, + const char **err) +{ + if (!root_dir || !guest_path || !out || cap == 0) + return set_err(err, "path join: NULL argument", EINVAL); + if (guest_path[0] == '/') + return set_err(err, "path join: absolute guest path", EINVAL); + if (guest_path[0] == '\0' || strcmp(guest_path, ".") == 0) + return set_err(err, "path join: empty path", EINVAL); + + /* Walk segments and reject `..` outright. Mirrors the no-follow + * basename rule from src/syscall/path.h::path_translate_at. + */ + const char *p = guest_path; + while (*p) { + const char *next = la_strchrnul(p, '/'); + size_t seglen = (size_t) (next - p); + if (seglen == 2 && p[0] == '.' && p[1] == '.') + return set_err(err, "path join: segment is ..", EINVAL); + p = *next ? next + 1 : next; + } + + size_t rl = strlen(root_dir); + size_t gl = strlen(guest_path); + if (rl + 1 + gl + 1 > cap) + return set_err(err, "path join: assembled length overflow", + ENAMETOOLONG); + memcpy(out, root_dir, rl); + out[rl] = '/'; + memcpy(out + rl + 1, guest_path, gl + 1); + return 0; +} + +int oci_symlink_target_check(const char *link_dir, const char *target) +{ + if (!target) + return set_err(NULL, NULL, EINVAL); + + /* Compose the conceptual destination relative to the unpack root. + * Absolute targets treat '/' as the unpack root (the symlink is + * unpacked into the layer sysroot, so absolute means root-relative); + * relative targets start from link_dir. 
+ */ + char buf[LA_PATH_MAX]; + if (target[0] == '/') { + if (strlcpy(buf, target + 1, sizeof(buf)) >= sizeof(buf)) + return set_err(NULL, NULL, ENAMETOOLONG); + } else { + if (link_dir && link_dir[0]) { + if ((size_t) snprintf(buf, sizeof(buf), "%s/%s", link_dir, + target) >= sizeof(buf)) + return set_err(NULL, NULL, ENAMETOOLONG); + } else { + if (strlcpy(buf, target, sizeof(buf)) >= sizeof(buf)) + return set_err(NULL, NULL, ENAMETOOLONG); + } + } + + /* Track depth as we walk segments. `..` decrements; `.` and empty + * stay. Any drop below zero means a follower would step above the + * unpack root, which is rejected. + */ + int depth = 0; + char *save = NULL; + char *seg = strtok_r(buf, "/", &save); + while (seg) { + if (strcmp(seg, "..") == 0) { + if (--depth < 0) + return set_err(NULL, NULL, ELOOP); + } else if (strcmp(seg, ".") != 0 && seg[0] != '\0') { + depth++; + } + seg = strtok_r(NULL, "/", &save); + } + return 0; +} + +/* Recursive rm-rf rooted at path. Used by whiteout and opaque dir. */ +static int rm_recursive(const char *path) +{ + struct stat st; + if (lstat(path, &st) < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + if (!S_ISDIR(st.st_mode)) { + return unlink(path); + } + DIR *d = opendir(path); + if (!d) + return -1; + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[LA_PATH_MAX]; + if ((size_t) snprintf(child, sizeof(child), "%s/%s", path, + de->d_name) >= sizeof(child)) { + rc = -1; + errno = ENAMETOOLONG; + break; + } + if (rm_recursive(child) < 0) { + rc = -1; + break; + } + } + closedir(d); + if (rc == 0) + rc = rmdir(path); + return rc; +} + +/* mkdir -p for the directory containing 'host_path'. Mode 0755 for + * implicit parents; the entry's own mode is applied later when the + * tar provides it. 
+ */ +static int mkdir_parents(const char *host_path, const char **err) +{ + char buf[LA_PATH_MAX]; + if (strlcpy(buf, host_path, sizeof(buf)) >= sizeof(buf)) + return set_err(err, "mkdir parents: path overflow", ENAMETOOLONG); + char *slash = strrchr(buf, '/'); + if (!slash || slash == buf) + return 0; + *slash = '\0'; + /* Walk components and mkdir each. */ + for (char *p = buf + 1; *p; p++) { + if (*p != '/') + continue; + *p = '\0'; + if (mkdir(buf, 0755) < 0 && errno != EEXIST) + return set_err(err, "mkdir parents: mkdir failed", errno); + *p = '/'; + } + if (mkdir(buf, 0755) < 0 && errno != EEXIST) + return set_err(err, "mkdir parents: mkdir failed", errno); + return 0; +} + +static int copy_payload_to_fd(oci_tar_reader_t *r, + int fd, + uint64_t want, + const char **err) +{ + uint8_t buf[LA_CHUNK]; + while (want > 0) { + size_t got = 0; + size_t take = want > sizeof(buf) ? sizeof(buf) : (size_t) want; + const char *terr = NULL; + if (oci_tar_read_payload(r, buf, take, &got, &terr) < 0) + return set_err(err, terr ? terr : "tar payload read failed", EIO); + if (got == 0) + return set_err(err, "tar payload truncated", EIO); + const uint8_t *p = buf; + size_t remaining = got; + while (remaining > 0) { + ssize_t n = write(fd, p, remaining); + if (n < 0) { + if (errno == EINTR) + continue; + return set_err(err, "layer apply: file write failed", errno); + } + p += n; + remaining -= (size_t) n; + } + want -= got; + } + return 0; +} + +static int apply_regular(oci_tar_reader_t *r, + const oci_tar_entry_t *e, + const char *host_path, + oci_layer_apply_stats_t *stats, + const char **err) +{ + if (mkdir_parents(host_path, err) < 0) + return -1; + /* Remove any existing entry first so we never accidentally write + * through a symlink left by a lower layer. 
+ */ + if (unlink(host_path) < 0 && errno != ENOENT) { + if (errno == EISDIR && rm_recursive(host_path) < 0) + return set_err(err, "layer apply: cannot remove existing dir", + errno); + else if (errno != EISDIR) + return set_err(err, "layer apply: cannot remove existing entry", + errno); + } + int fd = open(host_path, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0644); + if (fd < 0) + return set_err(err, "layer apply: open file failed", errno); + if (copy_payload_to_fd(r, fd, e->size, err) < 0) { + close(fd); + unlink(host_path); + return -1; + } + /* fchmod to the requested mode bits (low 12). The host inode may + * silently drop bits the running user cannot set, but the sidecar + * carries the authoritative value regardless. + */ + if (fchmod(fd, (mode_t) (e->mode & 07777)) < 0 && errno != EPERM) + return set_err(err, "layer apply: fchmod failed", errno); + close(fd); + if (stats) + stats->files++; + return 0; +} + +static int apply_dir(const oci_tar_entry_t *e, + const char *host_path, + oci_layer_apply_stats_t *stats, + const char **err) +{ + if (mkdir_parents(host_path, err) < 0) + return -1; + if (mkdir(host_path, 0755) < 0 && errno != EEXIST) + return set_err(err, "layer apply: mkdir failed", errno); + /* Apply mode bits; reuse fchmod via opening the dir so trailing + * symlinks are not followed (open with O_NOFOLLOW + O_DIRECTORY). 
+ */ + int fd = open(host_path, O_RDONLY | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); + if (fd >= 0) { + (void) fchmod(fd, (mode_t) (e->mode & 07777)); + close(fd); + } + if (stats) + stats->dirs++; + return 0; +} + +static int apply_symlink(const oci_tar_entry_t *e, + const char *host_path, + const char *guest_path, + oci_layer_apply_stats_t *stats, + const char **err) +{ + if (!e->linkname || !e->linkname[0]) + return set_err(err, "layer apply: empty symlink target", EINVAL); + + char parent[LA_PATH_MAX]; + parent_of(guest_path, parent, sizeof(parent)); + if (oci_symlink_target_check(parent, e->linkname) < 0) + return set_err(err, "layer apply: symlink target escapes root", ELOOP); + + if (mkdir_parents(host_path, err) < 0) + return -1; + if (unlink(host_path) < 0 && errno != ENOENT) { + if (errno == EISDIR && rm_recursive(host_path) < 0) + return set_err(err, "layer apply: cannot remove existing dir", + errno); + else if (errno != EISDIR) + return set_err(err, "layer apply: cannot remove existing entry", + errno); + } + if (symlink(e->linkname, host_path) < 0) + return set_err(err, "layer apply: symlink failed", errno); + if (stats) + stats->symlinks++; + return 0; +} + +static int apply_hardlink(const oci_tar_entry_t *e, + const char *root_dir, + const char *host_path, + oci_layer_apply_stats_t *stats, + const char **err) +{ + if (!e->linkname || !e->linkname[0]) + return set_err(err, "layer apply: empty hardlink target", EINVAL); + + char target_host[LA_PATH_MAX]; + /* The hardlink target is an intra-archive guest path. Validate it + * the same way as a regular entry's path. + */ + if (oci_path_join_safe(root_dir, e->linkname, target_host, + sizeof(target_host), err) < 0) + return -1; + /* The target must exist already; OCI mandates entries in apply + * order, and a hardlink that names a missing file is a malformed + * archive (or a forward reference, which we explicitly reject). 
+ */ + struct stat st; + if (lstat(target_host, &st) < 0) + return set_err(err, "layer apply: hardlink target missing", ENOLINK); + + if (mkdir_parents(host_path, err) < 0) + return -1; + if (unlink(host_path) < 0 && errno != ENOENT) + return set_err(err, "layer apply: cannot remove existing entry", errno); + if (link(target_host, host_path) < 0) + return set_err(err, "layer apply: link failed", errno); + if (stats) + stats->hardlinks++; + return 0; +} + +static int apply_whiteout(const oci_tar_entry_t *e, + const char *root_dir, + oci_meta_table_t *meta, + oci_layer_apply_stats_t *stats, + const char **err) +{ + /* Path is "...dir/.wh."; the entry being whited out is + * "...dir/". + */ + const char *base = basename_of(e->path); + if (strncmp(base, ".wh.", 4) != 0 || base[4] == '\0') + return set_err(err, "layer apply: malformed whiteout entry", EINVAL); + + size_t parent_len = (size_t) (base - e->path); + char target[LA_PATH_MAX]; + if (parent_len + strlen(base + 4) + 1 > sizeof(target)) + return set_err(err, "layer apply: whiteout path overflow", + ENAMETOOLONG); + memcpy(target, e->path, parent_len); + snprintf(target + parent_len, sizeof(target) - parent_len, "%s", base + 4); + + char host_path[LA_PATH_MAX]; + if (oci_path_join_safe(root_dir, target, host_path, sizeof(host_path), + err) < 0) + return -1; + if (rm_recursive(host_path) < 0 && errno != ENOENT) + return set_err(err, "layer apply: whiteout removal failed", errno); + if (meta) + oci_meta_remove(meta, target); + if (stats) + stats->whiteouts++; + return 0; +} + +static int apply_opaque_whiteout(const oci_tar_entry_t *e, + const char *root_dir, + oci_meta_table_t *meta, + oci_layer_apply_stats_t *stats, + const char **err) +{ + /* Path is "...dir/.wh..wh..opq"; clear all CHILDREN of "...dir/" + * but leave the directory itself. 
+ */ + const char *base = basename_of(e->path); + if (strcmp(base, ".wh..wh..opq") != 0) + return set_err(err, "layer apply: malformed opaque marker", EINVAL); + size_t parent_len = (size_t) (base - e->path); + char dir[LA_PATH_MAX]; + if (parent_len == 0) { + /* Opaque at layer root: clear top-level lower-layer entries. */ + dir[0] = '\0'; + } else { + /* Drop the trailing slash before the marker. */ + size_t copy = parent_len - 1; + if (copy + 1 > sizeof(dir)) + return set_err(err, "layer apply: opaque path overflow", + ENAMETOOLONG); + memcpy(dir, e->path, copy); + dir[copy] = '\0'; + } + + char host_dir[LA_PATH_MAX]; + if (dir[0] == '\0') { + if ((size_t) snprintf(host_dir, sizeof(host_dir), "%s", root_dir) >= + sizeof(host_dir)) + return set_err(err, "layer apply: opaque host path overflow", + ENAMETOOLONG); + } else { + if (oci_path_join_safe(root_dir, dir, host_dir, sizeof(host_dir), err) < + 0) + return -1; + } + + DIR *d = opendir(host_dir); + if (!d) { + if (errno == ENOENT) { + if (stats) + stats->opaques++; + return 0; + } + return set_err(err, "layer apply: opaque opendir failed", errno); + } + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + if (strncmp(de->d_name, ".wh.", 4) == 0) + continue; /* let the marker entry itself stay for the iter */ + char child_host[LA_PATH_MAX]; + if ((size_t) snprintf(child_host, sizeof(child_host), "%s/%s", host_dir, + de->d_name) >= sizeof(child_host)) { + rc = -1; + errno = ENAMETOOLONG; + break; + } + if (rm_recursive(child_host) < 0) { + rc = -1; + break; + } + if (meta) { + char child_guest[LA_PATH_MAX]; + if (dir[0]) + snprintf(child_guest, sizeof(child_guest), "%s/%s", dir, + de->d_name); + else + snprintf(child_guest, sizeof(child_guest), "%s", de->d_name); + oci_meta_remove(meta, child_guest); + } + } + closedir(d); + if (rc < 0) + return set_err(err, "layer apply: opaque clear failed", errno); + if (stats) + 
stats->opaques++; + return 0; +} + +/* Whiteout-handling discipline shared by oci_layer_apply (overlay) and + * oci_layer_apply_raw_tar (Plan 3 C3.3 raw per-layer cache populate). + * Overlay mode interprets .wh. and .wh..wh..opq tar entries as + * delete / clear directives against root_dir; raw mode leaves them as + * regular 0-byte files at their tar path so the assembler can replay + * the whiteout intent against the running work_dir later. + */ +typedef enum { + APPLY_MODE_OVERLAY, + APPLY_MODE_RAW_TAR, +} apply_mode_t; + +static int layer_apply_impl(oci_tar_reader_t *r, + const char *root_dir, + apply_mode_t mode, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!r || !root_dir) + return set_err(err, "layer apply: NULL argument", EINVAL); + + for (;;) { + oci_tar_entry_t e; + const char *terr = NULL; + int rc = oci_tar_next(r, &e, &terr); + if (rc < 0) + return set_err(err, terr ? terr : "tar next failed", + errno ? errno : EIO); + if (rc == 0) + return 0; + + /* Strip a leading slash if present; OCI layer paths are + * relative, but some encoders ship them with a leading slash. + */ + const char *gp = e.path; + while (gp[0] == '/') + gp++; + + /* Root-directory tar entry: docker/buildkit emit "./" as the + * first entry of a layer; the DIR-type trailing-slash strip + * upstream collapses it to ".". The unpack root already + * exists by the time the assembler enters this loop, so the + * root entry has no work to drive. Skip empty paths the same + * way for archives that record a zero-length root name. + */ + if (gp[0] == '\0' || (gp[0] == '.' 
&& gp[1] == '\0')) + continue; + + if (mode == APPLY_MODE_OVERLAY) { + if (e.is_opaque_whiteout) { + oci_tar_entry_t e2 = e; + e2.path = gp; + if (apply_opaque_whiteout(&e2, root_dir, meta, stats, err) < 0) + return -1; + continue; + } + if (e.is_whiteout) { + oci_tar_entry_t e2 = e; + e2.path = gp; + if (apply_whiteout(&e2, root_dir, meta, stats, err) < 0) + return -1; + continue; + } + } + /* Raw-tar mode falls through: .wh. and .wh..wh..opq are + * typeflag '0' regular tar entries with zero payload, and the + * regular-file dispatch below writes them on disk as 0-byte + * files at their tar path. The assembler consumes the markers + * later. + */ + + if (e.type == OCI_TAR_UNSUPPORTED) + return set_err(err, "layer apply: unsupported entry type", ENOTSUP); + + char host_path[LA_PATH_MAX]; + if (oci_path_join_safe(root_dir, gp, host_path, sizeof(host_path), + err) < 0) + return -1; + + oci_tar_entry_t e2 = e; + e2.path = gp; + switch (e.type) { + case OCI_TAR_REG: + if (apply_regular(r, &e2, host_path, stats, err) < 0) + return -1; + break; + case OCI_TAR_DIR: + if (apply_dir(&e2, host_path, stats, err) < 0) + return -1; + break; + case OCI_TAR_SYMLINK: + if (apply_symlink(&e2, host_path, gp, stats, err) < 0) + return -1; + break; + case OCI_TAR_HARDLINK: + if (apply_hardlink(&e2, root_dir, host_path, stats, err) < 0) + return -1; + break; + case OCI_TAR_UNSUPPORTED: + /* Already handled above; here for switch completeness. 
*/ + return set_err(err, "layer apply: unsupported entry type", ENOTSUP); + } + + if (meta && e.type != OCI_TAR_HARDLINK) + (void) oci_meta_record(meta, gp, e.uid, e.gid, e.mode); + } +} + +int oci_layer_apply(oci_tar_reader_t *r, + const char *root_dir, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char **err) +{ + return layer_apply_impl(r, root_dir, APPLY_MODE_OVERLAY, stats, meta, err); +} + +int oci_layer_apply_raw_tar(oci_tar_reader_t *r, + const char *root_dir, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char **err) +{ + return layer_apply_impl(r, root_dir, APPLY_MODE_RAW_TAR, stats, meta, err); +} diff --git a/src/oci/layer-apply.h b/src/oci/layer-apply.h new file mode 100644 index 0000000..6ebcf1a --- /dev/null +++ b/src/oci/layer-apply.h @@ -0,0 +1,131 @@ +/* OCI layer applier: drive a tar stream into an unpack root + * + * Consumes oci_tar_entry_t records from oci_tar_reader_t (fed by the + * decompression dispatch) and applies them under root_dir. Walks + * entries in strict order so whiteouts and opaque markers interact + * with prior upper-layer state correctly. + * + * Path containment rules mirror src/syscall/path.h::path_translate_at + * from PR #33: reject `..` traversal in any path component, reject + * absolute components past the layer root, and reject symlinks whose + * resolved target would escape the unpack root. The unpack-time check + * uses oci_path_join_safe + oci_symlink_target_check so a malicious + * tar that smuggles a relative `..` through the prefix slot cannot + * touch host paths. + * + * Whiteouts: + * .wh. remove the upper-layer entry from this dir + * .wh..wh..opq clear the containing dir's lower-layer contents + * + * Mode bits and uid/gid go into the running oci_meta_table_t; the host + * inode is created with the running user's identity, and the sidecar + * lookup at runtime restores the Linux view. 
Block, char, fifo, and + * socket entries are rejected with ENOTSUP, matching the asymmetric + * subset in oci-roadmap.md Q3. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include "oci/layer-meta.h" +#include "oci/tar.h" + +typedef struct { + size_t files; + size_t dirs; + size_t symlinks; + size_t hardlinks; + size_t whiteouts; + size_t opaques; +} oci_layer_apply_stats_t; + +/* Apply every entry from r into root_dir. root_dir must already exist + * and be writable; the applier creates parent directories on demand. + * stats and meta may be NULL; passing both lets the caller drive a + * dry-run for fixture validation. + * + * Whiteout entries (.wh.) and opaque markers (.wh..wh..opq) are + * processed against root_dir: .wh. rm-rfs the named upper-layer + * entry, and .wh..wh..opq clears the marker's directory of lower-layer + * contents. This is the canonical overlay-merge behaviour used by the + * top-level oci_unpack assembly path. For populating a Plan 3 C3.3 raw + * per-layer cache where whiteout markers must remain on disk as files, + * use oci_layer_apply_raw_tar instead. + * + * Returns 0 on success, -1 on error with *err set and errno carrying + * one of: + * ENOTSUP - tar entry type rejected (block/char/fifo/socket) + * ELOOP - symlink target escapes the unpack root + * ENOLINK - hardlink target was not seen earlier in the same + * layer or as a previously-unpacked file under root + * ENAMETOOLONG - assembled host path overflows PATH_MAX + * EINVAL - tar entry path contains `..` or is absolute + * ENAMETOOLONG / EIO - tar reader / payload write surface + */ +int oci_layer_apply(oci_tar_reader_t *r, + const char *root_dir, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char **err); + +/* Plan 3 C3.3: raw per-layer extract for the cross-image dedup cache. + * + * Identical to oci_layer_apply except .wh. 
and .wh..wh..opq tar + * entries are NOT interpreted as delete / clear directives: they fall + * through to the regular-file dispatch and land on disk as 0-byte + * regular files at their tar path. The on-disk shape of root_dir then + * preserves the layer's whiteout intent for the later assembly pass + * (the assembler walks the raw directory, applies whiteouts against + * the running work_dir, then copies non-whiteout entries on top). + * + * Consequences vs overlay mode: + * - stats->whiteouts and stats->opaques stay at zero; the markers + * are counted in stats->files because they exist on disk as + * regular zero-length files + * - the per-layer oci_meta_table_t records the markers like any + * other regular file (uid/gid/mode tuples from the tar header) + * - root_dir is treated as a fresh extraction target: there is no + * pre-existing upper-layer state to delete or clear, so the + * whiteout branches would have been no-ops even if executed + * + * Returns 0 on success, -1 on error with the same errno surface as + * oci_layer_apply (the dispatch and path validation logic is shared). + */ +int oci_layer_apply_raw_tar(oci_tar_reader_t *r, + const char *root_dir, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char **err); + +/* Compose root_dir + '/' + guest_path into out. Rejects: + * - leading '/' in guest_path (absolute is not allowed for tar entries) + * - any path component equal to `..` (escape) + * - empty path or `.` (no-op write target) + * - assembled length >= cap + * Exposed for unit tests; the applier consumes it internally. + */ +int oci_path_join_safe(const char *root_dir, + const char *guest_path, + char *out, + size_t cap, + const char **err); + +/* Validate that a symlink at link_dir pointing to target stays under + * the unpack root if a follower started at link_dir. 
Pure string-level + * analysis: parses absolute vs relative target, normalizes + * `.`/`..`/empty segments, and asserts the running depth never drops + * below zero relative to the unpack root. + * + * link_dir is the symlink's containing directory expressed as a + * sysroot-relative path (without leading `/`), e.g. "etc/links" for a + * symlink at "etc/links/foo". An empty link_dir means the symlink + * lives directly under the unpack root. + * + * Returns 0 when the target is safe, -1 with errno=ELOOP otherwise. + */ +int oci_symlink_target_check(const char *link_dir, const char *target); diff --git a/src/oci/layer-meta.c b/src/oci/layer-meta.c new file mode 100644 index 0000000..e097239 --- /dev/null +++ b/src/oci/layer-meta.c @@ -0,0 +1,448 @@ +/* OCI sidecar metadata implementation + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "oci/layer-meta.h" + +#define OCI_META_FILE ".elfuse-meta.json" +#define OCI_META_VERSION 1 + +typedef struct { + char *path; + uint64_t uid; + uint64_t gid; + uint32_t mode; +} oci_meta_entry_t; + +struct oci_meta_table { + oci_meta_entry_t *entries; + size_t len; + size_t cap; +}; + +oci_meta_table_t *oci_meta_table_new(void) +{ + return calloc(1, sizeof(oci_meta_table_t)); +} + +static void entry_clear(oci_meta_entry_t *e) +{ + free(e->path); + e->path = NULL; +} + +void oci_meta_table_free(oci_meta_table_t *t) +{ + if (!t) + return; + for (size_t i = 0; i < t->len; i++) + entry_clear(&t->entries[i]); + free(t->entries); + free(t); +} + +static int grow_if_needed(oci_meta_table_t *t) +{ + if (t->len < t->cap) + return 0; + size_t nc = t->cap == 0 ? 
16 : t->cap * 2; + oci_meta_entry_t *grown = realloc(t->entries, nc * sizeof(*grown)); + if (!grown) + return -1; + t->entries = grown; + t->cap = nc; + return 0; +} + +static ssize_t find_index(const oci_meta_table_t *t, const char *path) +{ + /* Linear scan. OCI layers typically hold a few hundred to a few + * thousand entries; the wall cost is dwarfed by the IO each entry + * already triggers. If profiling later shows this is hot, swap in + * an open-addressing hash keyed by FNV-1a of the path string. + */ + for (size_t i = 0; i < t->len; i++) + if (strcmp(t->entries[i].path, path) == 0) + return (ssize_t) i; + return -1; +} + +int oci_meta_record(oci_meta_table_t *t, + const char *guest_path, + uint64_t uid, + uint64_t gid, + uint32_t mode) +{ + if (!t || !guest_path) { + errno = EINVAL; + return -1; + } + ssize_t idx = find_index(t, guest_path); + if (idx >= 0) { + t->entries[idx].uid = uid; + t->entries[idx].gid = gid; + t->entries[idx].mode = mode; + return 0; + } + if (grow_if_needed(t) < 0) { + errno = ENOMEM; + return -1; + } + char *dup = strdup(guest_path); + if (!dup) { + errno = ENOMEM; + return -1; + } + t->entries[t->len].path = dup; + t->entries[t->len].uid = uid; + t->entries[t->len].gid = gid; + t->entries[t->len].mode = mode; + t->len++; + return 0; +} + +void oci_meta_remove(oci_meta_table_t *t, const char *guest_path) +{ + if (!t || !guest_path) + return; + ssize_t idx = find_index(t, guest_path); + if (idx < 0) + return; + entry_clear(&t->entries[idx]); + /* Move last entry into the freed slot to keep the array compact. 
*/ + if ((size_t) idx != t->len - 1) + t->entries[idx] = t->entries[t->len - 1]; + t->len--; +} + +int oci_meta_lookup(const oci_meta_table_t *t, + const char *guest_path, + uint64_t *out_uid, + uint64_t *out_gid, + uint32_t *out_mode) +{ + if (!t || !guest_path) { + errno = EINVAL; + return -1; + } + ssize_t idx = find_index(t, guest_path); + if (idx < 0) { + errno = ENOENT; + return -1; + } + if (out_uid) + *out_uid = t->entries[idx].uid; + if (out_gid) + *out_gid = t->entries[idx].gid; + if (out_mode) + *out_mode = t->entries[idx].mode; + return 0; +} + +size_t oci_meta_count(const oci_meta_table_t *t) +{ + return t ? t->len : 0; +} + +static char *build_path(const char *root_dir, const char *name, bool tmp) +{ + size_t want = strlen(root_dir) + 1 + strlen(name) + (tmp ? 4 : 0) + 1; + char *p = malloc(want); + if (!p) + return NULL; + snprintf(p, want, "%s/%s%s", root_dir, name, tmp ? ".tmp" : ""); + return p; +} + +static bool valid_basename(const char *filename) +{ + if (!filename || !*filename) + return false; + for (const char *p = filename; *p; p++) + if (*p == '/') + return false; + return true; +} + +int oci_meta_write_named(const oci_meta_table_t *t, + const char *root_dir, + const char *filename, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!root_dir) { + *err = "meta write: NULL root"; + errno = EINVAL; + return -1; + } + if (!valid_basename(filename)) { + *err = "meta write: filename must be a non-empty basename"; + errno = EINVAL; + return -1; + } + + cJSON *root = cJSON_CreateObject(); + if (!root) { + *err = "meta write: cJSON_CreateObject failed"; + errno = ENOMEM; + return -1; + } + if (!cJSON_AddNumberToObject(root, "version", OCI_META_VERSION)) { + cJSON_Delete(root); + *err = "meta write: version add failed"; + errno = ENOMEM; + return -1; + } + cJSON *arr = cJSON_AddArrayToObject(root, "entries"); + if (!arr) { + cJSON_Delete(root); + *err = "meta write: entries add failed"; + 
errno = ENOMEM; + return -1; + } + for (size_t i = 0; t && i < t->len; i++) { + cJSON *e = cJSON_CreateObject(); + if (!e) { + cJSON_Delete(root); + *err = "meta write: entry object failed"; + errno = ENOMEM; + return -1; + } + cJSON_AddItemToArray(arr, e); + if (!cJSON_AddStringToObject(e, "p", t->entries[i].path) || + !cJSON_AddNumberToObject(e, "u", (double) t->entries[i].uid) || + !cJSON_AddNumberToObject(e, "g", (double) t->entries[i].gid) || + !cJSON_AddNumberToObject(e, "m", (double) t->entries[i].mode)) { + cJSON_Delete(root); + *err = "meta write: entry field failed"; + errno = ENOMEM; + return -1; + } + } + + char *json = cJSON_PrintUnformatted(root); + cJSON_Delete(root); + if (!json) { + *err = "meta write: cJSON_Print failed"; + errno = ENOMEM; + return -1; + } + size_t jlen = strlen(json); + + char *tmp_path = build_path(root_dir, filename, true); + char *final_path = build_path(root_dir, filename, false); + if (!tmp_path || !final_path) { + free(tmp_path); + free(final_path); + free(json); + *err = "meta write: path allocation failed"; + errno = ENOMEM; + return -1; + } + + int fd = open(tmp_path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (fd < 0) { + *err = "meta write: open tmp failed"; + goto fail_paths; + } + if (write(fd, json, jlen) != (ssize_t) jlen) { + *err = "meta write: write failed"; + close(fd); + unlink(tmp_path); + goto fail_paths; + } + if (fsync(fd) < 0) { + *err = "meta write: fsync failed"; + close(fd); + unlink(tmp_path); + goto fail_paths; + } + close(fd); + if (rename(tmp_path, final_path) < 0) { + *err = "meta write: rename failed"; + unlink(tmp_path); + goto fail_paths; + } + + free(tmp_path); + free(final_path); + free(json); + return 0; + +fail_paths: + free(tmp_path); + free(final_path); + free(json); + return -1; +} + +int oci_meta_read_named(const char *root_dir, + const char *filename, + oci_meta_table_t **out, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = 
NULL; + if (!root_dir || !out) { + *err = "meta read: NULL argument"; + errno = EINVAL; + return -1; + } + *out = NULL; + if (!valid_basename(filename)) { + *err = "meta read: filename must be a non-empty basename"; + errno = EINVAL; + return -1; + } + + char *path = build_path(root_dir, filename, false); + if (!path) { + *err = "meta read: path allocation failed"; + errno = ENOMEM; + return -1; + } + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + *err = "meta read: open failed"; + free(path); + return -1; /* errno already ENOENT or similar */ + } + free(path); + + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + *err = "meta read: fstat failed"; + return -1; + } + /* Cap at 64 MiB so a corrupt sidecar cannot drag the host into + * swap; real-world tables are under a few hundred KiB. + */ + if (st.st_size <= 0 || st.st_size > (off_t) (64 * 1024 * 1024)) { + close(fd); + *err = "meta read: file size out of bounds"; + errno = EINVAL; + return -1; + } + char *buf = malloc((size_t) st.st_size + 1); + if (!buf) { + close(fd); + *err = "meta read: buffer allocation failed"; + errno = ENOMEM; + return -1; + } + ssize_t got = read(fd, buf, (size_t) st.st_size); + close(fd); + if (got != st.st_size) { + free(buf); + *err = "meta read: short read"; + errno = EIO; + return -1; + } + buf[got] = '\0'; + + cJSON *root = cJSON_Parse(buf); + free(buf); + if (!root) { + *err = "meta read: malformed JSON"; + errno = EINVAL; + return -1; + } + cJSON *version = cJSON_GetObjectItemCaseSensitive(root, "version"); + if (!cJSON_IsNumber(version) || version->valueint != OCI_META_VERSION) { + cJSON_Delete(root); + *err = "meta read: unsupported version"; + errno = EINVAL; + return -1; + } + cJSON *arr = cJSON_GetObjectItemCaseSensitive(root, "entries"); + if (!cJSON_IsArray(arr)) { + cJSON_Delete(root); + *err = "meta read: missing entries array"; + errno = EINVAL; + return -1; + } + + oci_meta_table_t *table = oci_meta_table_new(); + if (!table) { + 
cJSON_Delete(root); + *err = "meta read: table allocation failed"; + errno = ENOMEM; + return -1; + } + + cJSON *e; + cJSON_ArrayForEach(e, arr) + { + cJSON *p = cJSON_GetObjectItemCaseSensitive(e, "p"); + cJSON *u = cJSON_GetObjectItemCaseSensitive(e, "u"); + cJSON *g = cJSON_GetObjectItemCaseSensitive(e, "g"); + cJSON *m = cJSON_GetObjectItemCaseSensitive(e, "m"); + if (!cJSON_IsString(p) || !cJSON_IsNumber(u) || !cJSON_IsNumber(g) || + !cJSON_IsNumber(m)) { + oci_meta_table_free(table); + cJSON_Delete(root); + *err = "meta read: entry shape invalid"; + errno = EINVAL; + return -1; + } + if (oci_meta_record(table, p->valuestring, (uint64_t) u->valuedouble, + (uint64_t) g->valuedouble, + (uint32_t) m->valuedouble) < 0) { + oci_meta_table_free(table); + cJSON_Delete(root); + *err = "meta read: record failed"; + return -1; + } + } + + cJSON_Delete(root); + *out = table; + return 0; +} + +int oci_meta_write(const oci_meta_table_t *t, + const char *root_dir, + const char **err) +{ + return oci_meta_write_named(t, root_dir, OCI_META_FILE, err); +} + +int oci_meta_read(const char *root_dir, + oci_meta_table_t **out, + const char **err) +{ + return oci_meta_read_named(root_dir, OCI_META_FILE, out, err); +} + +int oci_meta_merge(oci_meta_table_t *dst, const oci_meta_table_t *src) +{ + if (!dst || !src) { + errno = EINVAL; + return -1; + } + for (size_t i = 0; i < src->len; i++) { + const oci_meta_entry_t *e = &src->entries[i]; + if (oci_meta_record(dst, e->path, e->uid, e->gid, e->mode) < 0) + return -1; + } + return 0; +} diff --git a/src/oci/layer-meta.h b/src/oci/layer-meta.h new file mode 100644 index 0000000..2337288 --- /dev/null +++ b/src/oci/layer-meta.h @@ -0,0 +1,114 @@ +/* OCI sidecar metadata for unpacked layers + * + * elfuse unpacks layers as the invoking macOS user; it cannot chown to + * arbitrary uids/gids, and the host inode mode cannot always carry the + * full set of permission bits Linux expects. 
The sidecar records the + * authoritative uid/gid/mode per guest path so Phase 3's syscall layer + * can present the guest with the original Linux view. + * + * Serialization format: /.elfuse-meta.json with the shape + * { "version": 1, + * "entries": [ { "p": "/path", "u": NNN, "g": NNN, "m": NNN } ] } + * Mode bits are stored decimal (cJSON has no native octal). Setuid, + * setgid, and sticky bits are encoded in the bottom 12 bits along with + * the rwx triplets, per oci-roadmap.md Q3. + * + * xattrs are intentionally absent: Q3 commits Phase 2 to ignore-with- + * warning on xattr entries rather than fabricate a half-supported + * namespace mapping between Linux user/security/system xattrs and the + * macOS extended-attribute domain. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +typedef struct oci_meta_table oci_meta_table_t; + +/* Allocate an empty table. Returns NULL on OOM. */ +oci_meta_table_t *oci_meta_table_new(void); + +/* Release the table and all owned strings. Safe on NULL. */ +void oci_meta_table_free(oci_meta_table_t *t); + +/* Insert or update an entry. Idempotent: re-recording the same path + * overwrites the previous tuple. Returns 0 on success, -1 with errno + * set on allocation failure. + */ +int oci_meta_record(oci_meta_table_t *t, + const char *guest_path, + uint64_t uid, + uint64_t gid, + uint32_t mode); + +/* Remove a path from the table. No-op if the path is not recorded. + * Whiteouts and tar-overwrites both rely on this so the persisted + * sidecar does not accumulate stale tuples for files that no longer + * exist in the unpacked tree. + */ +void oci_meta_remove(oci_meta_table_t *t, const char *guest_path); + +/* Look up a path. Returns 0 with out-params filled, or -1 with + * errno=ENOENT if the path was never recorded. Any out param may be + * NULL to discard that field. 
+ */ +int oci_meta_lookup(const oci_meta_table_t *t, + const char *guest_path, + uint64_t *out_uid, + uint64_t *out_gid, + uint32_t *out_mode); + +/* Number of live entries. */ +size_t oci_meta_count(const oci_meta_table_t *t); + +/* Serialize the table to / via atomic rename. + * Returns 0 on success, -1 on failure with errno and *err set. Passing + * an empty table writes a valid file containing an empty entries array. + * + * filename must be a relative basename (no embedded '/'); EINVAL is + * returned otherwise. The Plan 3 C3.3c raw per-layer cache writes + * ".elfuse-meta.layer.json" so that the assembled stack snapshot can + * keep the default ".elfuse-meta.json" name for the cumulative table + * without collisions during clonefile-stacked assembly. + */ +int oci_meta_write_named(const oci_meta_table_t *t, + const char *root_dir, + const char *filename, + const char **err); + +/* Parse / and populate a fresh table. Caller takes + * ownership via *out (freed with oci_meta_table_free). Missing file + * returns -1 with errno=ENOENT; malformed JSON or version mismatch + * returns -1 with errno=EINVAL. filename constraints match + * oci_meta_write_named (relative basename, no embedded '/'). + */ +int oci_meta_read_named(const char *root_dir, + const char *filename, + oci_meta_table_t **out, + const char **err); + +/* Thin wrappers passing the default ".elfuse-meta.json" filename. Used + * by the cumulative-sidecar paths (stack snapshot, final unpack tree). + */ +int oci_meta_write(const oci_meta_table_t *t, + const char *root_dir, + const char **err); + +int oci_meta_read(const char *root_dir, + oci_meta_table_t **out, + const char **err); + +/* Copy every entry from src into dst via oci_meta_record. Existing dst + * entries with the same guest path are overwritten (record's idempotent + * upsert semantics). 
+ * Used by the Plan 3 C3.2 unpack layer cache hit path
+ * so a clonefile-restored layer's persisted sidecar repopulates the in-
+ * memory meta table that subsequent layer applies extend. Returns 0 on
+ * success or -1 with errno set (EINVAL on NULL inputs, ENOMEM on record
+ * allocation failure). Partial merges may leave dst with a subset of src
+ * already applied; the caller treats this as a fatal error.
+ */
+int oci_meta_merge(oci_meta_table_t *dst, const oci_meta_table_t *src);
diff --git a/src/oci/manifest.c b/src/oci/manifest.c
new file mode 100644
index 0000000..8d70849
--- /dev/null
+++ b/src/oci/manifest.c
@@ -0,0 +1,727 @@
+/* OCI image manifest, image index, and image config parsers
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "manifest.h"
+
+/* NOTE(review): the header names on the system includes below were lost
+ * in extraction. The set is inferred from what this file uses (errno,
+ * INT_MIN/INT_MAX, bool, INT64_C, calloc/free, strdup/memset/strcmp) and
+ * may contain one more header than the original -- confirm against the
+ * original.
+ */
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* NOTE(review): presumably the cJSON header located via pkg-config;
+ * confirm whether the project spells it cjson/cJSON.h or cJSON.h.
+ */
+#include <cjson/cJSON.h>
+
+/* Maximum representable size that can survive a double round-trip without
+ * silent precision loss. JSON numbers parse through double in cJSON, so any
+ * size beyond 2^53 - 1 would already be off by ones; sizes well below that
+ * cover every realistic OCI layer.
+ */
+#define SIZE_MAX_SAFE INT64_C(0x1fffffffffffff)
+
+/* Optional string helper. The JSON value may be absent (NULL item) or of
+ * string type. Returns 1 when the field is present and was copied (the copy
+ * may be the empty string), 0 when the field is absent (*out_dup is left
+ * untouched), and -1 on a type error or allocation failure. On success any
+ * previous *out_dup is freed before the new copy is stored. The
+ * caller owns the returned string.
+ */
+static int dup_optional_string(const cJSON *parent,
+                               const char *key,
+                               char **out_dup,
+                               const char **err_msg,
+                               const char *type_err)
+{
+    const cJSON *item = cJSON_GetObjectItemCaseSensitive(parent, key);
+    if (!item)
+        return 0;
+    if (!cJSON_IsString(item) || !item->valuestring) {
+        if (err_msg)
+            *err_msg = type_err;
+        return -1;
+    }
+    char *dup = strdup(item->valuestring);
+    if (!dup) {
+        if (err_msg)
+            *err_msg = "out of memory copying string field";
+        return -1;
+    }
+    /* Replace, rather than leak, whatever the caller had stored before. */
+    free(*out_dup);
+    *out_dup = dup;
+    return 1;
+}
+
+/* Required string helper. Copies the string stored under key into *out_dup,
+ * freeing any previous value. Returns 0 on success; returns -1 and writes
+ * missing_msg (key absent), type_msg (value is not a string) or a static
+ * out-of-memory message into *err_msg when err_msg is non-NULL.
+ */
+static int require_string(const cJSON *parent,
+                          const char *key,
+                          char **out_dup,
+                          const char **err_msg,
+                          const char *missing_msg,
+                          const char *type_msg)
+{
+    const cJSON *item = cJSON_GetObjectItemCaseSensitive(parent, key);
+    if (!item) {
+        if (err_msg)
+            *err_msg = missing_msg;
+        return -1;
+    }
+    if (!cJSON_IsString(item) || !item->valuestring) {
+        if (err_msg)
+            *err_msg = type_msg;
+        return -1;
+    }
+    char *dup = strdup(item->valuestring);
+    if (!dup) {
+        if (err_msg)
+            *err_msg = "out of memory copying required string";
+        return -1;
+    }
+    free(*out_dup);
+    *out_dup = dup;
+    return 0;
+}
+
+/* Convert a JSON string array into a NULL-terminated char** array.
+ *
+ * Returns 0 on success (any previous *out_array is freed and replaced),
+ * -1 on type error or allocation failure (*out_array is left untouched),
+ * and 1 when an optional field is absent or JSON null (*out_array is left
+ * untouched).
+ *
+ * JSON null is accepted for optional fields because Go-based producers
+ * (Docker, BuildKit) marshal a nil slice as null: image configs routinely
+ * carry "Entrypoint": null or "Cmd": null, and rejecting those would make
+ * ordinary images unparseable. A required field still rejects both a
+ * missing key and null with type_msg.
+ */
+static int dup_string_array(const cJSON *parent,
+                            const char *key,
+                            char ***out_array,
+                            const char **err_msg,
+                            const char *type_msg,
+                            bool required)
+{
+    const cJSON *item = cJSON_GetObjectItemCaseSensitive(parent, key);
+    /* cJSON_IsNull(NULL) is false, so the absent case is tested first. */
+    if (!required && (!item || cJSON_IsNull(item)))
+        return 1;
+    if (!item || !cJSON_IsArray(item)) {
+        if (err_msg)
+            *err_msg = type_msg;
+        return -1;
+    }
+    int n = cJSON_GetArraySize(item);
+    if (n < 0)
+        n = 0;
+    /* calloc zero-fills, so the terminator and every not-yet-copied slot
+     * are already NULL; the fail path can free all n slots blindly.
+     */
+    char **arr = calloc((size_t) n + 1, sizeof(*arr));
+    if (!arr) {
+        if (err_msg)
+            *err_msg = "out of memory allocating string array";
+        return -1;
+    }
+    for (int i = 0; i < n; i++) {
+        const cJSON *elem = cJSON_GetArrayItem(item, i);
+        if (!cJSON_IsString(elem) || !elem->valuestring) {
+            if (err_msg)
+                *err_msg = type_msg;
+            goto fail;
+        }
+        arr[i] = strdup(elem->valuestring);
+        if (!arr[i]) {
+            if (err_msg)
+                *err_msg = "out of memory copying string-array element";
+            goto fail;
+        }
+    }
+    arr[n] = NULL;
+    /* Free any prior value before publishing the new one. */
+    if (*out_array) {
+        for (char **p = *out_array; *p; p++)
+            free(*p);
+        free(*out_array);
+    }
+    *out_array = arr;
+    return 0;
+fail:
+    for (int i = 0; i < n; i++)
+        free(arr[i]);
+    free(arr);
+    return -1;
+}
+
+/* Parse a non-negative integer-valued JSON number. cJSON keeps numbers in
+ * a double so the practical upper bound is 2^53 - 1; OCI layer sizes are
+ * well below that.
+ */
+static int parse_size_field(const cJSON *parent,
+                            const char *key,
+                            int64_t *out,
+                            const char **err_msg)
+{
+    const cJSON *item = cJSON_GetObjectItemCaseSensitive(parent, key);
+    if (!item) {
+        if (err_msg)
+            *err_msg = "descriptor missing size field";
+        return -1;
+    }
+    if (!cJSON_IsNumber(item)) {
+        if (err_msg)
+            *err_msg = "descriptor size field is not a number";
+        return -1;
+    }
+    double v = item->valuedouble;
+    /* The negated >= also rejects NaN, which compares false to everything. */
+    if (!(v >= 0.0) || v > (double) SIZE_MAX_SAFE) {
+        if (err_msg)
+            *err_msg = "descriptor size out of representable range";
+        return -1;
+    }
+    /* Round-trip check: the JSON number must already be an integer. The
+     * double-to-int64 cast truncates; reject anything with a fractional part
+     * before truncation hides the divergence.
+     */
+    int64_t as_int = (int64_t) v;
+    if ((double) as_int != v) {
+        if (err_msg)
+            *err_msg = "descriptor size field is not an integer";
+        return -1;
+    }
+    *out = as_int;
+    return 0;
+}
+
+/* Parse one content descriptor object (mediaType, digest, size) into *out.
+ * *out is zeroed first. Returns 0 on success; on failure returns -1 with
+ * *out released and zeroed and a static message written to *err_msg.
+ */
+static int parse_descriptor(const cJSON *obj,
+                            oci_descriptor_t *out,
+                            const char **err_msg)
+{
+    memset(out, 0, sizeof(*out));
+
+    /* mediaType: optional per OCI image-spec (some legacy responses omit it
+     * on the implicit root), but every descriptor that lives inside another
+     * document does carry it. Treat it as required at parse time and let
+     * the caller relax it for the top-level document if needed.
+     */
+    char *raw_mt = NULL;
+    if (require_string(obj, "mediaType", &raw_mt, err_msg,
+                       "descriptor missing mediaType",
+                       "descriptor mediaType must be a string") < 0)
+        goto fail;
+    out->raw_media_type = raw_mt;
+    out->media_type = oci_media_type_parse(raw_mt);
+
+    if (require_string(obj, "digest", &out->digest_str, err_msg,
+                       "descriptor missing digest",
+                       "descriptor digest must be a string") < 0)
+        goto fail;
+    if (!oci_digest_parse(out->digest_str, &out->algo, out->hex)) {
+        if (err_msg)
+            *err_msg = "descriptor digest is malformed or not lowercase";
+        goto fail;
+    }
+
+    if (parse_size_field(obj, "size", &out->size, err_msg) < 0)
+        goto fail;
+    return 0;
+fail:
+    oci_descriptor_free(out);
+    return -1;
+}
+
+/* Parse a platform object into *out. architecture and os are required;
+ * variant and os.version default to "". A NULL or non-object obj is an
+ * error. Returns 0 on success; on failure returns -1 with *out released
+ * and zeroed.
+ */
+static int parse_platform(const cJSON *obj,
+                          oci_platform_t *out,
+                          const char **err_msg)
+{
+    memset(out, 0, sizeof(*out));
+    if (!obj || !cJSON_IsObject(obj)) {
+        if (err_msg)
+            *err_msg = "platform field missing or not an object";
+        return -1;
+    }
+    if (require_string(obj, "architecture", &out->architecture, err_msg,
+                       "platform missing architecture",
+                       "platform architecture must be a string") < 0)
+        goto fail;
+    if (require_string(obj, "os", &out->os, err_msg, "platform missing os",
+                       "platform os must be a string") < 0)
+        goto fail;
+
+    /* variant and os.version default to "" so callers can compare without
+     * NULL checks. dup_optional_string sets the field only when present.
+     */
+    if (dup_optional_string(obj, "variant", &out->variant, err_msg,
+                            "platform variant must be a string") < 0)
+        goto fail;
+    if (!out->variant) {
+        out->variant = strdup("");
+        if (!out->variant) {
+            if (err_msg)
+                *err_msg = "out of memory defaulting variant";
+            goto fail;
+        }
+    }
+    if (dup_optional_string(obj, "os.version", &out->os_version, err_msg,
+                            "platform os.version must be a string") < 0)
+        goto fail;
+    if (!out->os_version) {
+        out->os_version = strdup("");
+        if (!out->os_version) {
+            if (err_msg)
+                *err_msg = "out of memory defaulting os.version";
+            goto fail;
+        }
+    }
+    return 0;
+fail:
+    oci_platform_free(out);
+    return -1;
+}
+
+/* Parse an integer-valued JSON number that fits in int.
+ *
+ * Returns 0 on success with *out set, 1 when the key is absent and not
+ * required (*out untouched), and -1 on error: missing_msg when a required
+ * key is absent, type_msg when the value is not a number, is outside
+ * [INT_MIN, INT_MAX], is NaN, or carries a fractional part.
+ */
+static int parse_int_field(const cJSON *parent,
+                           const char *key,
+                           int *out,
+                           bool required,
+                           const char **err_msg,
+                           const char *missing_msg,
+                           const char *type_msg)
+{
+    const cJSON *item = cJSON_GetObjectItemCaseSensitive(parent, key);
+    if (!item) {
+        if (required) {
+            if (err_msg)
+                *err_msg = missing_msg;
+            return -1;
+        }
+        return 1;
+    }
+    if (!cJSON_IsNumber(item)) {
+        if (err_msg)
+            *err_msg = type_msg;
+        return -1;
+    }
+    /* Round-trip check: cJSON's valueint truncates the JSON number, so a
+     * fractional value like "schemaVersion": 2.7 would otherwise pass an
+     * == 2 comparison downstream. Reject anything that is out of int range
+     * or carries a fractional part before the truncation hides it.
+     *
+     * The range test is written as a negated conjunction, matching
+     * parse_size_field: NaN fails every comparison, so the
+     * "v < MIN || v > MAX" form would let it through to the cast below,
+     * where converting an unrepresentable double to int is undefined.
+     */
+    double v = item->valuedouble;
+    if (!(v >= (double) INT_MIN && v <= (double) INT_MAX)) {
+        if (err_msg)
+            *err_msg = type_msg;
+        return -1;
+    }
+    int as_int = (int) v;
+    if ((double) as_int != v) {
+        if (err_msg)
+            *err_msg = type_msg;
+        return -1;
+    }
+    *out = as_int;
+    return 0;
+}
+
+/* Convert a cJSON parse failure into our diagnostic message space. cJSON's
+ * cJSON_GetErrorPtr is process-global; the message we set is static and the
+ * caller never frees it.
+ */
+static void set_parse_err(const char **err_msg, const char *fallback)
+{
+    if (err_msg)
+        *err_msg = fallback;
+    errno = EINVAL;
+}
+
+/* errno value a failed parse reports. The field helpers only set errno
+ * when an allocation fails, so a type error would otherwise leave the
+ * caller looking at whatever errno held before the call. ENOMEM is kept;
+ * everything else is reported as EINVAL, the only other value the header
+ * documents. Call this before any cleanup, since free() is not guaranteed
+ * to leave errno alone on every platform.
+ */
+static int parse_fail_errno(void)
+{
+    return errno == ENOMEM ? ENOMEM : EINVAL;
+}
+
+/* Parse an image manifest: schemaVersion (must be 2), optional mediaType,
+ * the config descriptor, and the ordered layer descriptors. Every layer
+ * must carry a supported, non-foreign layer media type.
+ *
+ * Returns 0 with *out populated (release with oci_manifest_free). On
+ * failure returns -1 with *out released and zeroed, a static message in
+ * *err_msg, and errno set to ENOMEM or EINVAL.
+ */
+int oci_manifest_parse(const char *json,
+                       size_t len,
+                       oci_manifest_t *out,
+                       const char **err_msg)
+{
+    int saved_errno;
+
+    if (!json || !out) {
+        set_parse_err(err_msg, "oci_manifest_parse: NULL input");
+        return -1;
+    }
+    memset(out, 0, sizeof(*out));
+
+    cJSON *root = cJSON_ParseWithLength(json, len);
+    if (!root) {
+        set_parse_err(err_msg, "manifest JSON is malformed");
+        return -1;
+    }
+    if (!cJSON_IsObject(root)) {
+        set_parse_err(err_msg, "manifest JSON root is not an object");
+        goto fail;
+    }
+
+    if (parse_int_field(root, "schemaVersion", &out->schema_version, true,
+                        err_msg, "manifest missing schemaVersion",
+                        "manifest schemaVersion must be a number") < 0)
+        goto fail;
+    if (out->schema_version != 2) {
+        set_parse_err(err_msg, "manifest schemaVersion must be 2");
+        goto fail;
+    }
+
+    /* mediaType on the manifest itself is optional in some Docker responses
+     * (the Content-Type header is canonical there); record raw and parsed
+     * forms but do not reject on absence.
+     */
+    if (dup_optional_string(root, "mediaType", &out->raw_media_type, err_msg,
+                            "manifest mediaType must be a string") < 0)
+        goto fail;
+    out->media_type = out->raw_media_type
+                          ? oci_media_type_parse(out->raw_media_type)
+                          : OCI_MT_UNKNOWN;
+
+    const cJSON *cfg = cJSON_GetObjectItemCaseSensitive(root, "config");
+    if (!cfg || !cJSON_IsObject(cfg)) {
+        set_parse_err(err_msg, "manifest config descriptor missing");
+        goto fail;
+    }
+    if (parse_descriptor(cfg, &out->config, err_msg) < 0)
+        goto fail;
+    if (!oci_media_type_is_config(out->config.media_type)) {
+        set_parse_err(err_msg, "manifest config has non-config media type");
+        goto fail;
+    }
+
+    const cJSON *layers = cJSON_GetObjectItemCaseSensitive(root, "layers");
+    if (!layers || !cJSON_IsArray(layers)) {
+        set_parse_err(err_msg, "manifest layers array missing");
+        goto fail;
+    }
+    int nlayers = cJSON_GetArraySize(layers);
+    if (nlayers < 0)
+        nlayers = 0;
+    if (nlayers > 0) {
+        out->layers = calloc((size_t) nlayers, sizeof(*out->layers));
+        if (!out->layers) {
+            set_parse_err(err_msg, "out of memory allocating layer array");
+            errno = ENOMEM;
+            goto fail;
+        }
+    }
+    for (int i = 0; i < nlayers; i++) {
+        const cJSON *desc = cJSON_GetArrayItem(layers, i);
+        if (!cJSON_IsObject(desc)) {
+            set_parse_err(err_msg, "manifest layer entry is not an object");
+            goto fail;
+        }
+        if (parse_descriptor(desc, &out->layers[out->nlayers], err_msg) < 0)
+            goto fail;
+        oci_media_type_t lmt = out->layers[out->nlayers].media_type;
+        /* Count the slot now that parse_descriptor has populated (and
+         * possibly heap-allocated) it. The validation below can still
+         * goto fail; counting first lets oci_manifest_free reclaim it.
+         */
+        out->nlayers++;
+        if (!oci_media_type_is_layer(lmt)) {
+            set_parse_err(err_msg, "manifest layer has non-layer media type");
+            goto fail;
+        }
+        if (oci_media_type_is_foreign(lmt)) {
+            set_parse_err(err_msg,
+                          "manifest references foreign (nondistributable) "
+                          "layer; not supported");
+            goto fail;
+        }
+        if (!oci_media_type_is_layer_supported(lmt)) {
+            set_parse_err(err_msg,
+                          "manifest layer media type is not supported "
+                          "(only tar / tar+gzip / tar+zstd)");
+            goto fail;
+        }
+    }
+
+    cJSON_Delete(root);
+    return 0;
+fail:
+    /* Latch errno before the cleanup calls get a chance to disturb it. */
+    saved_errno = parse_fail_errno();
+    cJSON_Delete(root);
+    oci_manifest_free(out);
+    errno = saved_errno;
+    return -1;
+}
+
+/* Parse an image index: schemaVersion (must be 2), optional mediaType, and
+ * the manifests array. Each entry needs a descriptor and a platform object.
+ *
+ * Returns 0 with *out populated (release with oci_index_free). On failure
+ * returns -1 with *out released and zeroed, a static message in *err_msg,
+ * and errno set to ENOMEM or EINVAL.
+ */
+int oci_index_parse(const char *json,
+                    size_t len,
+                    oci_index_t *out,
+                    const char **err_msg)
+{
+    int saved_errno;
+
+    if (!json || !out) {
+        set_parse_err(err_msg, "oci_index_parse: NULL input");
+        return -1;
+    }
+    memset(out, 0, sizeof(*out));
+
+    cJSON *root = cJSON_ParseWithLength(json, len);
+    if (!root) {
+        set_parse_err(err_msg, "index JSON is malformed");
+        return -1;
+    }
+    if (!cJSON_IsObject(root)) {
+        set_parse_err(err_msg, "index JSON root is not an object");
+        goto fail;
+    }
+
+    if (parse_int_field(root, "schemaVersion", &out->schema_version, true,
+                        err_msg, "index missing schemaVersion",
+                        "index schemaVersion must be a number") < 0)
+        goto fail;
+    if (out->schema_version != 2) {
+        set_parse_err(err_msg, "index schemaVersion must be 2");
+        goto fail;
+    }
+
+    if (dup_optional_string(root, "mediaType", &out->raw_media_type, err_msg,
+                            "index mediaType must be a string") < 0)
+        goto fail;
+    out->media_type = out->raw_media_type
+                          ? oci_media_type_parse(out->raw_media_type)
+                          : OCI_MT_UNKNOWN;
+
+    const cJSON *manifests =
+        cJSON_GetObjectItemCaseSensitive(root, "manifests");
+    if (!manifests || !cJSON_IsArray(manifests)) {
+        set_parse_err(err_msg, "index manifests array missing");
+        goto fail;
+    }
+    int n = cJSON_GetArraySize(manifests);
+    if (n < 0)
+        n = 0;
+    if (n > 0) {
+        out->entries = calloc((size_t) n, sizeof(*out->entries));
+        if (!out->entries) {
+            set_parse_err(err_msg, "out of memory allocating index entries");
+            errno = ENOMEM;
+            goto fail;
+        }
+    }
+    for (int i = 0; i < n; i++) {
+        const cJSON *entry = cJSON_GetArrayItem(manifests, i);
+        if (!cJSON_IsObject(entry)) {
+            set_parse_err(err_msg, "index manifest entry is not an object");
+            goto fail;
+        }
+        oci_index_entry_t *slot = &out->entries[out->nentries];
+        if (parse_descriptor(entry, &slot->desc, err_msg) < 0)
+            goto fail;
+        /* Count the entry now that slot->desc holds allocated strings; the
+         * platform parse below can still goto fail, where oci_index_free
+         * must see this slot to reclaim both desc and platform.
+         */
+        out->nentries++;
+        const cJSON *plat = cJSON_GetObjectItemCaseSensitive(entry, "platform");
+        if (parse_platform(plat, &slot->platform, err_msg) < 0)
+            goto fail;
+    }
+
+    cJSON_Delete(root);
+    return 0;
+fail:
+    /* Latch errno before the cleanup calls get a chance to disturb it. */
+    saved_errno = parse_fail_errno();
+    cJSON_Delete(root);
+    oci_index_free(out);
+    errno = saved_errno;
+    return -1;
+}
+
+/* Parse an image config: architecture, os, optional variant, the optional
+ * inner "config" runtime object (User, WorkingDir, Env, Entrypoint, Cmd),
+ * and the required rootfs object (type "layers" plus diff_ids, each of
+ * which must be a well-formed digest).
+ *
+ * Returns 0 with *out populated (release with oci_image_config_free). On
+ * failure returns -1 with *out released and zeroed, a static message in
+ * *err_msg, and errno set to ENOMEM or EINVAL.
+ */
+int oci_image_config_parse(const char *json,
+                           size_t len,
+                           oci_image_config_t *out,
+                           const char **err_msg)
+{
+    int saved_errno;
+
+    if (!json || !out) {
+        set_parse_err(err_msg, "oci_image_config_parse: NULL input");
+        return -1;
+    }
+    memset(out, 0, sizeof(*out));
+
+    cJSON *root = cJSON_ParseWithLength(json, len);
+    if (!root) {
+        set_parse_err(err_msg, "image config JSON is malformed");
+        return -1;
+    }
+    if (!cJSON_IsObject(root)) {
+        set_parse_err(err_msg, "image config JSON root is not an object");
+        goto fail;
+    }
+
+    if (require_string(root, "architecture", &out->architecture, err_msg,
+                       "image config missing architecture",
+                       "image config architecture must be a string") < 0)
+        goto fail;
+    if (require_string(root, "os", &out->os, err_msg, "image config missing os",
+                       "image config os must be a string") < 0)
+        goto fail;
+    if (dup_optional_string(root, "variant", &out->variant, err_msg,
+                            "image config variant must be a string") < 0)
+        goto fail;
+
+    const cJSON *cfg = cJSON_GetObjectItemCaseSensitive(root, "config");
+    if (cfg) {
+        if (!cJSON_IsObject(cfg)) {
+            set_parse_err(err_msg, "image config.config must be an object");
+            goto fail;
+        }
+        if (dup_optional_string(cfg, "User", &out->config.user, err_msg,
+                                "image config User must be a string") < 0)
+            goto fail;
+        if (dup_optional_string(cfg, "WorkingDir", &out->config.working_dir,
+                                err_msg,
+                                "image config WorkingDir must be a string") < 0)
+            goto fail;
+        if (dup_string_array(cfg, "Env", &out->config.env, err_msg,
+                             "image config Env must be a string array",
+                             false) < 0)
+            goto fail;
+        if (dup_string_array(
+                cfg, "Entrypoint", &out->config.entrypoint, err_msg,
+                "image config Entrypoint must be a string array", false) < 0)
+            goto fail;
+        if (dup_string_array(cfg, "Cmd", &out->config.cmd, err_msg,
+                             "image config Cmd must be a string array",
+                             false) < 0)
+            goto fail;
+    }
+
+    const cJSON *rootfs = cJSON_GetObjectItemCaseSensitive(root, "rootfs");
+    if (!rootfs || !cJSON_IsObject(rootfs)) {
+        set_parse_err(err_msg, "image config rootfs object missing");
+        goto fail;
+    }
+    const cJSON *type = cJSON_GetObjectItemCaseSensitive(rootfs, "type");
+    if (!type || !cJSON_IsString(type) || !type->valuestring ||
+        strcmp(type->valuestring, "layers") != 0) {
+        set_parse_err(err_msg, "image config rootfs.type must be \"layers\"");
+        goto fail;
+    }
+    if (dup_string_array(rootfs, "diff_ids", &out->rootfs_diff_ids, err_msg,
+                         "image config rootfs.diff_ids must be a string "
+                         "array",
+                         true) < 0)
+        goto fail;
+    /* Validate every diff_id is a recognized digest. */
+    for (char **p = out->rootfs_diff_ids; p && *p; p++) {
+        oci_digest_algo_t algo;
+        char hex[OCI_DIGEST_HEX_MAX + 1];
+        if (!oci_digest_parse(*p, &algo, hex)) {
+            set_parse_err(err_msg,
+                          "image config rootfs.diff_ids entry is malformed "
+                          "or not lowercase");
+            goto fail;
+        }
+    }
+
+    cJSON_Delete(root);
+    return 0;
+fail:
+    /* Latch errno before the cleanup calls get a chance to disturb it. */
+    saved_errno = parse_fail_errno();
+    cJSON_Delete(root);
+    oci_image_config_free(out);
+    errno = saved_errno;
+    return -1;
+}
+
+/* Free the descriptor's heap strings and zero the struct. Safe on NULL. */
+void oci_descriptor_free(oci_descriptor_t *d)
+{
+    if (!d)
+        return;
+    free(d->digest_str);
+    free(d->raw_media_type);
+    memset(d, 0, sizeof(*d));
+}
+
+/* Free the platform's heap strings and zero the struct. Safe on NULL. */
+void oci_platform_free(oci_platform_t *p)
+{
+    if (!p)
+        return;
+    free(p->architecture);
+    free(p->os);
+    free(p->variant);
+    free(p->os_version);
+    memset(p, 0, sizeof(*p));
+}
+
+/* Free the runtime block's strings and NULL-terminated string arrays, then
+ * zero the struct. Safe on NULL.
+ */
+static void runtime_free(oci_image_runtime_t *r)
+{
+    if (!r)
+        return;
+    free(r->user);
+    free(r->working_dir);
+    if (r->env) {
+        for (char **p = r->env; *p; p++)
+            free(*p);
+        free(r->env);
+    }
+    if (r->entrypoint) {
+        for (char **p = r->entrypoint; *p; p++)
+            free(*p);
+        free(r->entrypoint);
+    }
+    if (r->cmd) {
+        for (char **p = r->cmd; *p; p++)
+            free(*p);
+        free(r->cmd);
+    }
+    memset(r, 0, sizeof(*r));
+}
+
+/* Free the config descriptor and the first nlayers layer descriptors, then
+ * zero the struct. Safe on NULL.
+ */
+void oci_manifest_free(oci_manifest_t *m)
+{
+    if (!m)
+        return;
+    free(m->raw_media_type);
+    oci_descriptor_free(&m->config);
+    for (size_t i = 0; i < m->nlayers; i++)
+        oci_descriptor_free(&m->layers[i]);
+    free(m->layers);
+    memset(m, 0, sizeof(*m));
+}
+
+/* Free the first nentries entries (descriptor and platform each), then zero
+ * the struct. Safe on NULL.
+ */
+void oci_index_free(oci_index_t *idx)
+{
+    if (!idx)
+        return;
+    free(idx->raw_media_type);
+    for (size_t i = 0; i < idx->nentries; i++) {
+        oci_descriptor_free(&idx->entries[i].desc);
+        oci_platform_free(&idx->entries[i].platform);
+    }
+    free(idx->entries);
+    memset(idx, 0, sizeof(*idx));
+}
+
+/* Free every heap field of the image config, then zero the struct. Safe on
+ * NULL.
+ */
+void oci_image_config_free(oci_image_config_t *c)
+{
+    if (!c)
+        return;
+    free(c->architecture);
+    free(c->os);
+    free(c->variant);
+    runtime_free(&c->config);
+    if (c->rootfs_diff_ids) {
+        for (char **p = c->rootfs_diff_ids; *p; p++)
+            free(*p);
+        free(c->rootfs_diff_ids);
+    }
+    memset(c, 0, sizeof(*c));
+}
+
+/* Pick the linux/arm64 manifest entry from an index. A "v8" variant wins
+ * immediately; otherwise the first entry with an empty variant is preferred
+ * over the first entry with any other variant. Returns NULL when nothing
+ * matches, or when idx or idx->entries is NULL.
+ */
+const oci_index_entry_t *oci_index_pick_linux_arm64(const oci_index_t *idx)
+{
+    if (!idx || !idx->entries)
+        return NULL;
+
+    const oci_index_entry_t *fallback_empty = NULL;
+    const oci_index_entry_t *fallback_any = NULL;
+
+    for (size_t i = 0; i < idx->nentries; i++) {
+        const oci_index_entry_t *e = &idx->entries[i];
+        if (strcmp(e->platform.os, "linux") != 0)
+            continue;
+        if (strcmp(e->platform.architecture, "arm64") != 0)
+            continue;
+        /* Skip foreign or unrecognized manifest media types: the registry
+         * fetch path cannot consume them anyway, so they are not viable
+         * even when the platform matches.
+         */
+        if (!oci_media_type_is_manifest(e->desc.media_type))
+            continue;
+        if (strcmp(e->platform.variant, "v8") == 0)
+            return e;
+        if (e->platform.variant[0] == '\0') {
+            if (!fallback_empty)
+                fallback_empty = e;
+        } else if (!fallback_any) {
+            fallback_any = e;
+        }
+    }
+    return fallback_empty ? fallback_empty : fallback_any;
+}
diff --git a/src/oci/manifest.h b/src/oci/manifest.h
new file mode 100644
index 0000000..66ff14d
--- /dev/null
+++ b/src/oci/manifest.h
@@ -0,0 +1,160 @@
+/* OCI image manifest, image index, and image config parsers
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Parses the three JSON document types served by an OCI / Docker registry:
+ *
+ *   - image manifest: config descriptor + ordered layer descriptors
+ *   - image index: platform-tagged manifest descriptors (multi-arch)
+ *   - image config: architecture/os + runtime fields + rootfs diff_ids
+ *
+ * Phase 1 keeps the model offline: parsers operate on in-memory JSON bytes
+ * the caller already obtained from a registry fetch or disk fixture. The
+ * registry client lives in a later slice; the manifest model exists now so
+ * the fetch path can deserialize responses, and so the blob store can
+ * persist the parsed graph without round-tripping through opaque JSON.
+ *
+ * Every descriptor digest is validated up-front with oci/digest.c, so a
+ * parsed oci_descriptor_t is guaranteed to have a lowercase
+ * "algo:hex" form and a populated (algo, hex[]) pair the blob store can
+ * consume directly.
+ *
+ * Unknown / extension media types do not fail the parse; they are recorded
+ * with raw_media_type set and media_type == OCI_MT_UNKNOWN so callers can
+ * decide whether to ignore or reject. The selection helper for
+ * linux/arm64 manifests intentionally skips any entry that already failed
+ * media-type recognition because the registry fetch path cannot resolve
+ * it anyway.
+ */
+
+#pragma once
+
+/* NOTE(review): the header names on these two directives were lost in
+ * extraction. They are restored from the identifiers this header uses
+ * (size_t, int64_t) -- confirm against the original.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+#include "digest.h"
+#include "media-type.h"
+
+typedef struct {
+    /* Original "algo:hex" string, lowercase, never NULL after parse. */
+    char *digest_str;
+    /* Parsed digest algorithm. */
+    oci_digest_algo_t algo;
+    /* Parsed lowercase hex (NUL-terminated). */
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    /* Declared size in bytes. Negative values are rejected at parse. */
+    int64_t size;
+    /* Canonical media-type enum, OCI_MT_UNKNOWN if not in the recognized
+     * table.
+     */
+    oci_media_type_t media_type;
+    /* Original media-type string for diagnostics. NULL if absent. */
+    char *raw_media_type;
+} oci_descriptor_t;
+
+typedef struct {
+    /* "arm64", "amd64", "ppc64le", ... Never NULL after parse. */
+    char *architecture;
+    /* "linux", "windows", ... Never NULL after parse. */
+    char *os;
+    /* "v8", "v7", "" (empty string when absent in JSON). */
+    char *variant;
+    /* "10.0.14393.1066" for Windows builds, "" otherwise. */
+    char *os_version;
+} oci_platform_t;
+
+typedef struct {
+    oci_descriptor_t desc;
+    /* Empty platform fields ("" strings, not NULL) when JSON omits them so
+     * predicates can compare unconditionally.
+     */
+    oci_platform_t platform;
+} oci_index_entry_t;
+
+typedef struct {
+    int schema_version;
+    /* Top-level mediaType field. OCI manifests carry an explicit mediaType;
+     * Docker manifests historically rely on the descriptor or HTTP
+     * Content-Type. The parser falls back to OCI_MT_UNKNOWN if the JSON
+     * field is missing and lets the caller cross-check against the
+     * registry's Content-Type.
+     */
+    oci_media_type_t media_type;
+    /* Original mediaType string, NULL if absent. */
+    char *raw_media_type;
+    oci_index_entry_t *entries;
+    size_t nentries;
+} oci_index_t;
+
+typedef struct {
+    int schema_version;
+    oci_media_type_t media_type;
+    char *raw_media_type;
+    oci_descriptor_t config;
+    oci_descriptor_t *layers;
+    size_t nlayers;
+} oci_manifest_t;
+
+/* Image config runtime block (the inner "config" object). Phase 3 of the
+ * OCI roadmap consumes these fields; the model exists in Phase 1 to support
+ * elfuse oci inspect rendering. NULL-terminated string arrays are NULL when
+ * the JSON omits the field; empty arrays are represented as an allocated
+ * one-element array containing only the NULL terminator.
+ */
+typedef struct {
+    /* "User" string from the inner config object; NULL if absent. */
+    char *user;
+    /* "WorkingDir" string from the inner config object; NULL if absent. */
+    char *working_dir;
+    /* "Env" string array, NULL-terminated; NULL if absent. */
+    char **env;
+    /* "Entrypoint" string array, NULL-terminated; NULL if absent. */
+    char **entrypoint;
+    /* "Cmd" string array, NULL-terminated; NULL if absent. */
+    char **cmd;
+} oci_image_runtime_t;
+
+typedef struct {
+    /* Top-level "architecture" string. Required by the parser. */
+    char *architecture;
+    /* Top-level "os" string. Required by the parser. */
+    char *os;
+    /* Top-level "variant" string; NULL if absent. */
+    char *variant;
+    /* Runtime fields from the optional inner "config" object; all members
+     * are NULL when that object is absent.
+     */
+    oci_image_runtime_t config;
+    /* rootfs.diff_ids, NULL-terminated. Always populated (the OCI image-spec
+     * requires "rootfs"); a parse without this field returns -1.
+     */
+    char **rootfs_diff_ids;
+} oci_image_config_t;
+
+/* Parsers. Each takes raw JSON bytes (need not be NUL-terminated; pass the
+ * exact length). On success returns 0 and populates out. On failure returns
+ * -1 with errno preserved when set (ENOMEM, EINVAL) and writes a static
+ * diagnostic message into *err_msg (when err_msg != NULL).
+ */
+int oci_manifest_parse(const char *json,
+                       size_t len,
+                       oci_manifest_t *out,
+                       const char **err_msg);
+
+int oci_index_parse(const char *json,
+                    size_t len,
+                    oci_index_t *out,
+                    const char **err_msg);
+
+int oci_image_config_parse(const char *json,
+                           size_t len,
+                           oci_image_config_t *out,
+                           const char **err_msg);
+
+/* Release any heap fields. Safe on zero-initialised structs and on NULL. */
+void oci_manifest_free(oci_manifest_t *m);
+void oci_index_free(oci_index_t *idx);
+void oci_image_config_free(oci_image_config_t *c);
+void oci_descriptor_free(oci_descriptor_t *d);
+void oci_platform_free(oci_platform_t *p);
+
+/* Select the linux/arm64 manifest from an index. Returns a pointer into
+ * idx->entries on success (caller does not free) or NULL when no acceptable
+ * platform is present. Preference order, highest first:
+ *   1. os=="linux" && arch=="arm64" && variant=="v8"
+ *   2. os=="linux" && arch=="arm64" && variant==""
+ *   3. os=="linux" && arch=="arm64" (any other variant; first wins)
+ * Foreign / unsupported media types are skipped: even if a foreign-layer
+ * manifest claims linux/arm64, the registry fetch path cannot consume it.
+ */
+const oci_index_entry_t *oci_index_pick_linux_arm64(const oci_index_t *idx);
diff --git a/src/oci/media-type.c b/src/oci/media-type.c
new file mode 100644
index 0000000..0c6f9a7
--- /dev/null
+++ b/src/oci/media-type.c
@@ -0,0 +1,194 @@
+/* OCI / Docker media-type canonicalization
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "media-type.h"
+
+/* NOTE(review): the header names on these four directives were lost in
+ * extraction. They are restored from what this file uses (bool, size_t,
+ * memcpy, strcasecmp) -- confirm against the original.
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <strings.h>
+
+struct mt_entry {
+    const char *name;
+    oci_media_type_t kind;
+};
+
+/* All recognized OCI and Docker media types in a single table. Order has no
+ * semantic meaning; the lookup is linear because the table is small (~16
+ * entries) and runs at most once per descriptor parse.
+ */
+static const struct mt_entry MEDIA_TYPES[] = {
+    /* Manifest documents. */
+    {"application/vnd.oci.image.manifest.v1+json", OCI_MT_MANIFEST_OCI},
+    {"application/vnd.docker.distribution.manifest.v2+json",
+     OCI_MT_MANIFEST_DOCKER},
+
+    /* Image indexes / manifest lists. */
+    {"application/vnd.oci.image.index.v1+json", OCI_MT_INDEX_OCI},
+    {"application/vnd.docker.distribution.manifest.list.v2+json",
+     OCI_MT_INDEX_DOCKER},
+
+    /* Image config. */
+    {"application/vnd.oci.image.config.v1+json", OCI_MT_CONFIG_OCI},
+    {"application/vnd.docker.container.image.v1+json", OCI_MT_CONFIG_DOCKER},
+
+    /* Supported layer payloads. */
+    {"application/vnd.oci.image.layer.v1.tar", OCI_MT_LAYER_OCI_TAR},
+    {"application/vnd.oci.image.layer.v1.tar+gzip", OCI_MT_LAYER_OCI_TAR_GZIP},
+    {"application/vnd.oci.image.layer.v1.tar+zstd", OCI_MT_LAYER_OCI_TAR_ZSTD},
+    {"application/vnd.docker.image.rootfs.diff.tar.gzip",
+     OCI_MT_LAYER_DOCKER_TAR_GZIP},
+    {"application/vnd.docker.image.rootfs.diff.tar.zstd",
+     OCI_MT_LAYER_DOCKER_TAR_ZSTD},
+
+    /* Foreign (nondistributable) layers. Recognized so the parser can produce
+     * a precise rejection message instead of falling through to UNKNOWN.
+     */
+    {"application/vnd.oci.image.layer.nondistributable.v1.tar",
+     OCI_MT_LAYER_FOREIGN_OCI},
+    {"application/vnd.oci.image.layer.nondistributable.v1.tar+gzip",
+     OCI_MT_LAYER_FOREIGN_OCI_GZIP},
+    {"application/vnd.docker.image.rootfs.foreign.diff.tar",
+     OCI_MT_LAYER_FOREIGN_DOCKER},
+    {"application/vnd.docker.image.rootfs.foreign.diff.tar.gzip",
+     OCI_MT_LAYER_FOREIGN_DOCKER_GZIP},
+};
+
+#define MEDIA_TYPE_COUNT (sizeof(MEDIA_TYPES) / sizeof(MEDIA_TYPES[0]))
+
+/* Strip surrounding whitespace and any parameters after ';'. Writes the
+ * canonical span into out. Returns the canonical length or 0 if the input
+ * collapses to empty. Also returns 0, writing nothing, when the span does
+ * not fit in out_size bytes including the terminator. Only spaces and tabs
+ * count as whitespace.
+ */
+static size_t canonicalize(const char *s, char *out, size_t out_size)
+{
+    if (!s || out_size == 0)
+        return 0;
+
+    while (*s == ' ' || *s == '\t')
+        s++;
+
+    const char *end = s;
+    while (*end && *end != ';')
+        end++;
+    while (end > s && (end[-1] == ' ' || end[-1] == '\t'))
+        end--;
+
+    size_t len = (size_t) (end - s);
+    if (len == 0 || len >= out_size)
+        return 0;
+    memcpy(out, s, len);
+    out[len] = '\0';
+    return len;
+}
+
+/* Classify a media-type string against MEDIA_TYPES. Returns OCI_MT_UNKNOWN
+ * for NULL, for input that canonicalizes to nothing or overflows the local
+ * buffer, and for any name not in the table.
+ */
+oci_media_type_t oci_media_type_parse(const char *s)
+{
+    if (!s)
+        return OCI_MT_UNKNOWN;
+
+    /* Media-type values in OCI manifests are short; 192 bytes covers every
+     * canonical name in the table with room for adversarial whitespace.
+     */
+    char buf[192];
+    if (canonicalize(s, buf, sizeof(buf)) == 0)
+        return OCI_MT_UNKNOWN;
+
+    /* RFC 6838: media type and subtype tokens are case-insensitive. The
+     * parameter span (after ';') is already stripped by canonicalize, so the
+     * whole of buf is type/subtype and can be matched case-insensitively.
+     */
+    for (size_t i = 0; i < MEDIA_TYPE_COUNT; i++) {
+        if (!strcasecmp(MEDIA_TYPES[i].name, buf))
+            return MEDIA_TYPES[i].kind;
+    }
+    return OCI_MT_UNKNOWN;
+}
+
+/* Reverse lookup: canonical name for an enum value, or NULL when the value
+ * has no table entry (OCI_MT_UNKNOWN included).
+ */
+const char *oci_media_type_name(oci_media_type_t mt)
+{
+    for (size_t i = 0; i < MEDIA_TYPE_COUNT; i++) {
+        if (MEDIA_TYPES[i].kind == mt)
+            return MEDIA_TYPES[i].name;
+    }
+    return NULL;
+}
+
+bool oci_media_type_is_manifest(oci_media_type_t mt)
+{
+    return mt == OCI_MT_MANIFEST_OCI || mt == OCI_MT_MANIFEST_DOCKER;
+}
+
+bool oci_media_type_is_index(oci_media_type_t mt)
+{
+    return mt == OCI_MT_INDEX_OCI || mt == OCI_MT_INDEX_DOCKER;
+}
+
+bool oci_media_type_is_config(oci_media_type_t mt)
+{
+    return mt == OCI_MT_CONFIG_OCI || mt == OCI_MT_CONFIG_DOCKER;
+}
+
+/* A layer media type is either one elfuse can decode or a foreign one.
+ * Defined as the union of the two predicates below so the three lists
+ * cannot drift apart when a media type is added.
+ */
+bool oci_media_type_is_layer(oci_media_type_t mt)
+{
+    return oci_media_type_is_layer_supported(mt) ||
+           oci_media_type_is_foreign(mt);
+}
+
+bool oci_media_type_is_layer_supported(oci_media_type_t mt)
+{
+    switch (mt) {
+    case OCI_MT_LAYER_OCI_TAR:
+    case OCI_MT_LAYER_OCI_TAR_GZIP:
+    case OCI_MT_LAYER_OCI_TAR_ZSTD:
+    case OCI_MT_LAYER_DOCKER_TAR_GZIP:
+    case OCI_MT_LAYER_DOCKER_TAR_ZSTD:
+        return true;
+    default:
+        return false;
+    }
+}
+
+bool oci_media_type_is_foreign(oci_media_type_t mt)
+{
+    switch (mt) {
+    case OCI_MT_LAYER_FOREIGN_OCI:
+    case OCI_MT_LAYER_FOREIGN_OCI_GZIP:
+    case OCI_MT_LAYER_FOREIGN_DOCKER:
+    case OCI_MT_LAYER_FOREIGN_DOCKER_GZIP:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/* Compression carried by a layer media type; OCI_COMPRESSION_NONE for
+ * uncompressed layers and for every non-layer or unknown value.
+ */
+oci_compression_t oci_media_type_compression(oci_media_type_t mt)
+{
+    switch (mt) {
+    case OCI_MT_LAYER_OCI_TAR_GZIP:
+    case OCI_MT_LAYER_DOCKER_TAR_GZIP:
+    case OCI_MT_LAYER_FOREIGN_OCI_GZIP:
+    case OCI_MT_LAYER_FOREIGN_DOCKER_GZIP:
+        return OCI_COMPRESSION_GZIP;
+    case OCI_MT_LAYER_OCI_TAR_ZSTD:
+    case OCI_MT_LAYER_DOCKER_TAR_ZSTD:
+        return OCI_COMPRESSION_ZSTD;
+    default:
+        return OCI_COMPRESSION_NONE;
+    }
+}
diff --git a/src/oci/media-type.h b/src/oci/media-type.h
new file mode 100644
index 0000000..66a2a1b
--- /dev/null
+++ b/src/oci/media-type.h
@@ -0,0 +1,93 @@
+/* OCI / Docker media-type canonicalization
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * OCI image references carry media-type strings on every descriptor. The
+ * registry client, manifest parser, and unpack stage all branch on the media
+ * type, so a single canonical enum lookup keeps the comparisons one place
+ * away from string typos. Docker registries continue to serve the legacy
+ * docker-namespaced media types (vnd.docker.distribution.manifest.v2+json)
+ * even when the image-spec wire format is OCI v1; the table accepts both.
+ *
+ * Foreign (nondistributable) layers are recognized but classified as
+ * unsupported per oci-roadmap.md Q3: elfuse cannot fetch the out-of-band
+ * payload those layers reference, so rejecting them at parse time is the
+ * honest answer rather than carrying a half-supported code path.
+ */
+
+#pragma once
+
+/* NOTE(review): the header name on this directive was lost in extraction.
+ * It is restored from the bool return types below -- confirm against the
+ * original.
+ */
+#include <stdbool.h>
+
+typedef enum {
+    OCI_MT_UNKNOWN = 0,
+
+    /* Manifest documents (single platform). */
+    OCI_MT_MANIFEST_OCI,
+    OCI_MT_MANIFEST_DOCKER,
+
+    /* Image index / manifest list (multi-platform). */
+    OCI_MT_INDEX_OCI,
+    OCI_MT_INDEX_DOCKER,
+
+    /* Image config blob. */
+    OCI_MT_CONFIG_OCI,
+    OCI_MT_CONFIG_DOCKER,
+
+    /* Layer blobs that elfuse can actually consume. */
+    OCI_MT_LAYER_OCI_TAR,
+    OCI_MT_LAYER_OCI_TAR_GZIP,
+    OCI_MT_LAYER_OCI_TAR_ZSTD,
+    OCI_MT_LAYER_DOCKER_TAR_GZIP,
+    OCI_MT_LAYER_DOCKER_TAR_ZSTD,
+
+    /* Foreign layers: distinguishable but explicitly unsupported. */
+    OCI_MT_LAYER_FOREIGN_OCI,
+    OCI_MT_LAYER_FOREIGN_OCI_GZIP,
+    OCI_MT_LAYER_FOREIGN_DOCKER,
+    OCI_MT_LAYER_FOREIGN_DOCKER_GZIP,
+} oci_media_type_t;
+
+typedef enum {
+    OCI_COMPRESSION_NONE,
+    OCI_COMPRESSION_GZIP,
+    OCI_COMPRESSION_ZSTD,
+} oci_compression_t;
+
+/* Classify a media-type string. Trailing parameters after ';' (e.g. charset)
+ * are stripped before matching; surrounding whitespace is ignored. Returns
+ * OCI_MT_UNKNOWN for any string not in the recognized table. NULL is treated
+ * as OCI_MT_UNKNOWN.
+ */
+oci_media_type_t oci_media_type_parse(const char *s);
+
+/* Lookup the canonical name string for a media-type enum. Returns NULL for
+ * OCI_MT_UNKNOWN or an out-of-range enum value. The returned pointer is to
+ * static storage.
+ */
+const char *oci_media_type_name(oci_media_type_t mt);
+
+/* Predicates by document category. Each returns false for OCI_MT_UNKNOWN. */
+bool oci_media_type_is_manifest(oci_media_type_t mt);
+bool oci_media_type_is_index(oci_media_type_t mt);
+bool oci_media_type_is_config(oci_media_type_t mt);
+bool oci_media_type_is_layer(oci_media_type_t mt);
+
+/* True when the layer media type is one elfuse can actually decode. Foreign
+ * layers and OCI_MT_UNKNOWN return false; the manifest parser rejects layer
+ * descriptors that fail this check.
+ */
+bool oci_media_type_is_layer_supported(oci_media_type_t mt);
+
+/* True for the four foreign-layer media types. The manifest parser keeps
+ * these distinguishable so the error message can name the actual layer type
+ * instead of a generic 'unsupported'.
+ */
+bool oci_media_type_is_foreign(oci_media_type_t mt);
+
+/* Compression algorithm carried by a layer media type. Non-layer or unknown
+ * inputs return OCI_COMPRESSION_NONE; callers should gate on
+ * oci_media_type_is_layer first.
+ */ +oci_compression_t oci_media_type_compression(oci_media_type_t mt); diff --git a/src/oci/origin-meta.c b/src/oci/origin-meta.c new file mode 100644 index 0000000..7726550 --- /dev/null +++ b/src/oci/origin-meta.c @@ -0,0 +1,330 @@ +/* OCI unpacked-tree provenance sidecar implementation + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "oci/origin-meta.h" + +#define OCI_ORIGIN_FILE ".elfuse-origin.json" + +/* Conservative ceiling. A realistic sidecar is a few hundred bytes + * (three short JSON strings plus an array of ~64 byte diff_id entries). + * Anything past 1 MiB indicates a corrupt or hostile file and is + * rejected so the reader does not try to malloc gigabytes from a + * stat-spoofed input. + */ +#define OCI_ORIGIN_MAX_BYTES (1u << 20) + +static int set_err(const char **err, const char *msg, int err_no) +{ + if (err) + *err = msg; + errno = err_no; + return -1; +} + +static char *build_path(const char *root_dir, const char *name, int tmp) +{ + size_t want = strlen(root_dir) + 1 + strlen(name) + (tmp ? 4 : 0) + 1; + char *p = malloc(want); + if (!p) + return NULL; + snprintf(p, want, "%s/%s%s", root_dir, name, tmp ? 
".tmp" : ""); + return p; +} + +int oci_origin_write(const char *root_dir, + const char *manifest_digest, + const char *config_digest, + char *const *diff_ids, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!root_dir || !manifest_digest || !config_digest) + return set_err(err, "origin write: NULL argument", EINVAL); + if (!root_dir[0] || !manifest_digest[0] || !config_digest[0]) + return set_err(err, "origin write: empty argument", EINVAL); + + cJSON *root = cJSON_CreateObject(); + if (!root) + return set_err(err, "origin write: cJSON_CreateObject failed", ENOMEM); + if (!cJSON_AddStringToObject(root, "manifest_digest", manifest_digest)) { + cJSON_Delete(root); + return set_err(err, "origin write: manifest_digest add failed", ENOMEM); + } + if (!cJSON_AddStringToObject(root, "config_digest", config_digest)) { + cJSON_Delete(root); + return set_err(err, "origin write: config_digest add failed", ENOMEM); + } + cJSON *arr = cJSON_AddArrayToObject(root, "layer_diffids"); + if (!arr) { + cJSON_Delete(root); + return set_err(err, "origin write: layer_diffids add failed", ENOMEM); + } + if (diff_ids) { + for (char *const *p = diff_ids; *p; p++) { + cJSON *s = cJSON_CreateString(*p); + if (!s) { + cJSON_Delete(root); + return set_err(err, "origin write: diff_id string failed", + ENOMEM); + } + cJSON_AddItemToArray(arr, s); + } + } + + char *json = cJSON_PrintUnformatted(root); + cJSON_Delete(root); + if (!json) + return set_err(err, "origin write: cJSON_Print failed", ENOMEM); + size_t jlen = strlen(json); + + char *tmp_path = build_path(root_dir, OCI_ORIGIN_FILE, 1); + char *final_path = build_path(root_dir, OCI_ORIGIN_FILE, 0); + if (!tmp_path || !final_path) { + free(tmp_path); + free(final_path); + free(json); + return set_err(err, "origin write: path allocation failed", ENOMEM); + } + + int fd = open(tmp_path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (fd < 0) { + int saved = errno; + 
free(tmp_path); + free(final_path); + free(json); + return set_err(err, "origin write: open tmp failed", saved); + } + if (write(fd, json, jlen) != (ssize_t) jlen) { + int saved = errno ? errno : EIO; + close(fd); + unlink(tmp_path); + free(tmp_path); + free(final_path); + free(json); + return set_err(err, "origin write: write failed", saved); + } + if (fsync(fd) < 0) { + int saved = errno; + close(fd); + unlink(tmp_path); + free(tmp_path); + free(final_path); + free(json); + return set_err(err, "origin write: fsync failed", saved); + } + close(fd); + if (rename(tmp_path, final_path) < 0) { + int saved = errno; + unlink(tmp_path); + free(tmp_path); + free(final_path); + free(json); + return set_err(err, "origin write: rename failed", saved); + } + + free(tmp_path); + free(final_path); + free(json); + return 0; +} + +void oci_origin_free(oci_origin_t *o) +{ + if (!o) + return; + free(o->manifest_digest); + free(o->config_digest); + if (o->layer_diffids) { + for (char **p = o->layer_diffids; *p; p++) + free(*p); + free((void *) o->layer_diffids); + } + o->manifest_digest = NULL; + o->config_digest = NULL; + o->layer_diffids = NULL; +} + +/* Slurp / into a heap buffer. Returns the + * buffer (NUL-terminated for safety against cJSON) on success and + * writes the byte length into *out_len; returns NULL on failure with + * errno and *err populated. 
+ */ +static char *slurp_origin(const char *root_dir, + size_t *out_len, + const char **err) +{ + char *path = build_path(root_dir, OCI_ORIGIN_FILE, 0); + if (!path) { + set_err(err, "origin read: path alloc failed", ENOMEM); + return NULL; + } + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + int saved = errno; + set_err(err, "origin read: open failed", saved); + free(path); + return NULL; + } + free(path); + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + set_err(err, "origin read: fstat failed", saved); + return NULL; + } + if (st.st_size <= 0 || (unsigned long) st.st_size > OCI_ORIGIN_MAX_BYTES) { + close(fd); + set_err(err, "origin read: file size out of bounds", EINVAL); + return NULL; + } + size_t len = (size_t) st.st_size; + char *buf = malloc(len + 1); + if (!buf) { + close(fd); + set_err(err, "origin read: alloc failed", ENOMEM); + return NULL; + } + size_t off = 0; + while (off < len) { + ssize_t got = read(fd, buf + off, len - off); + if (got < 0) { + if (errno == EINTR) + continue; + int saved = errno; + free(buf); + close(fd); + set_err(err, "origin read: read failed", saved); + return NULL; + } + if (got == 0) + break; + off += (size_t) got; + } + close(fd); + if (off != len) { + free(buf); + set_err(err, "origin read: short read", EIO); + return NULL; + } + buf[len] = '\0'; + *out_len = len; + return buf; +} + +int oci_origin_read(const char *root_dir, oci_origin_t *out, const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!root_dir || !out) + return set_err(err, "origin read: NULL argument", EINVAL); + if (!root_dir[0]) + return set_err(err, "origin read: empty root_dir", EINVAL); + out->manifest_digest = NULL; + out->config_digest = NULL; + out->layer_diffids = NULL; + + size_t json_len = 0; + char *json = slurp_origin(root_dir, &json_len, err); + if (!json) + return -1; + + cJSON *root = cJSON_ParseWithLength(json, json_len); + free(json); + if 
(!root) + return set_err(err, "origin read: JSON parse failed", EINVAL); + + const cJSON *m_field = + cJSON_GetObjectItemCaseSensitive(root, "manifest_digest"); + const cJSON *c_field = + cJSON_GetObjectItemCaseSensitive(root, "config_digest"); + const cJSON *l_field = + cJSON_GetObjectItemCaseSensitive(root, "layer_diffids"); + if (!cJSON_IsString(m_field) || !m_field->valuestring || + !m_field->valuestring[0]) { + cJSON_Delete(root); + return set_err(err, "origin read: missing manifest_digest field", + EINVAL); + } + if (!cJSON_IsString(c_field) || !c_field->valuestring || + !c_field->valuestring[0]) { + cJSON_Delete(root); + return set_err(err, "origin read: missing config_digest field", EINVAL); + } + if (!cJSON_IsArray(l_field)) { + cJSON_Delete(root); + return set_err(err, "origin read: missing layer_diffids array", EINVAL); + } + + int n_diffs = cJSON_GetArraySize(l_field); + if (n_diffs < 0) { + cJSON_Delete(root); + return set_err(err, "origin read: layer_diffids size invalid", EINVAL); + } + + /* +1 slot for the NULL terminator so callers can iterate with the + * idiomatic `for (p = arr; *p; p++)` pattern. 
+ */ + void *diffs_raw = calloc((size_t) n_diffs + 1, sizeof(char *)); + if (!diffs_raw) { + cJSON_Delete(root); + return set_err(err, "origin read: diff array alloc failed", ENOMEM); + } + char **diffs = (char **) diffs_raw; + for (int i = 0; i < n_diffs; i++) { + const cJSON *entry = cJSON_GetArrayItem(l_field, i); + if (!cJSON_IsString(entry) || !entry->valuestring || + !entry->valuestring[0]) { + for (int k = 0; k < i; k++) + free(diffs[k]); + free((void *) diffs); + cJSON_Delete(root); + return set_err(err, + "origin read: layer_diffids entry is not a " + "non-empty string", + EINVAL); + } + diffs[i] = strdup(entry->valuestring); + if (!diffs[i]) { + for (int k = 0; k < i; k++) + free(diffs[k]); + free((void *) diffs); + cJSON_Delete(root); + return set_err(err, "origin read: diff strdup failed", ENOMEM); + } + } + + char *m_copy = strdup(m_field->valuestring); + char *c_copy = strdup(c_field->valuestring); + cJSON_Delete(root); + if (!m_copy || !c_copy) { + free(m_copy); + free(c_copy); + for (int k = 0; k < n_diffs; k++) + free(diffs[k]); + free((void *) diffs); + return set_err(err, "origin read: digest strdup failed", ENOMEM); + } + + out->manifest_digest = m_copy; + out->config_digest = c_copy; + out->layer_diffids = diffs; + return 0; +} diff --git a/src/oci/origin-meta.h b/src/oci/origin-meta.h new file mode 100644 index 0000000..90aeea4 --- /dev/null +++ b/src/oci/origin-meta.h @@ -0,0 +1,71 @@ +/* OCI unpacked-tree provenance sidecar + * + * Records which manifest produced an unpacked image directory so the + * Plan 1 garbage collector can walk unpacked sysroots and recover the + * full set of blobs (manifest, image-config, layer tars + diff-id + * pre-images) still referenced by on-disk state. Without this file the + * mark phase has no way to attribute an unpacked tree back to a stored + * manifest, and a prune sweep would happily delete layer blobs that are + * still backing a live sysroot. 
+ * + * Serialization format: /.elfuse-origin.json with the shape + * { "manifest_digest": "sha256:...", + * "config_digest": "sha256:...", + * "layer_diffids": ["sha256:...", "sha256:..."] } + * + * The diff_ids come from the image-config blob's rootfs.diff_ids field, + * not from the manifest's layer descriptors: per the OCI image spec a + * diff_id is the digest of the uncompressed layer tar, while the + * manifest's layer.digest references the (possibly compressed) blob on + * disk. Recording both lets C1.2's root-set walker map each unpacked + * tree back to every blob it depends on regardless of layer media + * type. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +/* Parsed contents of an unpacked image tree's .elfuse-origin.json. All + * three fields are heap-allocated and owned by the struct: free via + * oci_origin_free. layer_diffids is a NULL-terminated array of + * ":" strings; the array itself is malloc'd and so is every + * entry. An image with zero layers parses to a one-element array + * containing only the NULL terminator. + */ +typedef struct { + char *manifest_digest; + char *config_digest; + char **layer_diffids; +} oci_origin_t; + +/* Write /.elfuse-origin.json via atomic rename. manifest_digest + * and config_digest are NUL-terminated ":" strings; diff_ids + * is a NULL-terminated array of the same form (an empty array is + * permitted and serializes to []). Returns 0 on success, -1 on failure + * with errno set and *err pointing to a static diagnostic. err may be + * NULL. + */ +int oci_origin_write(const char *root_dir, + const char *manifest_digest, + const char *config_digest, + char *const *diff_ids, + const char **err); + +/* Read /.elfuse-origin.json into *out. The struct is + * populated only on success; on failure out is left zeroed. 
Validates + * that manifest_digest and config_digest are present and string-typed + * and that layer_diffids is an array of strings; missing or + * mistyped fields surface as -1 with errno=EINVAL so the C1.3 garbage + * collector treats a malformed sidecar as a fatal root-set hole rather + * than silently dropping the tree's blobs from the keep set. Returns 0 + * on success, -1 on failure with errno set and *err pointing to a + * static diagnostic. err may be NULL. + */ +int oci_origin_read(const char *root_dir, oci_origin_t *out, const char **err); + +/* Release every heap field in o and zero the struct. Safe on a + * zero-initialised struct and on NULL. + */ +void oci_origin_free(oci_origin_t *o); diff --git a/src/oci/path-resolve.c b/src/oci/path-resolve.c new file mode 100644 index 0000000..9304872 --- /dev/null +++ b/src/oci/path-resolve.c @@ -0,0 +1,405 @@ +/* OCI guest PATH resolver: argv0 + PATH + cwd_guest -> host_path/guest_path + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Implementation notes that did not fit in the header: + * + * The candidate probe order matters. Each PATH entry is probed in turn. + * The first entry whose candidate exists, contains, and is executable + * wins (returns success immediately, like execvp). If no entry succeeds + * but at least one entry was found-and-contained-but-not-executable, the + * call surfaces EACCES with the rejected argv0 quoted. Otherwise the + * call surfaces ENOENT with the directories actually probed listed in + * the diagnostic so the operator can tell whether PATH itself was wrong + * or whether the binary just is not in the image. + * + * "Probed" specifically means "the candidate had a realpath that landed + * inside the sysroot". 
Escape symlinks and broken chains do NOT show up + * in the searched-dirs annotation: they were directories that exist on + * the guest side but contributed no host candidate, which is closer to + * what an operator who reads the error wants to see. + * + * realpath(3) is used once for sysroot_dir (so the prefix is canonical + * regardless of whether the caller passed it with or without trailing + * slashes or with intermediate symlinks) and once per probed candidate + * (so the containment check sees the post-symlink path). Both calls + * pass NULL for the resolved-path argument and let libc allocate. + */ + +#include "path-resolve.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Diagnostic scratch shared across the module. Thread-local so a future + * multiplexed oci run that probes several launches in parallel does not + * trample the err pointer between calls. + */ +static _Thread_local char path_err_buf[1024]; + +static void set_err_static(const char **err, const char *msg) +{ + if (err) + *err = msg; +} + +static void set_err_fmt(const char **err, const char *fmt, ...) + __attribute__((format(printf, 2, 3))); + +static void set_err_fmt(const char **err, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vsnprintf(path_err_buf, sizeof(path_err_buf), fmt, ap); + va_end(ap); + if (err) + *err = path_err_buf; +} + +/* Geometric-grow string buffer used to assemble the searched-dirs list + * for the ENOENT diagnostic. Keeping it separate from runspec's strvec + * because the surface area here is small enough that introducing a + * cross-module utility header would be over-engineering for a two-call + * use case. + */ +typedef struct { + char *buf; + size_t len; + size_t cap; +} sbuf_t; + +static int sbuf_append(sbuf_t *s, const char *str) +{ + size_t add = strlen(str); + if (s->len + add + 1 > s->cap) { + size_t newcap = s->cap ? 
/* Concatenate two path components so exactly one '/' separates them when
 * both are non-empty:
 *   - a ends with '/' and b starts with '/': one of the two is dropped;
 *   - neither has the slash and both are non-empty: a '/' is inserted;
 *   - otherwise (one side already supplies the slash, or one side is
 *     empty): plain concatenation.
 * Returns a heap string the caller frees, or NULL when either input is NULL
 * or malloc fails.
 */
static char *path_join(const char *a, const char *b)
{
    if (!a || !b)
        return NULL;
    size_t alen = strlen(a);
    size_t blen = strlen(b);
    char *r = malloc(alen + blen + 2);
    if (!r)
        return NULL;

    bool a_has_trailing = alen > 0 && a[alen - 1] == '/';
    bool b_has_leading = blen > 0 && b[0] == '/';
    const char *tail = b;
    size_t tlen = blen;
    bool insert_sep = false;
    if (a_has_trailing && b_has_leading) {
        tail = b + 1;
        tlen = blen - 1;
    } else if (!a_has_trailing && !b_has_leading && alen > 0 && blen > 0) {
        insert_sep = true;
    }

    char *w = r;
    memcpy(w, a, alen);
    w += alen;
    if (insert_sep)
        *w++ = '/';
    memcpy(w, tail, tlen);
    w[tlen] = '\0';
    return r;
}

/* True when real_candidate equals real_sysroot or sits below it.
 *
 * Both arguments are expected to be absolute, already-resolved paths.
 * Trailing slashes on real_sysroot are ignored (down to a single "/"). The
 * match must end on a path-component boundary so "/srv/rootkit" is not
 * treated as living inside "/srv/root".
 *
 * A sysroot of "/" is special: the one-byte prefix already is the
 * separator, so every absolute candidate is contained. Without this case
 * the boundary test would look at the first byte of the first component
 * and reject everything except "/" itself.
 */
static bool path_within_sysroot(const char *real_candidate,
                                const char *real_sysroot)
{
    size_t slen = strlen(real_sysroot);
    while (slen > 1 && real_sysroot[slen - 1] == '/')
        slen--;
    if (strncmp(real_candidate, real_sysroot, slen) != 0)
        return false;
    if (slen == 1 && real_sysroot[0] == '/')
        return true;
    char trailing = real_candidate[slen];
    return trailing == '\0' || trailing == '/';
}

/* Outcome of probing a single host candidate. */
enum probe_result {
    PROBE_OK = 0,     /* contained, regular file, has an execute bit */
    PROBE_NOEXEC = 1, /* contained regular file without any execute bit */
    PROBE_ESCAPE = 2, /* resolves to a path outside the sysroot */
    PROBE_MISS = 3,   /* does not resolve, cannot be stat'd, or not regular */
};
The realpath()-based containment check runs + * first so escape symlinks never reach the stat() call. The stat() + * itself follows symlinks (POSIX stat semantics), matching how execvp + * would observe the candidate. + */ +static enum probe_result probe_candidate(const char *host_path, + const char *real_sysroot) +{ + char *real = realpath(host_path, NULL); + if (!real) + return PROBE_MISS; + bool contained = path_within_sysroot(real, real_sysroot); + free(real); + if (!contained) + return PROBE_ESCAPE; + struct stat st; + if (stat(host_path, &st) < 0) + return PROBE_MISS; + if (!S_ISREG(st.st_mode)) + return PROBE_MISS; + if ((st.st_mode & 0111) == 0) + return PROBE_NOEXEC; + return PROBE_OK; +} + +/* Direct-mode resolve: argv0 contains '/' so the PATH is bypassed. + * Absolute argv0 maps to ; relative argv0 anchors to + * cwd_guest. + */ +static int resolve_direct(const char *real_sysroot, + const char *argv0, + const char *cwd_guest, + char **out_host_path, + char **out_guest_path, + const char **err) +{ + char *guest_path = NULL; + if (argv0[0] == '/') + guest_path = strdup(argv0); + else + guest_path = path_join(cwd_guest, argv0); + if (!guest_path) { + set_err_static(err, "out of memory building guest path"); + errno = ENOMEM; + return -1; + } + char *host_path = path_join(real_sysroot, guest_path); + if (!host_path) { + free(guest_path); + set_err_static(err, "out of memory building host path"); + errno = ENOMEM; + return -1; + } + enum probe_result probe = probe_candidate(host_path, real_sysroot); + if (probe == PROBE_OK) { + *out_host_path = host_path; + *out_guest_path = guest_path; + return 0; + } + if (probe == PROBE_NOEXEC) { + set_err_fmt(err, "'%s' is not executable inside sysroot", argv0); + errno = EACCES; + } else if (probe == PROBE_ESCAPE) { + set_err_fmt(err, "'%s' resolves outside sysroot (refusing escape)", + argv0); + errno = ENOENT; + } else { + set_err_fmt(err, "cannot find '%s' inside sysroot", argv0); + errno = ENOENT; + } + 
free(host_path); + free(guest_path); + return -1; +} + +/* PATH-search resolve: walk path_env in order, returning the first + * executable candidate. Tracks the first NOEXEC for the late EACCES + * diagnostic and a sbuf_t of probed directories for the late ENOENT + * diagnostic. + */ +static int resolve_path_search(const char *real_sysroot, + const char *argv0, + const char *path_env, + const char *cwd_guest, + char **out_host_path, + char **out_guest_path, + const char **err) +{ + if (!path_env || !*path_env) { + set_err_fmt(err, + "cannot find '%s' on PATH inside sysroot (PATH is" + " empty)", + argv0); + errno = ENOENT; + return -1; + } + + sbuf_t searched = {0}; + char *first_noexec_argv0 = NULL; + int rc = -1; + + const char *p = path_env; + while (*p) { + const char *colon = strchr(p, ':'); + size_t elen = colon ? (size_t) (colon - p) : strlen(p); + char entry[PATH_MAX]; + if (elen >= sizeof(entry)) { + p = colon ? colon + 1 : p + elen; + continue; + } + memcpy(entry, p, elen); + entry[elen] = '\0'; + p = colon ? colon + 1 : p + elen; + + char *guest_dir; + if (elen == 0) + guest_dir = strdup(cwd_guest); + else if (entry[0] == '/') + guest_dir = strdup(entry); + else + guest_dir = path_join(cwd_guest, entry); + if (!guest_dir) { + set_err_static(err, "out of memory building PATH entry"); + errno = ENOMEM; + goto cleanup; + } + + char *guest_path = path_join(guest_dir, argv0); + char *host_path = + guest_path ? 
path_join(real_sysroot, guest_path) : NULL; + if (!guest_path || !host_path) { + free(guest_path); + free(host_path); + free(guest_dir); + set_err_static(err, "out of memory building host candidate"); + errno = ENOMEM; + goto cleanup; + } + + enum probe_result probe = probe_candidate(host_path, real_sysroot); + if (probe == PROBE_OK) { + *out_host_path = host_path; + *out_guest_path = guest_path; + free(guest_dir); + rc = 0; + goto cleanup; + } + if (probe == PROBE_NOEXEC) { + if (searched.len) + if (sbuf_append(&searched, ":") < 0) + goto oom_in_loop; + if (sbuf_append(&searched, guest_dir) < 0) + goto oom_in_loop; + if (!first_noexec_argv0) + first_noexec_argv0 = guest_path; + else + free(guest_path); + free(host_path); + free(guest_dir); + continue; + } + if (probe == PROBE_MISS) { + if (searched.len) + if (sbuf_append(&searched, ":") < 0) + goto oom_in_loop_full; + if (sbuf_append(&searched, guest_dir) < 0) + goto oom_in_loop_full; + } + free(guest_path); + free(host_path); + free(guest_dir); + continue; + + oom_in_loop_full: + free(guest_path); + oom_in_loop: + free(host_path); + free(guest_dir); + set_err_static(err, "out of memory recording PATH entry"); + errno = ENOMEM; + goto cleanup; + } + + if (first_noexec_argv0) { + set_err_fmt(err, "'%s' is not executable inside sysroot", argv0); + errno = EACCES; + free(first_noexec_argv0); + } else { + set_err_fmt(err, + "cannot find '%s' on PATH inside sysroot (searched:" + " %s)", + argv0, searched.buf ? searched.buf : ""); + errno = ENOENT; + } + +cleanup: + /* On success the saved first_noexec_argv0 has already been freed (it + * stays NULL because the OK branch hit first). On failure with no + * NOEXEC seen the same logic applies. The dual-free pattern here + * avoids leaking when a PATH miss happened to also have a noexec + * earlier in the walk. 
+ */ + if (rc == 0 && first_noexec_argv0) + free(first_noexec_argv0); + free(searched.buf); + return rc; +} + +int oci_path_resolve(const char *sysroot_dir, + const char *argv0, + const char *path_env, + const char *cwd_guest, + char **out_host_path, + char **out_guest_path, + const char **err) +{ + if (out_host_path) + *out_host_path = NULL; + if (out_guest_path) + *out_guest_path = NULL; + if (err) + *err = NULL; + + if (!sysroot_dir || !argv0 || !*argv0 || !out_host_path || + !out_guest_path) { + set_err_static(err, "oci_path_resolve: NULL argument or empty argv0"); + errno = EINVAL; + return -1; + } + if (!cwd_guest || !*cwd_guest) + cwd_guest = "/"; + + char *real_sysroot = realpath(sysroot_dir, NULL); + if (!real_sysroot) { + set_err_fmt(err, "sysroot not accessible: %s", sysroot_dir); + /* errno preserved by realpath */ + return -1; + } + + int rc; + if (strchr(argv0, '/')) { + rc = resolve_direct(real_sysroot, argv0, cwd_guest, out_host_path, + out_guest_path, err); + } else { + rc = resolve_path_search(real_sysroot, argv0, path_env, cwd_guest, + out_host_path, out_guest_path, err); + } + + free(real_sysroot); + return rc; +} diff --git a/src/oci/path-resolve.h b/src/oci/path-resolve.h new file mode 100644 index 0000000..d779f8b --- /dev/null +++ b/src/oci/path-resolve.h @@ -0,0 +1,78 @@ +/* OCI guest PATH resolver inside a cloned rootfs + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Resolves a guest argv[0] against the merged guest PATH while keeping the + * lookup contained inside the per-run cloned rootfs. The function is the + * pre-launch bridge between oci_runspec_build (Phase 3 commit 2, which + * decides what argv and what PATH the guest should see) and elfuse_launch + * (Phase 3 commit 4, which expects a host filesystem path to open): + * + * - host_path is the absolute host path elfuse should open() to load + * the guest binary. 
It is the candidate exactly as found via PATH or + * directly via argv0; symlinks are NOT collapsed because the guest + * loader (and any execve() the guest runs internally) want to see the + * name they were invoked under, not the real path. Containment checks + * do use realpath internally; only the result handed back to the caller + * stays in symlink form. + * + * - guest_path is the guest-absolute path the guest itself thinks it is + * running (for argv0[0], for /proc/self/exe, for whatever a tool wants + * to learn about its own name). For PATH-search results it is + * /; for direct-mode results it is argv0 itself + * (absolute) or cwd_guest/argv0 (relative with '/'). + * + * Containment policy: every candidate path is fed to realpath(3) and the + * resolved absolute path must equal sysroot_dir or start with + * sysroot_dir + '/'. Symlink chains that resolve outside the sysroot are + * silently skipped (the PATH search continues) so a malicious or sloppy + * image layer cannot trick elfuse into loading a host-side binary. This + * matches how Docker's runc treats escape symlinks: drop them from the + * search instead of failing the entire launch. + * + * Executability is decided by host stat(2) (follows symlinks) against + * st_mode & 0111. PATH search records the first found-but-not-executable + * candidate and surfaces EACCES if no later entry succeeds, mirroring + * execvp's "first noexec wins" behaviour. + * + * The module deliberately does NOT reuse src/syscall/path.c's + * path_translate_at: that resolver is tied to the running guest's live + * sysroot/cwd plumbing, while this resolver runs before the vCPU starts + * and therefore needs a self-contained containment check. + */ + +#pragma once + +/* Resolve argv0 against PATH inside sysroot_dir. + * + * sysroot_dir must exist and be a directory; it is realpath'd once at + * entry so the containment check is stable across symlinks in the + * sysroot prefix itself. 
argv0 follows POSIX execvp semantics: when it + * contains '/' the PATH is bypassed and argv0 is resolved directly + * (absolute argv0 as a guest-absolute path, relative argv0 anchored to + * cwd_guest). When argv0 has no '/', path_env is split on ':' and each + * entry is treated as a guest-absolute directory (empty entries fall + * back to cwd_guest, matching POSIX). + * + * cwd_guest may be NULL; "/" is used in that case. path_env may be NULL + * or empty for the no-slash argv0 path -- the result is then a clean + * ENOENT with an empty searched-dirs annotation. + * + * On success returns 0 and writes heap-allocated *out_host_path and + * *out_guest_path. The caller frees both. On failure returns -1 with + * errno set (ENOENT for "not found", EACCES for "found but not + * executable", EINVAL for argument errors, ENOMEM for allocation) and + * leaves *out_host_path / *out_guest_path NULL. *err points at a + * diagnostic string; the pointer is valid until the next call from this + * thread. The diagnostic carries argv0 verbatim (quoted) and, for PATH + * search misses, a colon-separated list of the directories that were + * actually probed. + */ +int oci_path_resolve(const char *sysroot_dir, + const char *argv0, + const char *path_env, + const char *cwd_guest, + char **out_host_path, + char **out_guest_path, + const char **err); diff --git a/src/oci/policy.c b/src/oci/policy.c new file mode 100644 index 0000000..3854e24 --- /dev/null +++ b/src/oci/policy.c @@ -0,0 +1,1240 @@ +/* OCI policy.json schema and loader (C6.1) + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Walks the documented config-path chain, parses the JSON body via the + * vendored cJSON, and produces an oci_policy_t the fetcher consults via + * oci_policy_lookup. Implementation notes: + * + * - The loader is lenient on unknown keys (top-level and per-host). 
Each + * unknown key is recorded so a future audit / debug surface can + * surface it; the load itself never fails because of a key the + * reader has not learned about yet. This is what makes the C6.3 + * sigstore.publicKey reservation work without a coordinated rollout. + * - Known fields with wrong types are a hard error. The diagnostic + * spells out the JSON pointer-like path so an operator can fix the + * offending node directly. + * - String fields starting with "~/" or equal to "~" expand against + * $HOME at load time. "~user/" forms pass through verbatim (no + * getpwnam dependency, no surprise expansions). Other paths stay as + * authored. + * - ca_bundle existence is checked at load time so a fetcher that + * consults the policy never races a deleted file mid-pull. auth_file + * existence and 0600 mode are deferred to C6.2 because they share + * failure-mode ergonomics with the fetcher's credential loader. + */ + +#include "policy.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define POLICY_ERR_CAP 512 + +/* has_* flags drive the C6.3 overlay field-level merge: an overlay file only + * carries the fields the operator chose to override, so the merge step must + * distinguish "field was declared (possibly as null)" from "field omitted". + * For base entries the flags are set as side-effects of parsing and the + * lookup path keeps using the NULL-pointer check it already had; the flags + * are only consulted during overlay merge. 
+ */ +typedef struct { + char *host; + bool has_insecure; + bool insecure; + bool has_ca_bundle; + char *ca_bundle; + bool has_auth_file; + char *auth_file; + bool has_sigstore_public_key; + char *sigstore_public_key; + char **unknown_keys; + size_t n_unknown_keys; +} policy_entry_t; + +struct oci_policy { + bool default_insecure; + char *default_ca_bundle; + policy_entry_t *entries; + size_t n_entries; + char *source_path; + char **unknown_top_keys; + size_t n_unknown_top_keys; + char *err_buf; +}; + +static char *xstrdup(const char *s) +{ + if (!s) + return NULL; + char *r = strdup(s); + if (!r) + errno = ENOMEM; + return r; +} + +/* Set p->err_buf to a printf-formatted message and return -1. Allocates the + * scratch buffer lazily so a load that never errors out does not pay for + * one. The caller propagates the pointer through *err_msg. + */ +static int set_err(oci_policy_t *p, const char **err_msg, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); + +static int set_err(oci_policy_t *p, const char **err_msg, const char *fmt, ...) +{ + if (p) { + if (!p->err_buf) { + p->err_buf = malloc(POLICY_ERR_CAP); + if (!p->err_buf) { + if (err_msg) + *err_msg = "out of memory formatting policy error"; + errno = ENOMEM; + return -1; + } + } + va_list ap; + va_start(ap, fmt); + vsnprintf(p->err_buf, POLICY_ERR_CAP, fmt, ap); + va_end(ap); + if (err_msg) + *err_msg = p->err_buf; + } else if (err_msg) { + *err_msg = "policy load failed"; + } + return -1; +} + +/* Compose a per-field diagnostic that adapts to whether the field came from + * the base policy (host-scoped) or a registries.d overlay (file-scoped). The + * "what" suffix is whatever follows the field name in the message; callers + * pre-format the strerror tail when they need one so this stays plain + * non-variadic. 
+ */ +static int field_err(oci_policy_t *p, + const char *src_path, + const char *host, + const char *field, + const char *what, + const char **err_msg) +{ + if (src_path) + return set_err(p, err_msg, "policy overlay '%s': '%s' %s", src_path, + field, what); + return set_err(p, err_msg, "policy 'registries[\"%s\"].%s' %s", host, field, + what); +} + +/* Append s (copied) to *arr / *n. Returns 0 on success, -1 on ENOMEM. */ +static int strarr_push(char ***arr, size_t *n, const char *s) +{ + char **next = (char **) realloc((void *) *arr, (*n + 1) * sizeof(char *)); + if (!next) { + errno = ENOMEM; + return -1; + } + *arr = next; + next[*n] = xstrdup(s); + if (!next[*n]) + return -1; + (*n)++; + return 0; +} + +static void strarr_free(char **arr, size_t n) +{ + if (!arr) + return; + for (size_t i = 0; i < n; i++) + free(arr[i]); + free((void *) arr); +} + +static void entry_free(policy_entry_t *e) +{ + if (!e) + return; + free(e->host); + free(e->ca_bundle); + free(e->auth_file); + free(e->sigstore_public_key); + strarr_free(e->unknown_keys, e->n_unknown_keys); +} + +void oci_policy_free(oci_policy_t *p) +{ + if (!p) + return; + free(p->default_ca_bundle); + for (size_t i = 0; i < p->n_entries; i++) + entry_free(&p->entries[i]); + free(p->entries); + free(p->source_path); + strarr_free(p->unknown_top_keys, p->n_unknown_top_keys); + free(p->err_buf); + free(p); +} + +const char *oci_policy_source(const oci_policy_t *p) +{ + return p && p->source_path ? p->source_path : ""; +} + +/* Expand a "~/..." prefix against $HOME. Pure "~" maps to $HOME. Other + * inputs (including "~user/...") pass through verbatim. Returns a heap- + * owned string the caller frees, or NULL on ENOMEM / missing $HOME. 
+ */ +static char *expand_home(const char *in) +{ + if (!in) { + errno = EINVAL; + return NULL; + } + if (in[0] != '~') + return xstrdup(in); + if (in[1] != '/' && in[1] != '\0') + return xstrdup(in); + const char *home = getenv("HOME"); + if (!home || !*home) { + errno = ENOENT; + return NULL; + } + if (in[1] == '\0') + return xstrdup(home); + /* in[1] == '/': join home + (in + 1). The "~" replacement removes one + * byte; the joined string occupies strlen(home) + strlen(in + 1) + 1. + */ + size_t hl = strlen(home); + size_t tl = strlen(in + 1); + char *r = malloc(hl + tl + 1); + if (!r) { + errno = ENOMEM; + return NULL; + } + memcpy(r, home, hl); + memcpy(r + hl, in + 1, tl); + r[hl + tl] = '\0'; + return r; +} + +/* Join two path components with a single '/'. Returns a heap string or + * NULL on ENOMEM. Either arg may be empty; an empty base just yields + * "/" + tail which is sufficient for the fallback chain (HOME/XDG are + * never empty when present). + */ +static char *path_join(const char *a, const char *b) +{ + size_t al = a ? strlen(a) : 0; + size_t bl = b ? strlen(b) : 0; + char *r = malloc(al + 1 + bl + 1); + if (!r) { + errno = ENOMEM; + return NULL; + } + if (al) + memcpy(r, a, al); + r[al] = '/'; + if (bl) + memcpy(r + al + 1, b, bl); + r[al + 1 + bl] = '\0'; + return r; +} + +/* Slurp a file into a heap buffer. Returns a NUL-terminated string and + * writes the byte count to *out_len. Caller frees. On failure returns + * NULL with errno preserved. 
+ */ +static char *slurp_file(const char *path, size_t *out_len) +{ + int fd = open(path, O_RDONLY); + if (fd < 0) + return NULL; + struct stat st; + if (fstat(fd, &st) < 0) { + int e = errno; + close(fd); + errno = e; + return NULL; + } + if (!S_ISREG(st.st_mode)) { + close(fd); + errno = EINVAL; + return NULL; + } + if (st.st_size < 0 || (uint64_t) st.st_size >= (uint64_t) SIZE_MAX) { + close(fd); + errno = EFBIG; + return NULL; + } + size_t len = (size_t) st.st_size; + char *buf = malloc(len + 1); + if (!buf) { + close(fd); + errno = ENOMEM; + return NULL; + } + size_t off = 0; + while (off < len) { + ssize_t n = read(fd, buf + off, len - off); + if (n < 0) { + if (errno == EINTR) + continue; + int e = errno; + free(buf); + close(fd); + errno = e; + return NULL; + } + if (n == 0) + break; + off += (size_t) n; + } + close(fd); + buf[off] = '\0'; + if (out_len) + *out_len = off; + return buf; +} + +/* Resolve the candidate path chain. On success writes a heap-owned + * absolute path into *out and returns 0; *out is NULL when no candidate + * exists (caller uses built-in default). Returns -1 on hard errors: + * $ELFUSE_POLICY_FILE points at a missing file, a fallback path fails to + * open with errno != ENOENT, or allocation fails. Diagnostic messages + * go through set_err on the caller-supplied policy. 
+ */
+static int resolve_path(oci_policy_t *p, char **out, const char **err_msg)
+{
+    *out = NULL;
+
+    /* An explicit override is authoritative: it must exist and be a regular
+     * file, otherwise the load fails so the operator learns about the typo.
+     */
+    const char *env_override = getenv("ELFUSE_POLICY_FILE");
+    if (env_override && *env_override) {
+        struct stat st;
+        if (stat(env_override, &st) < 0) {
+            int e = errno;
+            int rc = set_err(p, err_msg,
+                             "ELFUSE_POLICY_FILE='%s' does not exist: %s",
+                             env_override, strerror(e));
+            errno = e;
+            return rc;
+        }
+        if (!S_ISREG(st.st_mode)) {
+            errno = EINVAL;
+            return set_err(p, err_msg,
+                           "ELFUSE_POLICY_FILE='%s' is not a regular file",
+                           env_override);
+        }
+        *out = xstrdup(env_override);
+        if (!*out)
+            return set_err(p, err_msg,
+                           "out of memory recording ELFUSE_POLICY_FILE path");
+        return 0;
+    }
+
+    const char *home = getenv("HOME");
+
+    /* Fallback 1: XDG_CONFIG_HOME or $HOME/.config */
+    const char *xdg = getenv("XDG_CONFIG_HOME");
+    char *xdg_root = NULL;
+    if (xdg && *xdg) {
+        xdg_root = xstrdup(xdg);
+    } else if (home && *home) {
+        xdg_root = path_join(home, ".config");
+    }
+    if (xdg_root) {
+        char *elf_dir = path_join(xdg_root, "elfuse");
+        free(xdg_root);
+        char *candidate = NULL;
+        if (elf_dir) {
+            candidate = path_join(elf_dir, "policy.json");
+            free(elf_dir);
+        }
+        if (!candidate)
+            return set_err(p, err_msg,
+                           "out of memory composing XDG policy path");
+        struct stat st;
+        if (stat(candidate, &st) < 0) {
+            /* errno is only meaningful when stat() failed. ENOENT means the
+             * candidate is simply absent and the chain continues; anything
+             * else is a hard error.
+             */
+            int e = errno;
+            if (e != ENOENT) {
+                int rc = set_err(
+                    p, err_msg,
+                    "policy candidate '%s' could not be stat'd: %s",
+                    candidate, strerror(e));
+                free(candidate);
+                errno = e;
+                return rc;
+            }
+        } else if (S_ISREG(st.st_mode)) {
+            *out = candidate;
+            return 0;
+        } else {
+            /* The path exists but is a directory, socket, etc. stat()
+             * succeeded, so errno carries no information here; report the
+             * real problem instead of consulting a stale errno value.
+             */
+            int rc = set_err(p, err_msg,
+                             "policy candidate '%s' is not a regular file",
+                             candidate);
+            free(candidate);
+            errno = EINVAL;
+            return rc;
+        }
+        free(candidate);
+    }
+
+    /* Fallback 2: $HOME/Library/Application Support/elfuse/policy.json */
+    if (home && *home) {
+        const char *suffix = "/Library/Application Support/elfuse/policy.json";
+        size_t n = strlen(home) + strlen(suffix) + 1;
+        char *candidate = malloc(n);
+        if (!candidate) {
+            errno = ENOMEM;
+            return set_err(p, err_msg,
+                           "out of memory composing Library policy path");
+        }
+        snprintf(candidate, n, "%s%s", home, suffix);
+        struct stat st;
+        if (stat(candidate, &st) < 0) {
+            /* Same rules as fallback 1: only a failed stat() sets errno. */
+            int e = errno;
+            if (e != ENOENT) {
+                int rc = set_err(
+                    p, err_msg,
+                    "policy candidate '%s' could not be stat'd: %s",
+                    candidate, strerror(e));
+                free(candidate);
+                errno = e;
+                return rc;
+            }
+        } else if (S_ISREG(st.st_mode)) {
+            *out = candidate;
+            return 0;
+        } else {
+            int rc = set_err(p, err_msg,
+                             "policy candidate '%s' is not a regular file",
+                             candidate);
+            free(candidate);
+            errno = EINVAL;
+            return rc;
+        }
+        free(candidate);
+    }
+
+    /* Empty chain: caller falls back to built-in default. */
+    return 0;
+}
+
+/* Type checks used while walking the schema. cJSON treats true/false as
+ * separate node types so the unified bool predicate calls both probes.
+ */
+static bool json_is_bool(const cJSON *n)
+{
+    return n && (cJSON_IsBool(n) || cJSON_IsTrue(n) || cJSON_IsFalse(n));
+}
+
+static bool known_default_key(const char *k)
+{
+    return !strcmp(k, "insecure") || !strcmp(k, "ca_bundle");
+}
+
+static bool known_entry_key(const char *k)
+{
+    return !strcmp(k, "insecure") || !strcmp(k, "ca_bundle") ||
+           !strcmp(k, "auth_file") || !strcmp(k, "sigstore");
+}
+
+static bool known_top_key(const char *k)
+{
+    return !strcmp(k, "default") || !strcmp(k, "registries");
+}
+
+/* Parse a "default" block. Missing block leaves the policy on its
+ * zero-value defaults. Bad shapes raise hard errors via set_err.
+ */ +static int parse_default_block(oci_policy_t *p, + cJSON *node, + const char **err_msg) +{ + if (!cJSON_IsObject(node)) + return set_err(p, err_msg, "policy 'default' must be a JSON object"); + cJSON *child; + cJSON_ArrayForEach(child, node) + { + const char *k = child->string; + if (!k) + continue; + if (!strcmp(k, "insecure")) { + if (!json_is_bool(child)) + return set_err(p, err_msg, + "policy 'default.insecure' must be boolean"); + p->default_insecure = cJSON_IsTrue(child); + } else if (!strcmp(k, "ca_bundle")) { + if (cJSON_IsNull(child)) { + free(p->default_ca_bundle); + p->default_ca_bundle = NULL; + } else if (cJSON_IsString(child) && child->valuestring) { + char *expanded = expand_home(child->valuestring); + if (!expanded) + return set_err( + p, err_msg, + "policy 'default.ca_bundle' expansion failed: %s", + strerror(errno)); + free(p->default_ca_bundle); + p->default_ca_bundle = expanded; + } else { + return set_err( + p, err_msg, + "policy 'default.ca_bundle' must be a string or null"); + } + } else if (!known_default_key(k)) { + /* Forward-compat: future shaped defaults silently accepted. + * No record on the default block; the per-host slot does. + */ + } + } + if (p->default_ca_bundle) { + struct stat st; + if (stat(p->default_ca_bundle, &st) < 0 || !S_ISREG(st.st_mode)) + return set_err( + p, err_msg, + "policy 'default.ca_bundle' file '%s' is not accessible", + p->default_ca_bundle); + } + return 0; +} + +/* Parse a "sigstore" sub-object. Only publicKey is read; other keys go onto + * the parent entry's unknown_keys list with a "sigstore." prefix so the + * diagnostic stays unambiguous. src_path is NULL for base policy parsing and + * the overlay file path for the registries.d path; field_err picks the right + * shape. 
+ */ +static int parse_sigstore_fields(oci_policy_t *p, + policy_entry_t *e, + cJSON *node, + const char *src_path, + const char **err_msg) +{ + if (!cJSON_IsObject(node)) + return field_err(p, src_path, e->host, "sigstore", + "must be a JSON object", err_msg); + cJSON *child; + cJSON_ArrayForEach(child, node) + { + const char *k = child->string; + if (!k) + continue; + if (!strcmp(k, "publicKey")) { + if (!cJSON_IsString(child) || !child->valuestring) + return field_err(p, src_path, e->host, "sigstore.publicKey", + "must be a string", err_msg); + char *expanded = expand_home(child->valuestring); + if (!expanded) { + char what[256]; + snprintf(what, sizeof(what), "expansion failed: %s", + strerror(errno)); + return field_err(p, src_path, e->host, "sigstore.publicKey", + what, err_msg); + } + free(e->sigstore_public_key); + e->sigstore_public_key = expanded; + e->has_sigstore_public_key = true; + } else { + char composed[256]; + snprintf(composed, sizeof(composed), "sigstore.%s", k); + if (strarr_push(&e->unknown_keys, &e->n_unknown_keys, composed) < + 0) { + if (src_path) + return set_err(p, err_msg, + "policy overlay '%s' sigstore unknown-key " + "recording failed", + src_path); + return set_err(p, err_msg, + "policy 'registries[\"%s\"].sigstore' " + "unknown-key recording failed", + e->host); + } + } + } + return 0; +} + +/* Shared field parser for per-host entries. Used by the base policy parser + * (src_path == NULL, e is the live array slot) and by the C6.3 overlay parser + * (src_path is the overlay file path, e is a scratch policy_entry_t the + * caller merges into the target). Only sets fields; the ca_bundle stat check + * is left to the caller so each context can tailor the diagnostic. 
+ */
+static int parse_entry_fields(oci_policy_t *p,
+                              policy_entry_t *e,
+                              cJSON *node,
+                              const char *src_path,
+                              const char **err_msg)
+{
+    cJSON *child;
+    cJSON_ArrayForEach(child, node)
+    {
+        const char *k = child->string;
+        if (!k)
+            continue;
+        if (!strcmp(k, "insecure")) {
+            if (!json_is_bool(child))
+                return field_err(p, src_path, e->host, "insecure",
+                                 "must be boolean", err_msg);
+            e->has_insecure = true;
+            e->insecure = cJSON_IsTrue(child);
+        } else if (!strcmp(k, "ca_bundle")) {
+            if (cJSON_IsNull(child)) {
+                free(e->ca_bundle);
+                e->ca_bundle = NULL;
+                e->has_ca_bundle = true;
+            } else if (cJSON_IsString(child) && child->valuestring) {
+                char *expanded = expand_home(child->valuestring);
+                if (!expanded) {
+                    char what[256];
+                    snprintf(what, sizeof(what), "expansion failed: %s",
+                             strerror(errno));
+                    return field_err(p, src_path, e->host, "ca_bundle", what,
+                                     err_msg);
+                }
+                free(e->ca_bundle);
+                e->ca_bundle = expanded;
+                e->has_ca_bundle = true;
+            } else {
+                return field_err(p, src_path, e->host, "ca_bundle",
+                                 "must be a string or null", err_msg);
+            }
+        } else if (!strcmp(k, "auth_file")) {
+            if (!cJSON_IsString(child) || !child->valuestring)
+                return field_err(p, src_path, e->host, "auth_file",
+                                 "must be a string", err_msg);
+            char *expanded = expand_home(child->valuestring);
+            if (!expanded) {
+                char what[256];
+                snprintf(what, sizeof(what), "expansion failed: %s",
+                         strerror(errno));
+                return field_err(p, src_path, e->host, "auth_file", what,
+                                 err_msg);
+            }
+            free(e->auth_file);
+            e->auth_file = expanded;
+            e->has_auth_file = true;
+        } else if (!strcmp(k, "sigstore")) {
+            if (parse_sigstore_fields(p, e, child, src_path, err_msg) < 0)
+                return -1;
+        } else if (!known_entry_key(k)) {
+            if (strarr_push(&e->unknown_keys, &e->n_unknown_keys, k) < 0) {
+                if (src_path)
+                    return set_err(p, err_msg,
+                                   "policy overlay '%s' unknown-key "
+                                   "recording failed",
+                                   src_path);
+                return set_err(p, err_msg,
+                               "policy 'registries[\"%s\"]' "
+                               "unknown-key recording failed",
+                               e->host);
+            }
+        }
+    }
+    return 0;
+}
+
+static int parse_entry_block(oci_policy_t *p,
+                             policy_entry_t *e,
+                             cJSON *node,
+                             const char **err_msg)
+{
+    if (!cJSON_IsObject(node))
+        return set_err(p, err_msg,
+                       "policy 'registries[\"%s\"]' must be a JSON object",
+                       e->host);
+    if (parse_entry_fields(p, e, node, NULL, err_msg) < 0)
+        return -1;
+    if (e->ca_bundle) {
+        struct stat st;
+        if (stat(e->ca_bundle, &st) < 0 || !S_ISREG(st.st_mode))
+            return set_err(p, err_msg,
+                           "policy 'registries[\"%s\"].ca_bundle' file '%s' "
+                           "is not accessible",
+                           e->host, e->ca_bundle);
+    }
+    return 0;
+}
+
+/* Parse the "registries" object: one policy_entry_t per member, keyed by the
+ * member name (the registry host). Returns 0 on success, -1 with the
+ * diagnostic recorded through set_err on a bad shape or allocation failure.
+ *
+ * The entries array is grown, never replaced. cJSON keeps duplicate object
+ * members, so a document that repeats the top-level "registries" key reaches
+ * this function more than once; appending after the existing n_entries keeps
+ * every write inside the allocation and keeps earlier entries reachable for
+ * oci_policy_free.
+ */
+static int parse_registries_block(oci_policy_t *p,
+                                  cJSON *node,
+                                  const char **err_msg)
+{
+    if (!cJSON_IsObject(node))
+        return set_err(p, err_msg, "policy 'registries' must be a JSON object");
+    int count = cJSON_GetArraySize(node);
+    if (count <= 0)
+        return 0;
+    size_t n = (size_t) count;
+    policy_entry_t *grown =
+        realloc(p->entries, (p->n_entries + n) * sizeof(*grown));
+    if (!grown) {
+        errno = ENOMEM;
+        return set_err(p, err_msg,
+                       "policy 'registries' entry allocation failed");
+    }
+    /* Zero only the new tail: slots below n_entries are live entries. */
+    memset(grown + p->n_entries, 0, n * sizeof(*grown));
+    p->entries = grown;
+    cJSON *child;
+    cJSON_ArrayForEach(child, node)
+    {
+        const char *host = child->string;
+        if (!host)
+            continue;
+        policy_entry_t *e = &p->entries[p->n_entries];
+        e->host = xstrdup(host);
+        if (!e->host)
+            return set_err(p, err_msg,
+                           "policy 'registries[\"%s\"]' host copy failed",
+                           host);
+        /* Count the slot before parsing so a failed parse still leaves it
+         * visible to oci_policy_free.
+         */
+        p->n_entries++;
+        if (parse_entry_block(p, e, child, err_msg) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+/* Find an existing entry by host, or grow the entries array by one and
+ * initialise a new slot. Returns the slot or NULL on ENOMEM. New slots have
+ * host set and all fields zeroed; the caller (overlay merge) fills them via
+ * merge_overlay_into_entry. On xstrdup failure for the new slot's host, the
+ * grown array stays allocated but n_entries is not bumped, so oci_policy_free
+ * walks the same n_entries it already had.
+ */ +static policy_entry_t *entry_grow_and_get(oci_policy_t *p, const char *host) +{ + for (size_t i = 0; i < p->n_entries; i++) { + if (p->entries[i].host && !strcmp(p->entries[i].host, host)) + return &p->entries[i]; + } + policy_entry_t *next = + realloc(p->entries, (p->n_entries + 1) * sizeof(*next)); + if (!next) { + errno = ENOMEM; + return NULL; + } + p->entries = next; + policy_entry_t *e = &p->entries[p->n_entries]; + memset(e, 0, sizeof(*e)); + e->host = xstrdup(host); + if (!e->host) + return NULL; + p->n_entries++; + return e; +} + +/* Move declared fields from an overlay-parsed scratch entry into the target + * entry, freeing whatever the target previously held. Pointer ownership + * transfers to the target; the overlay's pointers are nulled so the caller's + * entry_free does not double-free. unknown_keys are appended by copy via + * strarr_push -- the originals stay in the overlay for entry_free to release. + */ +static int merge_overlay_into_entry(policy_entry_t *tgt, + policy_entry_t *ov, + const char **err_msg) +{ + if (ov->has_insecure) { + tgt->has_insecure = true; + tgt->insecure = ov->insecure; + } + if (ov->has_ca_bundle) { + free(tgt->ca_bundle); + tgt->ca_bundle = ov->ca_bundle; + ov->ca_bundle = NULL; + tgt->has_ca_bundle = true; + } + if (ov->has_auth_file) { + free(tgt->auth_file); + tgt->auth_file = ov->auth_file; + ov->auth_file = NULL; + tgt->has_auth_file = true; + } + if (ov->has_sigstore_public_key) { + free(tgt->sigstore_public_key); + tgt->sigstore_public_key = ov->sigstore_public_key; + ov->sigstore_public_key = NULL; + tgt->has_sigstore_public_key = true; + } + for (size_t i = 0; i < ov->n_unknown_keys; i++) { + if (strarr_push(&tgt->unknown_keys, &tgt->n_unknown_keys, + ov->unknown_keys[i]) < 0) { + (void) err_msg; + errno = ENOMEM; + return -1; + } + } + return 0; +} + +/* Parse a single registries.d/.json overlay file and field-merge it + * into the target entry (created if absent). 
Failure leaves the target + * unchanged on any path past the merge call; failures before merge never + * touched the target. The overlay scratch entry is always released here. + */ +static int parse_overlay_file(oci_policy_t *p, + const char *host, + const char *file_path, + const char **err_msg) +{ + size_t body_len = 0; + char *body = slurp_file(file_path, &body_len); + if (!body) + return set_err(p, err_msg, "policy overlay '%s' could not be read: %s", + file_path, strerror(errno)); + cJSON *root = cJSON_ParseWithLength(body, body_len); + free(body); + if (!root) + return set_err(p, err_msg, "policy overlay '%s' is not valid JSON", + file_path); + + int rc = -1; + policy_entry_t overlay; + memset(&overlay, 0, sizeof(overlay)); + + if (!cJSON_IsObject(root)) { + (void) set_err(p, err_msg, "policy overlay '%s' must be a JSON object", + file_path); + goto out; + } + overlay.host = xstrdup(host); + if (!overlay.host) { + (void) set_err(p, err_msg, "out of memory parsing policy overlay '%s'", + file_path); + goto out; + } + if (parse_entry_fields(p, &overlay, root, file_path, err_msg) < 0) + goto out; + if (overlay.has_ca_bundle && overlay.ca_bundle) { + struct stat st; + if (stat(overlay.ca_bundle, &st) < 0 || !S_ISREG(st.st_mode)) { + (void) set_err(p, err_msg, + "policy overlay '%s': ca_bundle file '%s' " + "is not accessible", + file_path, overlay.ca_bundle); + goto out; + } + } + policy_entry_t *target = entry_grow_and_get(p, host); + if (!target) { + (void) set_err(p, err_msg, + "policy overlay '%s' target entry allocation failed", + file_path); + goto out; + } + if (merge_overlay_into_entry(target, &overlay, err_msg) < 0) { + (void) set_err(p, err_msg, + "policy overlay '%s' merge failed: out of memory", + file_path); + goto out; + } + rc = 0; +out: + entry_free(&overlay); + cJSON_Delete(root); + return rc; +} + +static int overlay_name_cmp(const void *a, const void *b) +{ + return strcmp(*(const char *const *) a, *(const char *const *) b); +} + +/* Scan 
/registries.d/ for *.json overlay files and merge each into + * the policy. The directory itself is optional: opendir returning ENOENT is + * silent; any other errno (ENOTDIR, EACCES, ...) is a hard error so an + * operator pointing at an unreadable overlay tree learns about it. Each + * filename minus the .json suffix is the target host. Files are processed + * in lexicographic order for determinism; same-host duplicates cannot exist + * on POSIX filesystems so the order is mostly observable in diagnostics. + */ +static int load_overlay_dir(oci_policy_t *p, + const char *base_path, + const char **err_msg) +{ + if (!base_path || !*base_path) + return 0; + const char *last_slash = strrchr(base_path, '/'); + if (!last_slash) + return 0; + size_t parent_len = (size_t) (last_slash - base_path); + static const char overlay_suffix[] = "/registries.d"; + size_t suffix_len = sizeof(overlay_suffix) - 1; + char *dir_path = malloc(parent_len + suffix_len + 1); + if (!dir_path) + return set_err(p, err_msg, + "out of memory composing policy overlay path"); + memcpy(dir_path, base_path, parent_len); + memcpy(dir_path + parent_len, overlay_suffix, suffix_len); + dir_path[parent_len + suffix_len] = '\0'; + + DIR *d = opendir(dir_path); + if (!d) { + int e = errno; + if (e == ENOENT) { + free(dir_path); + return 0; + } + int rc = set_err(p, err_msg, + "policy overlay directory '%s' cannot be opened: %s", + dir_path, strerror(e)); + free(dir_path); + errno = e; + return rc; + } + + char **names = NULL; + size_t n_names = 0; + size_t cap_names = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + const char *n = de->d_name; + if (n[0] == '.') + continue; + size_t nl = strlen(n); + if (nl <= 5) + continue; + if (strcmp(n + nl - 5, ".json") != 0) + continue; + if (n_names == cap_names) { + size_t new_cap = cap_names ? 
cap_names * 2 : 8; + char **next = + (char **) realloc((void *) names, new_cap * sizeof(char *)); + if (!next) { + closedir(d); + strarr_free(names, n_names); + int rc = set_err(p, err_msg, + "policy overlay '%s' name-list allocation " + "failed", + dir_path); + free(dir_path); + return rc; + } + names = next; + cap_names = new_cap; + } + names[n_names] = strdup(n); + if (!names[n_names]) { + closedir(d); + strarr_free(names, n_names); + int rc = set_err(p, err_msg, "policy overlay '%s' name copy failed", + dir_path); + free(dir_path); + return rc; + } + n_names++; + } + closedir(d); + + qsort((void *) names, n_names, sizeof(char *), overlay_name_cmp); + + int rc = 0; + for (size_t i = 0; i < n_names; i++) { + const char *fname = names[i]; + size_t nl = strlen(fname); + size_t host_len = nl - 5; /* trim ".json" */ + if (host_len == 0) + continue; /* literal ".json" filename: not a host */ + char *host = malloc(host_len + 1); + if (!host) { + rc = set_err(p, err_msg, + "out of memory composing overlay host name"); + goto cleanup; + } + memcpy(host, fname, host_len); + host[host_len] = '\0'; + size_t file_path_size = strlen(dir_path) + 1 + nl + 1; + char *file_path = malloc(file_path_size); + if (!file_path) { + free(host); + rc = set_err(p, err_msg, + "out of memory composing overlay file path"); + goto cleanup; + } + snprintf(file_path, file_path_size, "%s/%s", dir_path, fname); + struct stat st; + if (stat(file_path, &st) < 0 || !S_ISREG(st.st_mode)) { + /* Filename ending in .json that turned out not to be a regular + * file (e.g. a directory named "foo.json"). Silently skip: + * defensive, leaves base policy load undisturbed. 
+ */ + free(file_path); + free(host); + continue; + } + rc = parse_overlay_file(p, host, file_path, err_msg); + free(file_path); + free(host); + if (rc < 0) + goto cleanup; + } + +cleanup: + strarr_free(names, n_names); + free(dir_path); + return rc; +} + +static int parse_body(oci_policy_t *p, + const char *body, + size_t body_len, + const char **err_msg) +{ + cJSON *root = cJSON_ParseWithLength(body, body_len); + if (!root) + return set_err(p, err_msg, "policy file '%s' is not valid JSON", + p->source_path ? p->source_path : "(stdin)"); + int rc = -1; + if (!cJSON_IsObject(root)) { + (void) set_err(p, err_msg, "policy root must be a JSON object"); + goto out; + } + cJSON *child; + cJSON_ArrayForEach(child, root) + { + const char *k = child->string; + if (!k) + continue; + if (!strcmp(k, "default")) { + if (parse_default_block(p, child, err_msg) < 0) + goto out; + } else if (!strcmp(k, "registries")) { + if (parse_registries_block(p, child, err_msg) < 0) + goto out; + } else if (!known_top_key(k)) { + if (strarr_push(&p->unknown_top_keys, &p->n_unknown_top_keys, k) < + 0) { + (void) set_err(p, err_msg, + "policy unknown-key recording failed"); + goto out; + } + } + } + rc = 0; +out: + cJSON_Delete(root); + return rc; +} + +int oci_policy_load(oci_policy_t **out, const char **err_msg) +{ + if (!out) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + *out = NULL; + + oci_policy_t *p = calloc(1, sizeof(*p)); + if (!p) { + if (err_msg) + *err_msg = "out of memory allocating policy"; + errno = ENOMEM; + return -1; + } + p->err_buf = malloc(POLICY_ERR_CAP); + if (!p->err_buf) { + free(p); + if (err_msg) + *err_msg = "out of memory allocating policy diagnostic"; + errno = ENOMEM; + return -1; + } + p->err_buf[0] = '\0'; + + /* On every failure past this point, *out keeps the partially built + * policy so the caller can free it (and the err_buf the diagnostic + * lives in) via oci_policy_free. 
The policy.h contract requires + * exactly this shape for diagnostic lifetime. + */ + *out = p; + + char *path = NULL; + if (resolve_path(p, &path, err_msg) < 0) + return -1; + + if (!path) { + /* Empty chain: built-in default. source_path stays empty. */ + p->source_path = xstrdup(""); + if (!p->source_path) + return set_err(p, err_msg, "out of memory recording source path"); + return 0; + } + + p->source_path = path; /* takes ownership */ + + size_t body_len = 0; + char *body = slurp_file(p->source_path, &body_len); + if (!body) + return set_err(p, err_msg, "policy file '%s' could not be read: %s", + p->source_path, strerror(errno)); + int rc = parse_body(p, body, body_len, err_msg); + free(body); + if (rc < 0) + return rc; + return load_overlay_dir(p, p->source_path, err_msg); +} + +void oci_policy_lookup(const oci_policy_t *p, + const char *host, + oci_policy_effective_t *eff) +{ + if (!eff) + return; + eff->insecure = p ? p->default_insecure : false; + eff->ca_bundle = p ? p->default_ca_bundle : NULL; + eff->auth_file = NULL; + eff->sigstore_public_key = NULL; + if (!p || !host) + return; + for (size_t i = 0; i < p->n_entries; i++) { + const policy_entry_t *e = &p->entries[i]; + if (strcmp(e->host, host) != 0) + continue; + if (e->has_insecure) + eff->insecure = e->insecure; + if (e->ca_bundle) + eff->ca_bundle = e->ca_bundle; + if (e->auth_file) + eff->auth_file = e->auth_file; + if (e->sigstore_public_key) + eff->sigstore_public_key = e->sigstore_public_key; + return; + } +} + +/* Static error literals. The auth-file load path has no policy_t err_buf to + * share, and the diagnostic is short enough that a fixed table beats a + * dynamic format. The caller composes the final user-facing message with + * the path it already knows. 
+ */ +static const char AUTH_ERR_OPEN[] = "auth file could not be opened"; +static const char AUTH_ERR_FSTAT[] = "auth file could not be stat'd"; +static const char AUTH_ERR_NOT_REG[] = "auth file is not a regular file"; +static const char AUTH_ERR_MODE[] = + "auth file has insecure mode (must be 0600)"; +static const char AUTH_ERR_TOO_BIG[] = "auth file is too large"; +static const char AUTH_ERR_READ[] = "auth file could not be read"; +static const char AUTH_ERR_NOMEM[] = "out of memory parsing auth file"; +static const char AUTH_ERR_BAD_JSON[] = "auth file is not valid JSON"; +static const char AUTH_ERR_NOT_OBJ[] = "auth file body must be a JSON object"; +static const char AUTH_ERR_NO_USER[] = + "auth file missing required string field 'username'"; +static const char AUTH_ERR_NO_PASS[] = + "auth file missing required string field 'password'"; + +int oci_policy_load_auth(const char *path, + char **out_user, + char **out_pass, + const char **err_msg) +{ + if (!path || !out_user || !out_pass) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + *out_user = NULL; + *out_pass = NULL; + + int fd = open(path, O_RDONLY); + if (fd < 0) { + int e = errno; + if (err_msg) + *err_msg = AUTH_ERR_OPEN; + errno = e; + return -1; + } + struct stat st; + if (fstat(fd, &st) < 0) { + int e = errno; + close(fd); + if (err_msg) + *err_msg = AUTH_ERR_FSTAT; + errno = e; + return -1; + } + if (!S_ISREG(st.st_mode)) { + close(fd); + if (err_msg) + *err_msg = AUTH_ERR_NOT_REG; + errno = EINVAL; + return -1; + } + /* Mode must grant access to the owner only. 0600 is the canonical + * shape; 0400 (read-only) is also accepted. Any bit in the group or + * other triad fails the check. 
+ */ + if ((st.st_mode & 077) != 0) { + close(fd); + if (err_msg) + *err_msg = AUTH_ERR_MODE; + errno = EPERM; + return -1; + } + if (st.st_size < 0 || (uint64_t) st.st_size >= (uint64_t) SIZE_MAX) { + close(fd); + if (err_msg) + *err_msg = AUTH_ERR_TOO_BIG; + errno = EFBIG; + return -1; + } + size_t len = (size_t) st.st_size; + char *buf = malloc(len + 1); + if (!buf) { + close(fd); + if (err_msg) + *err_msg = AUTH_ERR_NOMEM; + errno = ENOMEM; + return -1; + } + size_t off = 0; + while (off < len) { + ssize_t n = read(fd, buf + off, len - off); + if (n < 0) { + if (errno == EINTR) + continue; + int e = errno; + free(buf); + close(fd); + if (err_msg) + *err_msg = AUTH_ERR_READ; + errno = e; + return -1; + } + if (n == 0) + break; + off += (size_t) n; + } + close(fd); + buf[off] = '\0'; + + cJSON *json = cJSON_ParseWithLength(buf, off); + free(buf); + if (!json) { + if (err_msg) + *err_msg = AUTH_ERR_BAD_JSON; + errno = EINVAL; + return -1; + } + if (!cJSON_IsObject(json)) { + cJSON_Delete(json); + if (err_msg) + *err_msg = AUTH_ERR_NOT_OBJ; + errno = EINVAL; + return -1; + } + cJSON *ju = cJSON_GetObjectItemCaseSensitive(json, "username"); + if (!cJSON_IsString(ju) || !ju->valuestring) { + cJSON_Delete(json); + if (err_msg) + *err_msg = AUTH_ERR_NO_USER; + errno = EINVAL; + return -1; + } + cJSON *jp = cJSON_GetObjectItemCaseSensitive(json, "password"); + if (!cJSON_IsString(jp) || !jp->valuestring) { + cJSON_Delete(json); + if (err_msg) + *err_msg = AUTH_ERR_NO_PASS; + errno = EINVAL; + return -1; + } + *out_user = xstrdup(ju->valuestring); + *out_pass = xstrdup(jp->valuestring); + cJSON_Delete(json); + if (!*out_user || !*out_pass) { + if (err_msg) + *err_msg = AUTH_ERR_NOMEM; + errno = ENOMEM; + return -1; + } + return 0; +} diff --git a/src/oci/policy.h b/src/oci/policy.h new file mode 100644 index 0000000..bbb45d3 --- /dev/null +++ b/src/oci/policy.h @@ -0,0 +1,140 @@ +/* OCI policy.json schema and loader + * + * Copyright 2026 elfuse contributors + * 
SPDX-License-Identifier: Apache-2.0 + * + * Plan 6 C6.1: parse a podman/skopeo-style policy.json out of one of the + * standard config locations and expose a per-host effective view that + * fetch.c (in C6.2) consults before applying CLI overrides. + * + * Load order: + * 1. $ELFUSE_POLICY_FILE (when set and non-empty) + * 2. $XDG_CONFIG_HOME/elfuse/policy.json (fallback: + * $HOME/.config/elfuse/policy.json) + * 3. $HOME/Library/Application Support/elfuse/policy.json + * 4. Built-in default (insecure=false, ca_bundle=NULL, no per-host entries) + * + * An $ELFUSE_POLICY_FILE that points at a missing file is a hard error so + * an operator that explicitly named a path always learns about typos. + * Missing fallback files silently fall through to the next candidate; + * a fully empty chain yields the built-in default with source_path == "". + * + * Supported schema subset (additional keys at any level are recorded for + * forward-compat diagnostics but never reject the load): + * + * { + * "default": { "insecure": bool, "ca_bundle": string|null }, + * "registries": { + * "": { + * "insecure": bool, + * "ca_bundle": string|null, + * "auth_file": string, + * "sigstore": { "publicKey": string } // C6.3 reservation; ignored + * }, + * ... + * } + * } + * + * String fields starting with "~/" or equal to "~" expand against $HOME at + * load time. Any other path passes through verbatim. ca_bundle is stat'd + * during load and a missing target is a hard error; auth_file is not + * accessed by the loader (the fetcher reads and mode-checks it in C6.2). + * + * Thread safety: oci_policy_t is read-only after load. Multiple threads may + * call oci_policy_lookup concurrently. The loader itself is not reentrant; + * one fetcher loads its own copy and frees on destruction. + */ + +#pragma once + +#include +#include + +typedef struct oci_policy oci_policy_t; + +/* Effective per-host view. 
Strings are owned by the parent oci_policy_t; + * callers must not free them or use them past oci_policy_free. A NULL + * string field means the policy did not declare a value at either the + * per-host entry or the default block, so the caller should fall back to + * whatever default it would otherwise use. + */ +typedef struct { + bool insecure; + const char *ca_bundle; + const char *auth_file; + /* C6.3 reservation: a registries[""].sigstore.publicKey field + * parses into this slot for forward-compat introspection. fetch.c in + * Plan 6 never reads it; a future sigstore-verify hook lights up + * after Phase 4+. + */ + const char *sigstore_public_key; +} oci_policy_effective_t; + +/* Load the policy, walking the candidate path chain documented above. + * On success returns 0 and stores a heap-allocated oci_policy_t in *out + * which the caller frees via oci_policy_free. On failure returns -1 with + * errno set; *err_msg (when non-NULL) points at a description owned by + * the partially constructed policy (released by oci_policy_free even on + * failure when *out is non-NULL) or a static literal when allocation + * failed before the struct existed. Pass NULL for err_msg to skip the + * diagnostic. + * + * The loader is tolerant of unknown JSON keys at every level: they are + * accepted and recorded so future schema extensions (sigstore beyond the + * minimal subset, registries.d overlays in C6.3, mirror chains, ...) do + * not require a coordinated reader rollout. + */ +int oci_policy_load(oci_policy_t **out, const char **err_msg); + +/* Release a policy. Safe on NULL and on the partially constructed object + * a failed oci_policy_load may have produced. + */ +void oci_policy_free(oci_policy_t *p); + +/* Fill *eff with the merged view for host. Unknown host falls back to the + * default block. host is matched exactly (case-sensitive); the OCI ref + * parser already lowercases registry hostnames, so this is the same key + * shape policy.json uses. 
+ * + * eff is always fully populated: fields the policy did not declare come + * out as NULL strings or as the default-block values (for the insecure + * flag). A NULL p or NULL host treats the call as a request for the + * zero-value default. + */ +void oci_policy_lookup(const oci_policy_t *p, + const char *host, + oci_policy_effective_t *eff); + +/* Return the absolute filesystem path of the policy file that produced + * this object, or "" when the built-in default fired. The pointer is + * owned by the policy and stays valid until oci_policy_free. NULL p + * returns "". + */ +const char *oci_policy_source(const oci_policy_t *p); + +/* Read a podman/skopeo-style auth file from path. The body must be JSON of + * the shape: + * + * { "username": "<user>", "password": "<pass>" } + * + * Both fields are required; neither may be NULL. A missing field, a + * malformed JSON body, a non-regular file, or a mode that grants group or + * other access is a hard error (the file must satisfy (st_mode & 077) == 0, + * matching the credential-handling discipline ssh and curl both use). + * + * On success returns 0 and writes heap-owned strings into *out_user and + * *out_pass which the caller frees. On failure returns -1 with errno set + * (ENOENT, EACCES, EPERM for mode, EINVAL for missing fields / malformed + * JSON) and *err_msg (when non-NULL) pointing at a static description + * suitable for direct caller-side use. *out_user and *out_pass may be + * partially populated on failure (one strdup succeeded, another failed); the + * caller must free both unconditionally, including on rc != 0. NULL path, + * NULL out_user, or NULL out_pass is EINVAL. + * + * The diagnostic does not include the path; the caller already knows it and + * is free to compose its own message ("auth file %s: %s", path, err). 
+ */ +int oci_policy_load_auth(const char *path, + char **out_user, + char **out_pass, + const char **err_msg); diff --git a/src/oci/pull.c b/src/oci/pull.c new file mode 100644 index 0000000..50d2be0 --- /dev/null +++ b/src/oci/pull.c @@ -0,0 +1,785 @@ +/* elfuse oci pull pipeline + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The pull function is intentionally linear: every state transition (top-level + * fetch, index recurse, config fetch, layer fetch, pin write) flows top-to- + * bottom in oci_pull below. Helpers exist only to remove pure boilerplate + * (response cleanup, hex equality, progress prints), so that a reader of + * oci_pull can follow the registry round trips without chasing through + * indirection. + */ + +#include "pull.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blob-store.h" +#include "digest.h" +#include "manifest.h" +#include "media-type.h" + +static const char *const PULL_ACCEPT[] = { + "application/vnd.oci.image.index.v1+json", + "application/vnd.docker.distribution.manifest.list.v2+json", + "application/vnd.oci.image.manifest.v1+json", + "application/vnd.docker.distribution.manifest.v2+json", + NULL, +}; + +static FILE *pick_progress(const oci_pull_options_t *opts) +{ + if (!opts) + return stderr; + if (opts->quiet) + return NULL; + return opts->progress ? opts->progress : stderr; +} + +static void progress_line(FILE *fp, + const char *kind, + const char *digest_str, + int64_t size, + const char *state, + const char *media_type) +{ + if (!fp) + return; + /* Truncated digest keeps the line readable; full hex still goes into the + * pin file and the blob store for verification. + */ + char short_digest[24]; + snprintf(short_digest, sizeof(short_digest), "%.19s...", digest_str); + fprintf(fp, " %-9s %-22s %12lldB %-11s %s\n", kind, short_digest, + (long long) size, state ? state : "", media_type ? 
media_type : ""); + fflush(fp); +} + +/* In-progress per-blob slot for the batch fetcher's xferinfo callback to + * update. Kept file-local because pull.c is the only caller. A second + * subcommand wiring the same progress callback would be the trigger to + * lift this and pull_progress_t into src/oci/progress.{c,h}. + */ +typedef struct { + const oci_descriptor_t *desc; + const char *kind; /* "config" or "layer" */ + const char *media_type; + int64_t bytes_dl; + int64_t bytes_total; + bool done_emitted; /* non-TTY mode: per-blob one-shot guard */ +} pull_progress_slot_t; + +typedef struct { + FILE *fp; + bool is_tty; + bool started; /* TTY: placeholder lines already printed */ + pull_progress_slot_t *slots; + size_t n_slots; +} pull_progress_t; + +static void pull_progress_print_inplace_line(FILE *fp, + const pull_progress_slot_t *slot) +{ + char short_digest[24]; + snprintf(short_digest, sizeof(short_digest), "%.19s...", + slot->desc->digest_str); + int percent = 0; + if (slot->bytes_total > 0) { + int64_t p = (slot->bytes_dl * 100) / slot->bytes_total; + if (p < 0) + p = 0; + if (p > 100) + p = 100; + percent = (int) p; + } + const char *state = + slot->bytes_total > 0 && slot->bytes_dl >= slot->bytes_total + ? "downloaded" + : "pulling"; + /* CSI 2K clears the entire line under the cursor; the trailing newline + * advances to the next slot row. + */ + fprintf(fp, "\033[2K %-9s %-22s %8lld/%lldB %3d%% %-11s %s\n", slot->kind, + short_digest, (long long) slot->bytes_dl, + (long long) slot->bytes_total, percent, state, + slot->media_type ? slot->media_type : ""); +} + +/* TTY render path: cursor-up to the top of the redraw zone, reprint every + * slot in place. The redraw zone is exactly n_slots rows tall and the + * cursor ends up one row below the last slot, matching the post-init + * position. 
+ */ +static void pull_progress_tty_redraw(pull_progress_t *pp) +{ + if (!pp->fp || pp->n_slots == 0) + return; + /* CSI nF moves the cursor up n lines to column 0; "1F" is a single row. + * n_slots is at most a few dozen so the integer width fits trivially. + */ + fprintf(pp->fp, "\033[%zuF", pp->n_slots); + for (size_t i = 0; i < pp->n_slots; i++) + pull_progress_print_inplace_line(pp->fp, &pp->slots[i]); + fflush(pp->fp); +} + +/* Walk descs[] and split it into already-cached (printed immediately, no + * slot) and to-be-downloaded (one slot each). Cached lines preserve the + * pre-C5.3 byte-identical wording so existing log-parsing pipelines do + * not regress. In TTY mode, after the cached lines, n_slots placeholder + * lines are printed and the cursor lands one row below the zone so the + * xferinfo redraw loop can repeatedly hop back to the zone top. + */ +static int pull_progress_init(pull_progress_t *pp, + FILE *fp, + const oci_descriptor_t *config, + bool config_cached, + const oci_descriptor_t *layers, + size_t n_layers, + const bool *layer_cached) +{ + memset(pp, 0, sizeof(*pp)); + pp->fp = fp; + pp->is_tty = fp != NULL && isatty(fileno(fp)); + /* ELFUSE_OCI_PROGRESS=plain (or =lines, =off) forces the + * line-per-completion path even on a real TTY. Some terminal panes + * (notably embedded ones that emulate a pty without honoring CSI + * cursor-up) leave the in-place redraw stacking copies down the + * screen instead of rewriting the active rows; the env override + * gives the operator a stable opt-out without touching code. + */ + if (pp->is_tty) { + const char *override = getenv("ELFUSE_OCI_PROGRESS"); + if (override && + (!strcmp(override, "plain") || !strcmp(override, "lines") || + !strcmp(override, "off"))) + pp->is_tty = false; + } + + size_t cap = 1 + n_layers; + pp->slots = calloc(cap, sizeof(*pp->slots)); + if (!pp->slots) + return -1; + + /* Emit cached lines immediately and reserve a slot for everything else. 
+ * The slot's bytes_total is desc->size; bytes_dl starts at zero so the + * TTY placeholder shows 0/B 0%. + */ + if (config_cached) { + progress_line(fp, "config", config->digest_str, config->size, "cached", + oci_media_type_name(config->media_type)); + } else { + pp->slots[pp->n_slots++] = (pull_progress_slot_t) { + .desc = config, + .kind = "config", + .media_type = oci_media_type_name(config->media_type), + .bytes_dl = 0, + .bytes_total = config->size, + }; + } + for (size_t i = 0; i < n_layers; i++) { + const oci_descriptor_t *L = &layers[i]; + if (layer_cached[i]) { + progress_line(fp, "layer", L->digest_str, L->size, "cached", + oci_media_type_name(L->media_type)); + } else { + pp->slots[pp->n_slots++] = (pull_progress_slot_t) { + .desc = L, + .kind = "layer", + .media_type = oci_media_type_name(L->media_type), + .bytes_dl = 0, + .bytes_total = L->size, + }; + } + } + + /* TTY: print n_slots placeholder lines; the cursor lands on the row + * immediately below the zone. Non-TTY: defer per-blob output until + * the bytes_dl == bytes_total event in the callback. + */ + if (pp->is_tty && pp->n_slots > 0) { + for (size_t i = 0; i < pp->n_slots; i++) + pull_progress_print_inplace_line(fp, &pp->slots[i]); + fflush(fp); + pp->started = true; + } + return 0; +} + +static pull_progress_slot_t *pull_progress_find(pull_progress_t *pp, + const oci_descriptor_t *desc) +{ + for (size_t i = 0; i < pp->n_slots; i++) { + if (pp->slots[i].desc == desc) + return &pp->slots[i]; + } + return NULL; +} + +/* Callback handed to oci_fetch_blob_batch. Runs on the fetcher's thread + * (single-threaded curl_multi event loop) so the renderer's pp state + * needs no locking. Returning 0 lets the transfer continue; the C5.3 + * renderer never aborts (a future cancellable pull would return non-zero + * here). 
+ */ +static int pull_progress_cb(const oci_descriptor_t *desc, + int64_t bytes_dl, + int64_t bytes_total, + void *user) +{ + pull_progress_t *pp = user; + if (!pp) + return 0; + pull_progress_slot_t *slot = pull_progress_find(pp, desc); + if (!slot) + return 0; + slot->bytes_dl = bytes_dl; + slot->bytes_total = bytes_total > 0 ? bytes_total : slot->bytes_total; + if (pp->is_tty) { + pull_progress_tty_redraw(pp); + } else if (!slot->done_emitted && slot->bytes_total > 0 && + bytes_dl >= slot->bytes_total) { + /* One line per blob on completion (the log shape scripts grep + * against); the seeded size covers an unknown (<= 0) total. + */ + progress_line(pp->fp, slot->kind, slot->desc->digest_str, + slot->bytes_total, "downloaded", slot->media_type); + slot->done_emitted = true; + } + return 0; +} + +static void pull_progress_dispose(pull_progress_t *pp) +{ + free(pp->slots); + pp->slots = NULL; + pp->n_slots = 0; +} + +/* Case-insensitive whole-string digest compare; NULL never matches. */ +static bool digest_str_matches(const char *want, const char *got) +{ + if (!want || !got) + return false; + return strcasecmp(want, got) == 0; +} + +/* Cross-check the manifest body against the registry-supplied + * Docker-Content-Digest header. Servers usually emit one; when they do not, + * trust the body's local SHA-256. The local hex is what we use to address the + * blob in the store regardless, so a missing header degrades to local-only + * verification but not to silent corruption. 
+ */ +static int verify_manifest_digest(const oci_fetch_response_t *resp, + const char *expected_digest_str, + char *out_digest_str, + size_t out_cap, + const char **err_msg) +{ + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (oci_digest_bytes(OCI_DIGEST_SHA256, resp->body, resp->body_len, hex) == + 0) { + if (err_msg) + *err_msg = "failed to hash manifest body"; + errno = EIO; + return -1; + } + int n = snprintf(out_digest_str, out_cap, "sha256:%s", hex); + if (n < 0 || (size_t) n >= out_cap) { + if (err_msg) + *err_msg = "manifest digest buffer too small"; + errno = ENAMETOOLONG; + return -1; + } + if (resp->docker_content_digest && + !digest_str_matches(resp->docker_content_digest, out_digest_str)) { + if (err_msg) + *err_msg = + "manifest body digest does not match " + "Docker-Content-Digest header"; + errno = EPROTO; + return -1; + } + if (expected_digest_str && + !digest_str_matches(expected_digest_str, out_digest_str)) { + if (err_msg) + *err_msg = "manifest body digest does not match expected digest"; + errno = EPROTO; + return -1; + } + return 0; +} + +/* Fetch a manifest document (image index, image manifest, or sub-manifest) by + * selector, hash its body, cross-check against expected_digest_str (when + * non-NULL), and write it into the local blob store. Returns 0 on success and + * fills out_digest_str with the canonical "sha256:<hex>" form. The caller + * frees *out_resp via oci_fetch_response_free. + * + * if_none_match is forwarded as the conditional GET header; when set and the + * registry responds 304 Not Modified the helper returns 0, writes nothing to + * the store, leaves out_digest_str untouched, and sets *out_unchanged (when + * non-NULL). The caller seeds the digest string from the pin before calling. 
+ */ +static int fetch_and_persist_manifest(oci_fetcher_t *f, + oci_store_t *store, + const oci_ref_t *ref, + const char *selector, + const char *expected_digest_str, + const char *if_none_match, + oci_fetch_response_t *out_resp, + char *out_digest_str, + size_t out_cap, + bool *out_unchanged, + const char **err_msg) +{ + if (out_unchanged) + *out_unchanged = false; + memset(out_resp, 0, sizeof(*out_resp)); + if (oci_fetch_manifest(f, ref, selector, PULL_ACCEPT, if_none_match, + out_resp, err_msg) < 0) { + return -1; + } + /* An unsolicited 304 falls through to the empty-body EPROTO below. */ + if (out_resp->http_status == 304 && if_none_match && out_unchanged) { + *out_unchanged = true; + return 0; + } + if (out_resp->body_len == 0 || !out_resp->body) { + if (err_msg) + *err_msg = "manifest response had an empty body"; + errno = EPROTO; + return -1; + } + if (verify_manifest_digest(out_resp, expected_digest_str, out_digest_str, + out_cap, err_msg) < 0) { + return -1; + } + char hex[OCI_DIGEST_HEX_MAX + 1]; + oci_digest_algo_t algo; + if (!oci_digest_parse(out_digest_str, &algo, hex)) { + if (err_msg) + *err_msg = "computed manifest digest is malformed"; + errno = EINVAL; + return -1; + } + if (oci_blob_store_put_bytes(oci_store_blobs(store), OCI_DIGEST_SHA256, hex, + out_resp->body, out_resp->body_len) < 0) { + if (err_msg) + *err_msg = "failed to persist manifest body to local store"; + return -1; + } + return 0; +} + +/* Load a manifest blob already present in the local store into a heap buffer. + * Used by the refresh path after the registry confirms an unchanged digest: + * the manifest body must still be parsed (to drive the layer-cache sweep) + * but no network round trip is needed because the bytes are already on disk. + * Returns 0 on success with *out_buf newly malloc'd (caller frees) and + * *out_len set; -1 on IO failure with errno preserved. 
+ */ +static int load_manifest_blob(oci_blob_store_t *blobs, + oci_digest_algo_t algo, + const char *hex, + char **out_buf, + size_t *out_len, + const char **err_msg) +{ + char path[1024]; + int n = oci_blob_store_path(blobs, algo, hex, path, sizeof(path)); + if (n < 0 || (size_t) n >= sizeof(path)) { + if (err_msg) + *err_msg = "manifest blob path overflow"; + errno = ENAMETOOLONG; + return -1; + } + int fd = open(path, O_RDONLY); + if (fd < 0) { + if (err_msg) + *err_msg = "failed to open cached manifest blob"; + return -1; + } + struct stat st; + if (fstat(fd, &st) < 0) { + int e = errno; + close(fd); + if (err_msg) + *err_msg = "failed to stat cached manifest blob"; + errno = e; + return -1; + } + if (st.st_size <= 0 || (uintmax_t) st.st_size > (uintmax_t) SIZE_MAX - 1) { + close(fd); + if (err_msg) + *err_msg = "cached manifest blob has an unreasonable size"; + errno = EFBIG; + return -1; + } + size_t want = (size_t) st.st_size; + char *buf = malloc(want + 1); + if (!buf) { + close(fd); + if (err_msg) + *err_msg = "out of memory loading cached manifest"; + errno = ENOMEM; + return -1; + } + size_t got = 0; + while (got < want) { + ssize_t r = read(fd, buf + got, want - got); + if (r < 0) { + int e = errno; + free(buf); + close(fd); + if (err_msg) + *err_msg = "read failed on cached manifest blob"; + errno = e; + return -1; + } + if (r == 0) + break; + got += (size_t) r; + } + close(fd); + if (got != want) { + free(buf); + if (err_msg) + *err_msg = "cached manifest blob truncated mid-read"; + errno = EIO; + return -1; + } + buf[want] = '\0'; + *out_buf = buf; + *out_len = want; + return 0; +} + +static int parse_top_level(const oci_fetch_response_t *resp, + oci_media_type_t *out_mt, + const char **err_msg) +{ + oci_media_type_t mt = oci_media_type_parse(resp->content_type); + if (mt == OCI_MT_UNKNOWN) { + if (err_msg) + *err_msg = "registry returned an unrecognized Content-Type"; + errno = EPROTO; + return -1; + } + if (!oci_media_type_is_index(mt) && 
!oci_media_type_is_manifest(mt)) { + if (err_msg) + *err_msg = "registry returned a non-manifest Content-Type"; + errno = EPROTO; + return -1; + } + *out_mt = mt; + return 0; +} + +int oci_pull(oci_fetcher_t *fetcher, + oci_store_t *store, + const oci_ref_t *ref, + const oci_pull_options_t *opts, + const char **err_msg) +{ + if (!fetcher || !store || !ref) { + if (err_msg) + *err_msg = "invalid arguments"; + errno = EINVAL; + return -1; + } + + FILE *progress = pick_progress(opts); + int rc = -1; + oci_fetch_response_t top_resp = {0}; + oci_fetch_response_t sub_resp = {0}; + oci_index_t idx_doc = {0}; + oci_manifest_t manifest = {0}; + bool top_unchanged = false; + char top_digest_str[OCI_DIGEST_HEX_MAX + 16]; + char sub_digest_str[OCI_DIGEST_HEX_MAX + 16]; + top_digest_str[0] = '\0'; + sub_digest_str[0] = '\0'; + char *cached_top_body = NULL; + size_t cached_top_body_len = 0; + char *pin_digest_for_refresh = NULL; + char if_none_match_buf[OCI_DIGEST_HEX_MAX + 32]; + const char *if_none_match = NULL; + + /* 0. Refresh prologue. Only fires when --refresh is set, the ref carries + * a tag (digest-only refs are content-addressed and cannot drift), the + * pin exists, and the pinned manifest blob is still on disk. Otherwise + * the call falls through to the normal pull path. + */ + if (opts && opts->refresh && ref->tag) { + const char *pin_err = NULL; + if (oci_store_get_ref(store, ref, &pin_digest_for_refresh, &pin_err) == + 0 && + pin_digest_for_refresh) { + oci_digest_algo_t pin_algo; + char pin_hex[OCI_DIGEST_HEX_MAX + 1]; + if (oci_digest_parse(pin_digest_for_refresh, &pin_algo, pin_hex) && + oci_blob_store_has(oci_store_blobs(store), pin_algo, pin_hex)) { + snprintf(if_none_match_buf, sizeof(if_none_match_buf), "\"%s\"", + pin_digest_for_refresh); + if_none_match = if_none_match_buf; + /* Seed top_digest_str so the 304 path can echo the pin into + * progress and the layer-cache sweep without re-deriving it + * from a body that the registry just omitted. 
+ */ + snprintf(top_digest_str, sizeof(top_digest_str), "%s", + pin_digest_for_refresh); + } + } + } + + /* 1. Top-level fetch. Selector defaults to ref->digest, falling through + * to ref->tag, inside oci_fetch_manifest. When the user pulled by digest, + * expected_digest_str is the locked target; pulls by tag accept whatever + * the server resolves the tag to. if_none_match is set only by the + * refresh prologue above; a 304 response keeps the pin and re-uses the + * cached manifest body from the local store. + */ + if (fetch_and_persist_manifest(fetcher, store, ref, NULL, ref->digest, + if_none_match, &top_resp, top_digest_str, + sizeof(top_digest_str), &top_unchanged, + err_msg) < 0) { + goto out; + } + + const char *manifest_body = NULL; + size_t manifest_body_len = 0; + oci_media_type_t top_mt = OCI_MT_UNKNOWN; + const char *pin_digest_str = top_digest_str; + + if (top_unchanged) { + /* Registry confirmed the pinned digest still matches. Load the + * persisted manifest blob and run the rest of the pipeline against + * it so the layer-cache sweep can re-fetch any blob the user has + * pruned since the last pull. + */ + oci_digest_algo_t cached_algo; + char cached_hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(top_digest_str, &cached_algo, cached_hex)) { + if (err_msg) + *err_msg = "pinned manifest digest is malformed"; + errno = EINVAL; + goto out; + } + if (load_manifest_blob(oci_store_blobs(store), cached_algo, cached_hex, + &cached_top_body, &cached_top_body_len, + err_msg) < 0) { + goto out; + } + /* The persisted manifest blob has no Content-Type header. Try the + * image-index media type first, fall back to image-manifest. The + * parse step below is the actual gate; this only steers the index + * drill decision. 
+ */ + oci_index_t probe = {0}; + if (oci_index_parse(cached_top_body, cached_top_body_len, &probe, + NULL) == 0) { + top_mt = OCI_MT_INDEX_OCI; + oci_index_free(&probe); + } else { + top_mt = OCI_MT_MANIFEST_OCI; + } + manifest_body = cached_top_body; + manifest_body_len = cached_top_body_len; + progress_line(progress, "manifest", top_digest_str, + (int64_t) cached_top_body_len, "unchanged", + oci_media_type_name(top_mt)); + } else { + if (parse_top_level(&top_resp, &top_mt, err_msg) < 0) + goto out; + progress_line(progress, "manifest", top_digest_str, + (int64_t) top_resp.body_len, "downloaded", + oci_media_type_name(top_mt)); + manifest_body = top_resp.body; + manifest_body_len = top_resp.body_len; + } + + /* 2. If top-level was an image index, pick linux/arm64 and refetch. */ + if (oci_media_type_is_index(top_mt)) { + if (oci_index_parse(manifest_body, manifest_body_len, &idx_doc, + err_msg) < 0) { + goto out; + } + const oci_index_entry_t *entry = oci_index_pick_linux_arm64(&idx_doc); + if (!entry) { + if (err_msg) + *err_msg = "image index has no linux/arm64 entry"; + errno = ENOENT; + goto out; + } + if (progress) { + fprintf( + progress, " picked %-22s %12lldB linux/arm64%s%s\n", + entry->desc.digest_str, (long long) entry->desc.size, + entry->platform.variant && *entry->platform.variant ? " " : "", + entry->platform.variant ? entry->platform.variant : ""); + fflush(progress); + } + + /* Sub-manifest fetch never carries If-None-Match: the index drill + * targets a specific digest, so a conditional GET there has no + * semantic anchor (the local blob, if cached, is already the answer + * by content-address). When the sub-manifest blob is already in the + * store oci_fetch_manifest would still re-GET; the linear shape + * leaves that as future work because manifests are small. 
+ */ + if (fetch_and_persist_manifest( + fetcher, store, ref, entry->desc.digest_str, + entry->desc.digest_str, NULL, &sub_resp, sub_digest_str, + sizeof(sub_digest_str), NULL, err_msg) < 0) { + goto out; + } + oci_media_type_t sub_mt = OCI_MT_UNKNOWN; + if (parse_top_level(&sub_resp, &sub_mt, err_msg) < 0) + goto out; + if (!oci_media_type_is_manifest(sub_mt)) { + if (err_msg) + *err_msg = "index entry resolved to a non-manifest document"; + errno = EPROTO; + goto out; + } + progress_line(progress, "manifest", sub_digest_str, + (int64_t) sub_resp.body_len, "downloaded", + oci_media_type_name(sub_mt)); + + manifest_body = sub_resp.body; + manifest_body_len = sub_resp.body_len; + /* pin_digest_str stays as top_digest_str: the user pulled the tag, + * the registry resolved that tag to the index, so the pin records the + * index digest. Future inspect re-walks index -> manifest. + */ + } + + /* 3. Parse the manifest body. */ + if (oci_manifest_parse(manifest_body, manifest_body_len, &manifest, + err_msg) < 0) { + goto out; + } + + /* 4+5. Fetch config + every layer blob in parallel via the batch fetcher + * (oci-improvements-plan Plan 5 C5.1). The progress lines below are + * still per-blob and still preserve the cached/downloaded annotation, + * so the store-has lookup is captured before the batch call hides the + * transfer / cache decision behind the multi event loop. The batch is + * atomic: any blob fail aborts every writer and the function bails. + */ + { + bool batch_ok = false; + size_t batch_n = 1 + manifest.nlayers; + const oci_descriptor_t **batch_descs = + calloc(batch_n, sizeof(*batch_descs)); + bool config_cached = false; + bool *layer_cached = + manifest.nlayers > 0 + ? 
calloc(manifest.nlayers, sizeof(*layer_cached)) + : NULL; + if (!batch_descs || (manifest.nlayers > 0 && !layer_cached)) { + free(batch_descs); + free(layer_cached); + if (err_msg) + *err_msg = "out of memory composing blob batch"; + errno = ENOMEM; + goto out; + } + batch_descs[0] = &manifest.config; + config_cached = oci_blob_store_has( + oci_store_blobs(store), manifest.config.algo, manifest.config.hex); + for (size_t i = 0; i < manifest.nlayers; i++) { + batch_descs[1 + i] = &manifest.layers[i]; + layer_cached[i] = oci_blob_store_has(oci_store_blobs(store), + manifest.layers[i].algo, + manifest.layers[i].hex); + } + + /* Set up the per-blob progress renderer before the batch call so + * the cached lines land in the same paragraph the C5.1 code path + * used to produce post-hoc. The downloaded blobs get rendered in + * place during the transfer (TTY) or one line each on completion + * (non-TTY) via pull_progress_cb. A NULL progress fp (--quiet) + * still produces zero output because pp.fp is NULL and the + * formatter functions short-circuit on that. + */ + pull_progress_t pp; + if (pull_progress_init(&pp, progress, &manifest.config, config_cached, + manifest.layers, manifest.nlayers, + layer_cached) < 0) { + free(batch_descs); + free(layer_cached); + if (err_msg) + *err_msg = "out of memory initialising progress"; + errno = ENOMEM; + goto out; + } + + if (oci_fetch_blob_batch(fetcher, ref, batch_descs, batch_n, + oci_store_blobs(store), pull_progress_cb, &pp, + err_msg) == 0) { + batch_ok = true; + } + + pull_progress_dispose(&pp); + free(batch_descs); + free(layer_cached); + if (!batch_ok) + goto out; + } + + /* 6. Pin tag -> top-level digest. Digest-only refs are self-pinning and + * skip this step (oci_store_put_ref refuses them). On 304 the pin is + * already at the right digest, so the put_ref call is skipped to avoid + * an unnecessary tmp + rename round trip. 
+ */ + if (ref->tag) { + if (!top_unchanged) { + if (oci_store_put_ref(store, ref, pin_digest_str, err_msg) < 0) + goto out; + } + if (progress) { + fprintf(progress, " pin %s:%s -> %s%s\n", ref->repository, + ref->tag, pin_digest_str, + top_unchanged ? " (unchanged)" : ""); + fflush(progress); + } + } + + rc = 0; + +out: + /* Preserve the caller-visible errno across cleanup. free / fclose can + * stomp on errno even when they succeed, which would defeat callers that + * key tests off specific values (EPROTO / ENOENT / EINVAL). + */ + { + int saved_errno = errno; + oci_manifest_free(&manifest); + oci_index_free(&idx_doc); + /* sub_resp is zero-initialised and oci_fetch_response_free is + * null-safe, so free it unconditionally. fetch_and_persist_manifest + * can populate the response body and then fail a later validation + * step (digest mismatch, persist error) without freeing it, so a + * guarded free would leak on those error paths. + */ + oci_fetch_response_free(&sub_resp); + oci_fetch_response_free(&top_resp); + free(cached_top_body); + free(pin_digest_for_refresh); + if (rc != 0) + errno = saved_errno; + } + return rc; +} diff --git a/src/oci/pull.h b/src/oci/pull.h new file mode 100644 index 0000000..ef00c55 --- /dev/null +++ b/src/oci/pull.h @@ -0,0 +1,75 @@ +/* elfuse oci pull pipeline + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Glues the slice 4a/4b fetcher and the slice 3 manifest parser to the + * slice 5a local store. One call to oci_pull resolves an image reference into + * a fully populated blob graph on disk: + * + * 1. Fetch the top-level descriptor by ref->digest or ref->tag. + * 2. Cross-check the response Docker-Content-Digest against a local SHA-256 + * of the body; a mismatch is a hostile-registry signal and aborts. + * 3. If the response is an image index, parse it, pick the linux/arm64 + * sub-manifest (oci-roadmap Q3), and re-fetch by that digest. + * 4. 
Parse the manifest, fetch the config blob, fetch each layer blob. + * 5. Write the tag-to-manifest-digest pin so the next pull or inspect for + * the same tag is reproducible. + * + * The function is best-effort idempotent: a re-pull of the same reference + * short-circuits all already-present blobs through the slice 4a oci_fetch_blob + * cache check; only the top-level manifest and, for an index, the picked + * sub-manifest are re-fetched (small; a future slice can add a cache). + * + * Foreign / nondistributable layers and schema v1 manifests are rejected by + * the parsers in slice 3; oci_pull surfaces the diagnostic and aborts before + * any partial layer hits the store. + */ + +#pragma once + +#include + +#include "fetch.h" +#include "ref.h" +#include "store.h" + +typedef struct { + /* Per-blob progress is written here as one line per descriptor. NULL + * (or a NULL opts) selects stderr; it does not silence output. Set + * quiet to suppress all progress output. + */ + FILE *progress; + /* When true, suppress progress output regardless of progress (a NULL + * progress would otherwise land on stderr). Used by elfuse oci + * pull -q. + */ + bool quiet; + /* Opt-in tag revalidation. When the pinned manifest digest and its blob + * are both already in the store, the top-level manifest GET carries + * If-None-Match: "<pinned-digest>"; on 304 Not Modified the pull reuses + * the cached manifest body, re-fetches only config / layer blobs that + * are missing from the store, and leaves the pin in place. Without this + * flag the default pull re-runs every step (never trusts the pin), which + * keeps stale-tag detection responsive but pays the network cost. + * + * The flag is a no-op for digest-only refs (no tag to revalidate + * against), and silently falls through to a normal pull when no pin + * exists yet or the pinned manifest blob has been pruned from the + * store. Servers may ignore If-None-Match and respond 200 with a new + * digest; the pull then runs the full pipeline against the new + * manifest. 
The previous manifest blob stays in the store until prune
+     * collects it.
+     */
+    bool refresh;
+} oci_pull_options_t;
+
+/* Run the pull pipeline. Returns 0 on success, -1 on failure with errno
+ * preserved and *err_msg (when non-NULL) pointing at a static description.
+ * The store and fetcher must outlive the call; both are reused across phases.
+ */
+int oci_pull(oci_fetcher_t *fetcher,
+             oci_store_t *store,
+             const oci_ref_t *ref,
+             const oci_pull_options_t *opts,
+             const char **err_msg);
diff --git a/src/oci/rebuild-cache.c b/src/oci/rebuild-cache.c
new file mode 100644
index 0000000..573f0b2
--- /dev/null
+++ b/src/oci/rebuild-cache.c
@@ -0,0 +1,313 @@
+/* OCI stack cache back-fill driver.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * See rebuild-cache.h for the externally visible contract. Implementation
+ * sketch: oci_volume_list_unpacked yields every /images/
+ * sha256-/ directory; for each one this module reads the origin
+ * sidecar, recomputes the terminating ChainID from the recorded diff_id
+ * list, probes the stack cache for an existing entry, and (when
+ * opts->commit is set) clonefiles the tree into a staged stack snapshot,
+ * strips the .elfuse-origin.json that the unpacker wrote AFTER the fresh-
+ * unpack snapshot was taken, and atomically promotes the staged tree via
+ * oci_store_stack_commit. Failures inside the per-tree loop are aggregated
+ * into stats counters and reported on stderr; the loop never aborts so a
+ * single corrupt tree cannot block the rest of the back-fill.
+ */
+
+#include "rebuild-cache.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "digest.h"
+#include "origin-meta.h"
+#include "volume.h"
+
+#define RC_PATH_MAX 4096
+
+/* Store msg in *err (when err is non-NULL), set errno, and return -1. */
+static int set_err(const char **err, const char *msg, int err_no)
+{
+    if (err)
+        *err = msg;
+    errno = err_no;
+    return -1;
+}
+
+/* Recursive rmdir / unlink for clean-up after a failed snapshot stage.
+ * Duplicates the rm_recursive in src/oci/unpack.c because pulling unpack.c
+ * into a small back-fill module would drag the full layer-apply / decompress
+ * graph along for one helper. Lift to a shared util if a third copy
+ * appears.
+ *
+ * Returns 0 on success (a missing path counts as success), -1 on failure
+ * with errno describing the first operation that failed.
+ */
+static int rm_recursive(const char *path)
+{
+    struct stat st;
+    if (lstat(path, &st) < 0)
+        return errno == ENOENT ? 0 : -1;
+    if (!S_ISDIR(st.st_mode))
+        return unlink(path);
+    DIR *d = opendir(path);
+    if (!d)
+        return -1;
+    int rc = 0;
+    for (;;) {
+        /* readdir returns NULL for end-of-directory and for errors alike;
+         * only a cleared errno tells the two apart.
+         */
+        errno = 0;
+        struct dirent *de = readdir(d);
+        if (!de) {
+            if (errno != 0)
+                rc = -1;
+            break;
+        }
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        char child[RC_PATH_MAX];
+        int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name);
+        if (n < 0 || (size_t) n >= sizeof(child)) {
+            errno = ENAMETOOLONG;
+            rc = -1;
+            break;
+        }
+        if (rm_recursive(child) < 0) {
+            rc = -1;
+            break;
+        }
+    }
+    /* closedir may overwrite errno; keep the code of the first failure. */
+    int saved = errno;
+    closedir(d);
+    if (rc < 0) {
+        errno = saved;
+        return -1;
+    }
+    return rmdir(path);
+}
+
+/* Count the entries of a NULL-terminated diff_id array (NULL counts as 0). */
+static size_t count_diff_ids(char *const *diff_ids)
+{
+    size_t n = 0;
+    if (!diff_ids)
+        return 0;
+    while (diff_ids[n])
+        n++;
+    return n;
+}
+
+/* Compute the terminating ChainID over a non-empty diff_id list. Iterates
+ * oci_chainid_compute with a two-buffer ping-pong so the running chain
+ * value can be threaded through each step without per-iteration alloc.
+ * Writes the result into out (must hold at least OCI_DIGEST_HEX_MAX + 16
+ * bytes). Returns 0 on success, -1 with errno set on failure: EINVAL for
+ * an empty list or NULL buffer, ENAMETOOLONG when cap is too small,
+ * otherwise the errno left behind by oci_chainid_compute.
+ */
+static int compute_terminal_chain(char *const *diff_ids,
+                                  size_t n,
+                                  char *out,
+                                  size_t cap)
+{
+    char buf_a[OCI_DIGEST_HEX_MAX + 16];
+    char buf_b[OCI_DIGEST_HEX_MAX + 16];
+    char *cur = buf_a;
+    char *nxt = buf_b;
+    if (!diff_ids || n == 0 || !out) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (oci_chainid_compute(NULL, diff_ids[0], cur, sizeof(buf_a)) < 0)
+        return -1;
+    for (size_t i = 1; i < n; i++) {
+        if (oci_chainid_compute(cur, diff_ids[i], nxt, sizeof(buf_b)) < 0)
+            return -1;
+        char *tmp = cur;
+        cur = nxt;
+        nxt = tmp;
+    }
+    size_t need = strlen(cur) + 1;
+    if (need > cap) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    memcpy(out, cur, need);
+    return 0;
+}
+
+/* Stage + clonefile + sanitize + stack_commit one tree. The caller verified
+ * that oci_store_stack_has returned 0 for chain_id; this function does not
+ * re-probe. Returns 0 on success, -1 with errno preserved on failure. A
+ * failure after the clone has been created removes the staging path; a
+ * failed clonefile leaves the staging path exactly as clonefile left it.
+ */
+static int snapshot_tree_to_stack(oci_store_t *store,
+                                  const char *tree_path,
+                                  const char *chain_id,
+                                  const char **err)
+{
+    char stage_path[RC_PATH_MAX];
+    if (oci_store_stack_stage_path(store, chain_id, stage_path,
+                                   sizeof(stage_path)) < 0) {
+        if (err)
+            *err = "rebuild-cache: stack stage_path resolve failed";
+        return -1;
+    }
+    if (clonefile(tree_path, stage_path, CLONE_NOFOLLOW) < 0) {
+        int saved = errno;
+        if (err)
+            *err = saved == EXDEV
+                       ? "rebuild-cache: stack snapshot EXDEV (store and "
+                         "volume must share an APFS volume)"
+                       : "rebuild-cache: stack snapshot clonefile failed";
+        errno = saved;
+        return -1;
+    }
+    /* Strip the origin sidecar so the rebuilt snapshot matches the shape
+     * a fresh-unpack snapshot would have produced (oci_unpack writes the
+     * origin sidecar AFTER it takes the stack snapshot).
+     */
+    char origin_path[RC_PATH_MAX];
+    int n = snprintf(origin_path, sizeof(origin_path), "%s/.elfuse-origin.json",
+                     stage_path);
+    if (n < 0 || (size_t) n >= sizeof(origin_path)) {
+        (void) rm_recursive(stage_path);
+        if (err)
+            *err = "rebuild-cache: origin sidecar path overflow";
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    if (unlink(origin_path) < 0 && errno != ENOENT) {
+        int saved = errno;
+        (void) rm_recursive(stage_path);
+        if (err)
+            *err = "rebuild-cache: origin sidecar unlink failed";
+        errno = saved;
+        return -1;
+    }
+    const char *scerr = NULL;
+    if (oci_store_stack_commit(store, stage_path, chain_id, &scerr) < 0) {
+        int saved = errno;
+        (void) rm_recursive(stage_path);
+        if (err)
+            *err = scerr ? scerr : "rebuild-cache: stack_commit failed";
+        errno = saved;
+        return -1;
+    }
+    return 0;
+}
+
+/* Public entry point; rebuild-cache.h documents the counters and the
+ * failure policy. err is optional: when the caller passes NULL, messages
+ * are absorbed by a local sink.
+ */
+int oci_rebuild_cache(oci_store_t *store,
+                      const char *volume_root,
+                      oci_rebuild_cache_options_t *opts,
+                      const char **err)
+{
+    /* Automatic (not static) sink: a static would be shared, and written,
+     * by every caller that passes err == NULL.
+     */
+    const char *dummy_err = NULL;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (!store || !opts)
+        return set_err(err, "rebuild-cache: NULL argument", EINVAL);
+
+    /* Zero only output fields; the caller's commit flag must survive. */
+    opts->trees_scanned = 0;
+    opts->trees_rebuilt = 0;
+    opts->trees_skipped_cached = 0;
+    opts->trees_skipped_no_origin = 0;
+    opts->trees_skipped_bad_origin = 0;
+    opts->trees_skipped_empty_diffids = 0;
+    opts->trees_failed = 0;
+    opts->stack_entries_added = 0;
+
+    oci_volume_list_t trees = {0};
+    const char *list_err = NULL;
+    if (oci_volume_list_unpacked(volume_root, &trees, &list_err) < 0) {
+        int saved = errno;
+        set_err(err,
+                list_err ? list_err : "rebuild-cache: list unpacked failed",
+                saved);
+        return -1;
+    }
+
+    for (size_t i = 0; i < trees.count; i++) {
+        const char *tree = trees.items[i];
+        opts->trees_scanned++;
+
+        oci_origin_t origin = {0};
+        if (oci_origin_read(tree, &origin, NULL) < 0) {
+            if (errno == ENOENT)
+                opts->trees_skipped_no_origin++;
+            else
+                opts->trees_skipped_bad_origin++;
+            continue;
+        }
+
+        size_t n_diffs = count_diff_ids(origin.layer_diffids);
+        if (n_diffs == 0) {
+            opts->trees_skipped_empty_diffids++;
+            oci_origin_free(&origin);
+            continue;
+        }
+
+        char chain[OCI_DIGEST_HEX_MAX + 16];
+        if (compute_terminal_chain(origin.layer_diffids, n_diffs, chain,
+                                   sizeof(chain)) < 0) {
+            fprintf(stderr, "rebuild-cache: %s: chainid compute failed: %s\n",
+                    tree, strerror(errno));
+            opts->trees_failed++;
+            oci_origin_free(&origin);
+            continue;
+        }
+        oci_origin_free(&origin);
+
+        int has = oci_store_stack_has(store, chain);
+        if (has < 0) {
+            fprintf(stderr, "rebuild-cache: %s: stack_has probe failed: %s\n",
+                    tree, strerror(errno));
+            opts->trees_failed++;
+            continue;
+        }
+        if (has > 0) {
+            opts->trees_skipped_cached++;
+            continue;
+        }
+
+        if (!opts->commit) {
+            opts->trees_rebuilt++;
+            opts->stack_entries_added++;
+            continue;
+        }
+
+        const char *snap_err = NULL;
+        if (snapshot_tree_to_stack(store, tree, chain, &snap_err) < 0) {
+            fprintf(stderr, "rebuild-cache: %s: %s: %s\n", tree,
+                    snap_err ? snap_err : "snapshot failed", strerror(errno));
+            opts->trees_failed++;
+            continue;
+        }
+        opts->trees_rebuilt++;
+        opts->stack_entries_added++;
+    }
+
+    oci_volume_list_free(&trees);
+    return 0;
+}
diff --git a/src/oci/rebuild-cache.h b/src/oci/rebuild-cache.h
new file mode 100644
index 0000000..71ff807
--- /dev/null
+++ b/src/oci/rebuild-cache.h
@@ -0,0 +1,101 @@
+/* OCI stack cache back-fill for legacy unpacked sysroots
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Plan 3 C3.3c introduced the ChainID-keyed stack cache at
+ * /layers/stacks/sha256// so a re-unpack of any image whose
+ * layer prefix matches an already-extracted tree short-circuits the per-
+ * layer apply loop into a single APFS clonefile restore. The cache only
+ * grows as a side effect of oci_unpack, which means any image that was
+ * unpacked before C3.3c landed (or that was unpacked into a store whose
+ * C3.3b schema marker had just wiped v1 entries) leaves no stack snapshot
+ * on disk: the next unpack still pays the full extract cost even though a
+ * usable assembled stage_dir already sits under /images/sha256-/.
+ *
+ * oci_rebuild_cache walks every unpacked sysroot under /images/,
+ * reads its .elfuse-origin.json sidecar to recover the original layer
+ * diff_id ordering, recomputes ChainID for the terminating layer, and (when
+ * commit is true) clonefiles the tree into a fresh stack cache entry keyed
+ * by that ChainID. Subsequent unpacks of any image sharing the same ordered
+ * layer list short-circuit immediately. Intermediate-prefix entries are NOT
+ * back-filled because an unpacked tree only captures the final overlay
+ * state; the per-layer raw cache /layers/sha256// similarly
+ * remains empty until a re-pull + re-unpack of the source image repopulates
+ * it.
+ *
+ * The walk is purely additive: existing stack cache entries are never
+ * modified or deleted, and the rename(2) used by oci_store_stack_commit
+ * treats EEXIST as a benign loss to a racing writer so repeated invocations
+ * are idempotent. No interaction with raw cache entries, blob storage, or
+ * pin metadata; rebuild-cache only manipulates layers/stacks/.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include "store.h"
+
+/* Inputs and outputs for oci_rebuild_cache. Output counters are zeroed on
+ * entry so a caller can render a uniform report regardless of dry-run vs
+ * commit, and they reflect the same "what would happen" view: a dry-run
+ * reports trees_rebuilt as the number of trees that WOULD have landed a
+ * stack snapshot, while a commit run reports the number that actually did.
+ * trees_failed counts trees whose snapshot pipeline (chainid compute, stack
+ * probe, clonefile, sidecar strip, stack_commit) raised an error and so were
+ * left out of trees_rebuilt; the offending tree is reported on stderr and the
+ * walk continues so a single bad tree does not abort the whole back-fill.
+ */
+typedef struct {
+    /* Inputs */
+    bool commit; /* false = dry-run, true = back-fill */
+
+    /* Outputs */
+    size_t trees_scanned; /* candidates encountered under images/ */
+    size_t trees_rebuilt; /* would-rebuild (dry-run) or rebuilt (commit) */
+    size_t trees_skipped_cached; /* terminating ChainID already on disk */
+    size_t trees_skipped_no_origin; /* .elfuse-origin.json missing (ENOENT) */
+    size_t
+        trees_skipped_bad_origin; /* origin sidecar parse or schema failure */
+    size_t trees_skipped_empty_diffids; /* origin diff_ids array is empty */
+    size_t trees_failed; /* chainid / stack probe / clonefile / commit IO */
+    size_t stack_entries_added; /* sum across rebuilt trees */
+} oci_rebuild_cache_options_t;
+
+/* Walk /images/sha256-/ and back-fill the stack cache
+ * entry at /layers/stacks/sha256// for every
+ * tree whose .elfuse-origin.json carries a non-empty layer_diffids array
+ * and whose terminating ChainID is not already on disk.
+ *
+ * volume_root may be NULL; in that case oci_volume_list_unpacked treats the
+ * request as the empty case and trees_scanned stays 0. A missing
+ * /images/ directory is also treated as empty.
+ *
+ * The walk uses oci_volume_list_unpacked to enumerate candidates; only
+ * directories shaped sha256- are returned, so dotfiles and
+ * the .staging/ subtree are skipped automatically.
+ *
+ * The unpacked tree contains .elfuse-origin.json (written by oci_unpack
+ * AFTER the stack snapshot is taken on the fresh-unpack path). The
+ * back-fill strips that file from the staged snapshot before commit so a
+ * rebuilt stack cache entry is byte-identical to a fresh-unpack one.
+ *
+ * Failure policy:
+ *   - per-tree origin read failure (ENOENT vs other) is counted in
+ *     trees_skipped_no_origin / trees_skipped_bad_origin and the walk
+ *     continues.
+ *   - per-tree chainid_compute, stack_has, clonefile, or stack_commit
+ *     failure is counted in trees_failed and the walk continues.
A diagnostic line is
+ *     written to stderr identifying the tree path.
+ *   - listing failure (failure to traverse images/) returns -1 with errno
+ *     preserved and *err populated; opts->trees_scanned reflects the trees
+ *     processed before the failure surfaced.
+ *
+ * Returns 0 on success and -1 on listing-level failure. err may be NULL.
+ */
+int oci_rebuild_cache(oci_store_t *store,
+                      const char *volume_root,
+                      oci_rebuild_cache_options_t *opts,
+                      const char **err);
diff --git a/src/oci/ref.c b/src/oci/ref.c
new file mode 100644
index 0000000..ec4b409
--- /dev/null
+++ b/src/oci/ref.c
@@ -0,0 +1,478 @@
+/* OCI image reference parser
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * See ref.h for the grammar and design notes. The parser is split into:
+ *   1. find the optional @digest suffix and validate it
+ *   2. find the optional :tag suffix on the remainder
+ *   3. split the rest into registry vs path using the containerd domain rule
+ *   4. apply Docker defaults (docker.io, library/, latest)
+ *   5. validate every component against the OCI character class rules
+ */
+
+#include "ref.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DEFAULT_REGISTRY "docker.io"
+#define DEFAULT_LIBRARY_NAMESPACE "library"
+#define DEFAULT_TAG "latest"
+
+#define MAX_REFERENCE_LEN 4096
+#define MAX_TAG_LEN 128
+
+static char *strndup_local(const char *src, size_t n)
+{
+    char *dst = (char *) malloc(n + 1);
+    if (!dst)
+        return NULL;
+    memcpy(dst, src, n);
+    dst[n] = '\0';
+    return dst;
+}
+
+static void set_err(const char **slot, const char *msg)
+{
+    if (slot)
+        *slot = msg;
+}
+
+static bool is_lower_alnum(char c)
+{
+    return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
+}
+
+static bool is_path_separator(char c)
+{
+    return c == '.' || c == '_' || c == '-';
+}
+
+/* Validate one path component against [a-z0-9]+ ((\.|_|__|-+) [a-z0-9]+)*.
+ * Empty components and uppercase letters are rejected.
+ */
+static bool valid_path_component(const char *s, size_t len)
+{
+    if (len == 0)
+        return false;
+    if (!is_lower_alnum(s[0]) || !is_lower_alnum(s[len - 1]))
+        return false;
+
+    size_t i = 0;
+    while (i < len) {
+        if (is_lower_alnum(s[i])) {
+            i++;
+            continue;
+        }
+        /* Separator run per the distribution-spec grammar
+         * [a-z0-9]+((\.|_|__|-+)[a-z0-9]+)*: a run of one-or-more '-', a
+         * single '.' or '_', or exactly "__". Anything else (e.g. "a..b",
+         * "a___b") is rejected. Note dashes may repeat ("my--repo") but
+         * dots and underscores may not.
+         */
+        if (s[i] == '-') {
+            while (i < len && s[i] == '-')
+                i++;
+        } else if (s[i] == '_' && i + 1 < len && s[i + 1] == '_') {
+            i += 2;
+        } else if (is_path_separator(s[i])) {
+            i++;
+        } else {
+            return false;
+        }
+        if (i >= len || !is_lower_alnum(s[i]))
+            return false;
+    }
+    return true;
+}
+
+/* Validate a multi-component path (components separated by '/'). */
+static bool valid_repository_path(const char *s, size_t len)
+{
+    if (len == 0)
+        return false;
+    size_t start = 0;
+    for (size_t i = 0; i < len; i++) {
+        if (s[i] == '/') {
+            if (!valid_path_component(s + start, i - start))
+                return false;
+            start = i + 1;
+        }
+    }
+    return valid_path_component(s + start, len - start);
+}
+
+/* Domain detection per containerd: a leading slash component is a registry
+ * only when it contains '.' or ':', or when it is exactly "localhost".
+ */
+static bool looks_like_domain(const char *s, size_t len)
+{
+    if (len == 9 && memcmp(s, "localhost", 9) == 0)
+        return true;
+    for (size_t i = 0; i < len; i++) {
+        if (s[i] == '.' || s[i] == ':')
+            return true;
+    }
+    return false;
+}
+
+/* Portable rightmost-match: Darwin libc does not ship memrchr. */
+static const char *memrchr_local(const char *s, int c, size_t n)
+{
+    while (n > 0) {
+        n--;
+        if ((unsigned char) s[n] == (unsigned char) c)
+            return s + n;
+    }
+    return NULL;
+}
+
+/* Validate a registry host[:port]. The host portion is permissive (DNS
+ * label rules plus IPv6 brackets are not enforced), and uppercase letters
+ * are accepted because hostnames are case-insensitive. The optional port
+ * suffix must be a 1..5 digit decimal number.
+ */
+static bool valid_registry(const char *s, size_t len)
+{
+    if (len == 0)
+        return false;
+    /* Reject embedded whitespace or path separators outright. */
+    for (size_t i = 0; i < len; i++) {
+        unsigned char c = (unsigned char) s[i];
+        if (c <= ' ' || c == '/' || c == '@')
+            return false;
+    }
+    /* If there is a ':' it must be followed by 1..5 decimal digits and must
+     * be the last colon (IPv6 in brackets is not yet supported).
+     */
+    const char *colon = memchr(s, ':', len);
+    if (colon) {
+        size_t host_len = (size_t) (colon - s);
+        size_t port_len = len - host_len - 1;
+        if (host_len == 0 || port_len == 0 || port_len > 5)
+            return false;
+        for (size_t i = 0; i < port_len; i++) {
+            if (colon[1 + i] < '0' || colon[1 + i] > '9')
+                return false;
+        }
+    }
+    return true;
+}
+
+/* ASCII-only word-character test ([A-Za-z0-9_]). isalnum() consults the
+ * process locale and may accept bytes >= 0x80 there; OCI tags are plain
+ * ASCII, so the classes are spelled out instead.
+ */
+static bool is_tag_word(char c)
+{
+    return is_lower_alnum(c) || (c >= 'A' && c <= 'Z') || c == '_';
+}
+
+/* Validate a tag against [A-Za-z0-9_][A-Za-z0-9_.-]{0,127}. */
+static bool valid_tag(const char *s, size_t len)
+{
+    if (len == 0 || len > MAX_TAG_LEN)
+        return false;
+    /* First char: word character (letter, digit, underscore). */
+    if (!is_tag_word(s[0]))
+        return false;
+    for (size_t i = 1; i < len; i++) {
+        if (!is_tag_word(s[i]) && s[i] != '.' && s[i] != '-')
+            return false;
+    }
+    return true;
+}
+
+static bool is_lower_hex(char c)
+{
+    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
+}
+
+/* Validate ":" with algo in {sha256, sha512}. The hex digits are
+ * required to be lowercase per the OCI image-spec descriptor canonicalisation
+ * rules; uppercase encodings would otherwise cause silent dedup misses in
+ * the local store.
+ */
+static bool valid_digest(const char *s, size_t len, const char **err_msg)
+{
+    const char *colon = memchr(s, ':', len);
+    if (!colon) {
+        set_err(err_msg, "digest missing ':' separator");
+        return false;
+    }
+    size_t algo_len = (size_t) (colon - s);
+    size_t hex_len = len - algo_len - 1;
+
+    size_t expected_hex;
+    if (algo_len == 6 && memcmp(s, "sha256", 6) == 0) {
+        expected_hex = 64;
+    } else if (algo_len == 6 && memcmp(s, "sha512", 6) == 0) {
+        expected_hex = 128;
+    } else {
+        set_err(err_msg, "digest algorithm must be sha256 or sha512");
+        return false;
+    }
+    if (hex_len != expected_hex) {
+        set_err(err_msg, "digest hex length does not match algorithm");
+        return false;
+    }
+    for (size_t i = 0; i < hex_len; i++) {
+        if (!is_lower_hex(colon[1 + i])) {
+            set_err(err_msg, "digest hex must be lowercase 0-9 a-f");
+            return false;
+        }
+    }
+    return true;
+}
+
+void oci_ref_free(oci_ref_t *ref)
+{
+    if (!ref)
+        return;
+    free(ref->registry);
+    free(ref->repository);
+    free(ref->tag);
+    free(ref->digest);
+    ref->registry = NULL;
+    ref->repository = NULL;
+    ref->tag = NULL;
+    ref->digest = NULL;
+}
+
+/* Parse input into out; ref.h documents the contract. out is zeroed before
+ * any validation, so it is safe to pass to oci_ref_free after a failure.
+ */
+int oci_ref_parse(const char *input, oci_ref_t *out, const char **err_msg)
+{
+    set_err(err_msg, NULL);
+    if (!out)
+        return -1;
+    memset(out, 0, sizeof(*out));
+
+    if (!input) {
+        set_err(err_msg, "reference is NULL");
+        return -1;
+    }
+    size_t total = strlen(input);
+    if (total == 0) {
+        set_err(err_msg, "reference is empty");
+        return -1;
+    }
+    if (total > MAX_REFERENCE_LEN) {
+        set_err(err_msg, "reference exceeds 4096 characters");
+        return -1;
+    }
+
+    /* Step 1: split off "@digest". '@' cannot legally appear anywhere else
+     * in a well-formed reference, so a second '@' is rejected below.
+     */
+    const char *digest_start = NULL;
+    size_t digest_len = 0;
+    const char *at = memchr(input, '@', total);
+    if (at) {
+        /* Reject multiple '@' separators outright. */
+        const char *second =
+            memchr(at + 1, '@', total - (size_t) (at + 1 - input));
+        if (second) {
+            set_err(err_msg, "reference contains multiple '@' separators");
+            return -1;
+        }
+        digest_start = at + 1;
+        digest_len = total - (size_t) (digest_start - input);
+        if (digest_len == 0) {
+            set_err(err_msg, "digest is empty after '@'");
+            return -1;
+        }
+        if (!valid_digest(digest_start, digest_len, err_msg))
+            return -1;
+        total = (size_t) (at - input);
+        if (total == 0) {
+            set_err(err_msg, "reference has no name before '@'");
+            return -1;
+        }
+    }
+
+    /* Step 2: peel off ":tag" if present. The tag separator is the first ':'
+     * after the last '/' (a colon before any '/' belongs to the registry's
+     * port); a later ':' lands inside the tag and fails valid_tag.
+     */
+    const char *tag_start = NULL;
+    size_t tag_len = 0;
+    size_t name_len = total;
+    const char *last_slash = memrchr_local(input, '/', total);
+    const char *scan_from = last_slash ? last_slash + 1 : input;
+    const char *scan_end = input + total;
+    const char *tag_colon =
+        memchr(scan_from, ':', (size_t) (scan_end - scan_from));
+    if (tag_colon) {
+        tag_start = tag_colon + 1;
+        tag_len = total - (size_t) (tag_start - input);
+        if (tag_len == 0) {
+            set_err(err_msg, "tag is empty after ':'");
+            return -1;
+        }
+        if (!valid_tag(tag_start, tag_len)) {
+            set_err(err_msg, "tag has invalid characters or length");
+            return -1;
+        }
+        name_len = (size_t) (tag_colon - input);
+        if (name_len == 0) {
+            set_err(err_msg, "reference has no name before ':'");
+            return -1;
+        }
+    }
+
+    /* Step 3: split name into [registry "/"] path. */
+    const char *registry_start = NULL;
+    size_t registry_len = 0;
+    const char *path_start = input;
+    size_t path_len = name_len;
+
+    const char *first_slash = memchr(input, '/', name_len);
+    if (first_slash) {
+        size_t head_len = (size_t) (first_slash - input);
+        if (looks_like_domain(input, head_len)) {
+            registry_start = input;
+            registry_len = head_len;
+            path_start = first_slash + 1;
+            path_len = name_len - head_len - 1;
+            if (path_len == 0) {
+                set_err(err_msg, "reference has no repository after registry");
+                return -1;
+            }
+        }
+    }
+
+    /* Step 4: validate path components and detect single-segment defaults. */
+    if (!valid_repository_path(path_start, path_len)) {
+        set_err(err_msg,
+                "repository path has invalid component (lowercase letters,"
+                " digits, '.', '_', '-' only)");
+        return -1;
+    }
+
+    if (registry_len > 0 && !valid_registry(registry_start, registry_len)) {
+        set_err(err_msg, "registry host has invalid characters");
+        return -1;
+    }
+
+    /* Step 5: materialise the canonical fields. */
+    out->registry = registry_len > 0
+                        ? strndup_local(registry_start, registry_len)
+                        : strdup(DEFAULT_REGISTRY);
+    if (!out->registry)
+        goto oom;
+
+    /* Registry hostnames are case-insensitive (DNS), so "Docker.io" must
+     * resolve to the docker.io default namespace just like "docker.io".
+     */
+    bool needs_library_prefix =
+        strcasecmp(out->registry, DEFAULT_REGISTRY) == 0 &&
+        memchr(path_start, '/', path_len) == NULL;
+    if (needs_library_prefix) {
+        size_t prefix_len = strlen(DEFAULT_LIBRARY_NAMESPACE);
+        size_t total_len = prefix_len + 1 + path_len;
+        out->repository = (char *) malloc(total_len + 1);
+        if (!out->repository)
+            goto oom;
+        memcpy(out->repository, DEFAULT_LIBRARY_NAMESPACE, prefix_len);
+        out->repository[prefix_len] = '/';
+        memcpy(out->repository + prefix_len + 1, path_start, path_len);
+        out->repository[total_len] = '\0';
+    } else {
+        out->repository = strndup_local(path_start, path_len);
+        if (!out->repository)
+            goto oom;
+    }
+
+    if (tag_len > 0) {
+        out->tag = strndup_local(tag_start, tag_len);
+        if (!out->tag)
+            goto oom;
+    } else if (digest_len == 0) {
+        out->tag = strdup(DEFAULT_TAG);
+        if (!out->tag)
+            goto oom;
+    }
+
+    if (digest_len > 0) {
+        out->digest = strndup_local(digest_start, digest_len);
+        if (!out->digest)
+            goto oom;
+    }
+
+    return 0;
+
+oom:
+    set_err(err_msg, "out of memory");
+    oci_ref_free(out);
+    return -1;
+}
+
+char *oci_ref_canonical(const oci_ref_t *ref)
+{
+    if (!ref || !ref->registry || !ref->repository)
+        return NULL;
+    size_t reg_len = strlen(ref->registry);
+    size_t repo_len = strlen(ref->repository);
+    size_t tag_len = ref->tag ? strlen(ref->tag) : 0;
+    size_t dig_len = ref->digest ? strlen(ref->digest) : 0;
+    size_t total = reg_len + 1 + repo_len + (tag_len ? tag_len + 1 : 0) +
+                   (dig_len ? dig_len + 1 : 0) + 1;
+    char *buf = (char *) malloc(total);
+    if (!buf)
+        return NULL;
+    char *p = buf;
+    memcpy(p, ref->registry, reg_len);
+    p += reg_len;
+    *p++ = '/';
+    memcpy(p, ref->repository, repo_len);
+    p += repo_len;
+    if (tag_len) {
+        *p++ = ':';
+        memcpy(p, ref->tag, tag_len);
+        p += tag_len;
+    }
+    if (dig_len) {
+        *p++ = '@';
+        memcpy(p, ref->digest, dig_len);
+        p += dig_len;
+    }
+    *p = '\0';
+    return buf;
+}
+
+char *oci_ref_canonical_name(const oci_ref_t *ref)
+{
+    if (!ref || !ref->registry || !ref->repository || !ref->tag) {
+        errno = EINVAL;
+        return NULL;
+    }
+    size_t reg_len = strlen(ref->registry);
+    size_t repo_len = strlen(ref->repository);
+    size_t tag_len = strlen(ref->tag);
+    size_t total = reg_len + 1 + repo_len + 1 + tag_len + 1;
+    char *buf = (char *) malloc(total);
+    if (!buf) {
+        errno = ENOMEM;
+        return NULL;
+    }
+    char *p = buf;
+    memcpy(p, ref->registry, reg_len);
+    p += reg_len;
+    *p++ = '/';
+    memcpy(p, ref->repository, repo_len);
+    p += repo_len;
+    *p++ = ':';
+    memcpy(p, ref->tag, tag_len);
+    p += tag_len;
+    *p = '\0';
+    return buf;
+}
diff --git a/src/oci/ref.h b/src/oci/ref.h
new file mode 100644
index 0000000..3129c28
--- /dev/null
+++ b/src/oci/ref.h
@@ -0,0 +1,70 @@
+/* Parse OCI image references (REGISTRY/REPO[:TAG][@DIGEST])
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Implements the de-facto containerd/docker reference grammar so that user
+ * input like alpine, alpine:3.20, myuser/myrepo:tag, ghcr.io/owner/img:tag,
+ * or repo@sha256: resolves to a canonical (registry, repository, tag,
+ * digest) tuple. Defaults match Docker conventions: bare names land under
+ * docker.io/library/ with tag latest.
+ *
+ * Grammar (informal):
+ *
+ *   reference := name [":" tag] ["@" digest]
+ *   name := [domain "/"] path
+ *   domain := first slash component containing "."
or ":" or == "localhost"
+ *   path := component ("/" component)*
+ *   component := [a-z0-9]+ (("." | "_" | "__" | "-"+) [a-z0-9]+)*
+ *   tag := [A-Za-z0-9_] [A-Za-z0-9_.-]{0,127}
+ *   digest := ("sha256" | "sha512") ":" hex (lowercase hex)
+ *
+ * Domain detection follows containerd: the first slash-separated component
+ * is treated as a registry only when it carries a domain marker. Bare
+ * single-segment names (alpine) and two-segment names (user/repo) default
+ * to docker.io. Single-segment defaults additionally pick up the library/
+ * prefix.
+ */
+
+#pragma once
+
+typedef struct {
+    /* Registry hostname (and optional :port). Always non-NULL after parse. */
+    char *registry;
+    /* Repository path with namespace, e.g. "library/alpine". Always non-NULL.
+     */
+    char *repository;
+    /* Tag name. NULL when the reference is pinned by digest only. Defaults
+     * to "latest" when neither tag nor digest is present.
+     */
+    char *tag;
+    /* Digest "algo:hex" (sha256 or sha512, lowercase hex), or NULL. */
+    char *digest;
+} oci_ref_t;
+
+/* Parse input into out. Returns 0 on success, or -1 on malformed input, a
+ * NULL out, or allocation failure. On error, *err_msg (when err_msg != NULL)
+ * is set to a static description that must not be freed. On success the
+ * caller owns out and must call oci_ref_free.
+ */
+int oci_ref_parse(const char *input, oci_ref_t *out, const char **err_msg);
+
+/* Render a canonical "registry/repository[:tag][@digest]" string; the caller
+ * frees it. Returns NULL on allocation failure or an unpopulated ref.
+ */
+char *oci_ref_canonical(const oci_ref_t *ref);
+
+/* Render the canonical pin-name form "registry/repository:tag" used as the
+ * value of the org.opencontainers.image.ref.name annotation in the store's
+ * index.json. The digest segment is intentionally dropped: pin entries are
+ * keyed by tag-name and the digest is stored separately in the descriptor.
+ * Returns NULL with errno=EINVAL when ref->tag is unset (digest-only refs + * are self-pinning and cannot be inserted into the pin table) or with + * errno=ENOMEM on allocation failure. The caller frees the result. + */ +char *oci_ref_canonical_name(const oci_ref_t *ref); + +/* Release any heap fields. Safe on a zero-initialised or partially populated + * struct; resets all fields to NULL. + */ +void oci_ref_free(oci_ref_t *ref); diff --git a/src/oci/run.c b/src/oci/run.c new file mode 100644 index 0000000..0c0e689 --- /dev/null +++ b/src/oci/run.c @@ -0,0 +1,790 @@ +/* elfuse oci run -- unpack + clone + runspec + path resolve + launch + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Implementation walks the orchestration steps in the order specified + * by the Phase 3 plan: + * + * 1. resolve ref against the local store (must already be pulled) + * 2. oci_unpack into the APFS sysroot volume (idempotent; no-op if + * layers already extracted) + * 3. oci_clone_rootfs into /runs// + * 4. read + parse the manifest, then the image config, off the blob + * store via oci_blob_store_path + a small read-into-heap helper + * (parallel to src/oci/inspect.c's read_blob_file; intentionally + * duplicated rather than re-exported because the inspect copy + * lives behind a static and a cross-module hoist would expand the + * public surface for a 50-line helper) + * 5. oci_runspec_build folds the image runtime + CLI flags into + * argv/envp/cwd/uid + * 6. materialize spec.cwd under run_dir (mkdir -p, mode 0755) + * 7. oci_path_resolve resolves spec.argv[0] against the merged PATH + * inside run_dir (sysroot containment included) + * 8. replace spec.argv[0] with the guest-absolute path the resolver + * handed back, so the guest sees its own canonical name + * 9. save host cwd, chdir to so the guest's + * inherited cwd matches the OCI WorkingDir + * 10. 
assemble launch_args_t and dispatch to elfuse_launch (or to the + * test override when oci_run_set_launch_for_testing is in effect) + * 11. restore host cwd, free intermediate state, remove the clone dir + * unless keep_rootfs is set + * + * Errors at any stage funnel through the same cleanup epilogue. The + * clone directory is removed on launch failure too, matching the + * "ephemeral by default" decision the Phase 3 roadmap commits to; + * --keep is the only way to opt out. + */ + +#include "run.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blob-store.h" +#include "clone-rootfs.h" +#include "digest.h" +#include "manifest.h" +#include "path-resolve.h" +#include "runtime-files.h" +#include "unpack.h" +#include "volume.h" + +#include "core/launch.h" + +#include "debug/log.h" + +/* Process-global launch backend pointer. NULL means "use the default + * (elfuse_launch)". Toggled by oci_run_set_launch_for_testing; nobody + * else writes it. The indirection lets tests assert on the + * launch_args_t shape without spinning up an HVF VM. + */ +static oci_run_launch_fn_t g_launch_override = NULL; + +void oci_run_set_launch_for_testing(oci_run_launch_fn_t fn) +{ + g_launch_override = fn; +} + +/* Diagnostic scratch shared with the rest of oci_run. Thread-local so + * future parallel runs do not stomp each other. The buffer is sized + * for the longest dynamic message (a quoted argv[0] plus the searched + * PATH list propagated up from path-resolve). + */ +static _Thread_local char run_err_buf[2048]; + +static void set_err_static(const char **err, const char *msg) +{ + if (err) + *err = msg; +} + +static void set_err_fmt(const char **err, const char *fmt, ...) + __attribute__((format(printf, 2, 3))); + +#include + +static void set_err_fmt(const char **err, const char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + vsnprintf(run_err_buf, sizeof(run_err_buf), fmt, ap); + va_end(ap); + if (err) + *err = run_err_buf; +} + +/* Read a blob from the store into a heap buffer. Parallel to the + * read_blob_file helper that lives behind a static in src/oci/inspect.c; + * the 50 lines here intentionally duplicate that helper rather than + * hoisting it into a public utility, because the inspect path needs + * subtly different error reporting (a "blob missing" warning that goes + * to stderr) and a hoist would have to keep both shapes alive. + */ +static int load_blob(oci_blob_store_t *blobs, + oci_digest_algo_t algo, + const char *hex, + char **out_body, + size_t *out_len, + const char **err) +{ + char path[4096]; + int n = oci_blob_store_path(blobs, algo, hex, path, sizeof(path)); + if (n < 0 || (size_t) n >= sizeof(path)) { + set_err_static(err, "blob path too long"); + errno = ENAMETOOLONG; + return -1; + } + int fd = open(path, O_RDONLY); + if (fd < 0) { + set_err_fmt(err, "cannot open blob %s: %s", path, strerror(errno)); + return -1; + } + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + set_err_static(err, "fstat on blob failed"); + errno = saved; + return -1; + } + if (st.st_size < 0 || st.st_size > 64 * 1024 * 1024) { + close(fd); + set_err_static(err, "blob too large"); + errno = EFBIG; + return -1; + } + size_t want = (size_t) st.st_size; + char *buf = malloc(want + 1); + if (!buf) { + close(fd); + set_err_static(err, "out of memory loading blob"); + errno = ENOMEM; + return -1; + } + size_t off = 0; + while (off < want) { + ssize_t r = read(fd, buf + off, want - off); + if (r < 0) { + int saved = errno; + free(buf); + close(fd); + set_err_static(err, "read on blob failed"); + errno = saved; + return -1; + } + if (r == 0) + break; + off += (size_t) r; + } + close(fd); + if (off != want) { + free(buf); + set_err_static(err, "short read on blob"); + errno = EIO; + return -1; + } + buf[want] = '\0'; + 
*out_body = buf; + *out_len = want; + return 0; +} + +/* Resolve the manifest blob pinned at digest_str. If the blob is an + * image index (the shape docker.io multi-arch tags such as alpine:3 + * pin to by default), drill into the linux/arm64 leaf descriptor and + * re-load that blob; the leaf body is what oci_manifest_parse expects. + * The mirror of this classify-then-walk path lives in src/oci/inspect.c; + * keep the two in sync. On success returns 0 with *out_body holding the + * leaf-manifest bytes, *out_len its length, and *out_mf the parsed + * shape. The caller frees *out_body via free() and *out_mf via + * oci_manifest_free(); on failure both stay untouched and the helper + * cleans up its own intermediate state. + */ +static int resolve_image_manifest(oci_store_t *store, + const char *digest_str, + char **out_body, + size_t *out_len, + oci_manifest_t *out_mf, + const char **err) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) { + set_err_static(err, "pinned manifest digest is malformed"); + errno = EINVAL; + return -1; + } + + char *body = NULL; + size_t len = 0; + if (load_blob(oci_store_blobs(store), algo, hex, &body, &len, err) < 0) + return -1; + + /* Classify. Index and manifest are disjoint JSON shapes (one + * requires "manifests", the other requires "config" + "layers"), + * so a successful parse is unambiguous. Drilling happens only when + * the index parse wins; the leaf-pinned shape (the fixture-builder + * and tests/test-oci-compat.sh path) falls through directly to the + * manifest parser below. 
+ */ + oci_index_t idx = {0}; + if (oci_index_parse(body, len, &idx, NULL) == 0) { + const oci_index_entry_t *picked = oci_index_pick_linux_arm64(&idx); + if (!picked) { + oci_index_free(&idx); + free(body); + set_err_static(err, "image index has no linux/arm64 entry"); + errno = ENOENT; + return -1; + } + char *sub_body = NULL; + size_t sub_len = 0; + if (load_blob(oci_store_blobs(store), picked->desc.algo, + picked->desc.hex, &sub_body, &sub_len, err) < 0) { + oci_index_free(&idx); + free(body); + return -1; + } + oci_index_free(&idx); /* picked points into idx; it is not used past this line */ + free(body); /* index bytes are done; continue with the leaf manifest */ + body = sub_body; + len = sub_len; + } + + oci_manifest_t mf = {0}; + const char *mparse_err = NULL; + if (oci_manifest_parse(body, len, &mf, &mparse_err) < 0) { + set_err_fmt(err, "manifest parse failed: %s", + mparse_err ? mparse_err : "(no message)"); + free(body); + errno = EPROTO; + return -1; + } + + *out_body = body; /* outputs are written only on the success path */ + *out_len = len; + *out_mf = mf; + return 0; +} + +int oci_run_resolve_image_manifest_for_testing(oci_store_t *store, + const char *digest_str, + char **out_body, + size_t *out_len, + oci_manifest_t *out_mf, + const char **err) +{ + if (err) + *err = NULL; /* clear any stale diagnostic before validating arguments */ + if (!store || !digest_str || !out_body || !out_len || !out_mf) { + set_err_static(err, "resolve_image_manifest: NULL argument"); + errno = EINVAL; + return -1; + } + return resolve_image_manifest(store, digest_str, out_body, out_len, out_mf, + err); +} + +/* Concatenate two path components with one slash boundary. Caller frees + * the result. A NULL component yields a copy of the other one; the + * result is NULL when both are NULL or when allocation fails. + */ +static char *path_join_heap(const char *a, const char *b) +{ + if (!a) + return b ? 
strdup(b) : NULL; + if (!b) + return strdup(a); + size_t alen = strlen(a); + size_t blen = strlen(b); + bool a_trail = alen > 0 && a[alen - 1] == '/'; + bool b_lead = blen > 0 && b[0] == '/'; + char *r = malloc(alen + blen + 2); + if (!r) + return NULL; + if (a_trail && b_lead) { + memcpy(r, a, alen); + memcpy(r + alen, b + 1, blen - 1); + r[alen + blen - 1] = '\0'; + } else if (!a_trail && !b_lead && alen > 0 && blen > 0) { + memcpy(r, a, alen); + r[alen] = '/'; + memcpy(r + alen + 1, b, blen); + r[alen + 1 + blen] = '\0'; + } else { + memcpy(r, a, alen); + memcpy(r + alen, b, blen); + r[alen + blen] = '\0'; + } + return r; +} + +/* Recursive mkdir; tolerates existing intermediate directories. Each + * segment this call actually creates is chowned to (uid, gid) when + * has_creds is set; segments that already existed (host path prefixes + * above the run dir, directories shipped by the image) keep their + * owner. macOS rejects fchownat for non-root callers spoofing arbitrary + * uids, so the chown is best effort and silently ignored when it + * fails. The intended ownership is also recorded by the sidecar + * metadata that the syscall layer (Phase 4) will read; for Phase 3 the + * host inode owner stays the invoking user. + * + * Returns 0 on success. On failure returns -1 with errno set and *err + * pointing at a diagnostic. + */ +static int mkdir_p_owned(const char *path, + mode_t mode, + uint32_t uid, + uint32_t gid, + bool has_creds, + const char **err) +{ + if (!path || !*path) { + set_err_static(err, "empty path in mkdir_p"); + errno = EINVAL; + return -1; + } + /* Walk segments; create each prefix. 
*/ + char *dup = strdup(path); + if (!dup) { + set_err_static(err, "out of memory in mkdir_p"); + errno = ENOMEM; + return -1; + } + for (char *p = dup + 1; *p; p++) { + if (*p != '/') + continue; + *p = '\0'; + if (mkdir(dup, mode) == 0) { + /* Created just now: safe to hand to the guest identity. */ + if (has_creds) + (void) chown(dup, uid, gid); + } else if (errno != EEXIST) { + int saved = errno; + set_err_fmt(err, "mkdir %s failed: %s", dup, strerror(saved)); + free(dup); + errno = saved; + return -1; + } + *p = '/'; + } + if (mkdir(dup, mode) == 0) { + /* Same rule for the leaf: never re-own a pre-existing directory. */ + if (has_creds) + (void) chown(dup, uid, gid); + } else if (errno != EEXIST) { + int saved = errno; + set_err_fmt(err, "mkdir %s failed: %s", dup, strerror(saved)); + free(dup); + errno = saved; + return -1; + } + free(dup); + return 0; +} + +/* Pull PATH=... out of a NULL-terminated envp. Returns the value + * pointer (not strdup'd) or NULL if no PATH key is present. + */ +static const char *envp_get_path(const char *const *envp) +{ + if (!envp) + return NULL; + for (size_t i = 0; envp[i]; i++) { + if (strncmp(envp[i], "PATH=", 5) == 0) + return envp[i] + 5; + } + return NULL; +} + +/* Count entries in a NULL-terminated string vector. */ +static int strvec_count(char *const *v) +{ + int n = 0; + if (!v) + return 0; + while (v[n]) + n++; + return n; +} + +int oci_run(oci_store_t *store, + const oci_ref_t *ref, + const oci_run_options_t *opts, + const char *const *host_environ, + const char **err) +{ + if (err) + *err = NULL; + if (!store || !ref || !opts) { + set_err_static(err, "oci_run: NULL store/ref/opts"); + errno = EINVAL; + return -1; + } + /* Suppress unused warnings for fields kept on the options struct + * for forward compatibility with Phase 3 plan items not yet wired + * (clone_name -> deterministic run-dir, store_dir -> consumed by + * the caller before this function sees the store). 
+ */ + (void) opts->store_dir; + (void) opts->clone_name; + + char *image_dir = NULL; + char *volume_root = NULL; + char *run_dir = NULL; + char *manifest_body = NULL; + char *config_body = NULL; + char *host_argv0 = NULL; + char *guest_argv0 = NULL; + char *cwd_host = NULL; + oci_manifest_t mf = {0}; + oci_image_config_t cfg = {0}; + oci_runspec_t spec = {0}; + int rc = -1; /* stays -1 on every pre-launch failure path */ + char host_cwd[PATH_MAX]; + bool have_host_cwd = (getcwd(host_cwd, sizeof(host_cwd)) != NULL); /* remembered so step 11 can chdir back */ + + /* 1. unpack (idempotent). */ + oci_unpack_options_t uopts = { + .volume_root = opts->volume_dir, .quiet = true, .force_relayer = false}; + const char *unpack_err = NULL; + if (oci_unpack(store, ref, &uopts, &image_dir, &unpack_err) < 0) { + set_err_fmt(err, "unpack failed: %s", + unpack_err ? unpack_err : strerror(errno)); + goto out; + } + + /* 2. resolve volume root (clone-rootfs lands under /runs/). */ + const char *vol_err = NULL; + if (oci_volume_ensure(opts->volume_dir, &volume_root, &vol_err) < 0) { + set_err_fmt(err, "volume_ensure failed: %s", + vol_err ? vol_err : strerror(errno)); + goto out; + } + + /* 3. clone-rootfs. image_dir has a trailing slash; strip for the + * source argument so the inner code joins correctly. + */ + size_t il = strlen(image_dir); + if (il > 1 && image_dir[il - 1] == '/') + image_dir[il - 1] = '\0'; + const char *clone_err = NULL; + if (oci_clone_rootfs(image_dir, volume_root, &run_dir, &clone_err) < 0) { + set_err_fmt(err, "clone-rootfs failed: %s", + clone_err ? clone_err : strerror(errno)); + goto out; + } + + /* 3.5. inject host-truth /etc/{resolv.conf,hosts,hostname} so + * guest libc lookups (getaddrinfo, gethostname, /etc/hosts walks) + * match the macOS host instead of the image's containerd + * defaults. Failure tears the clone-rootfs back down through the + * existing cleanup epilogue. + */ + const char *rfi_err = NULL; + if (oci_runtime_files_inject(run_dir, &rfi_err) < 0) { + set_err_fmt(err, "runtime-files inject failed: %s", + rfi_err ? 
rfi_err : strerror(errno)); + goto out; + } + + /* 4. read manifest blob, then config blob, then parse both. The + * manifest read goes through resolve_image_manifest, which + * transparently walks one image-index indirection when the pin + * lands on a multi-arch index (the docker.io default for tags like + * alpine:3). + */ + char *manifest_digest_str = NULL; + const char *getref_err = NULL; + if (oci_store_get_ref(store, ref, &manifest_digest_str, &getref_err) < 0) { + set_err_fmt(err, "no pin for ref: %s", + getref_err ? getref_err : strerror(errno)); + goto out; + } + size_t manifest_len = 0; + int rcm = resolve_image_manifest(store, manifest_digest_str, &manifest_body, + &manifest_len, &mf, err); + free(manifest_digest_str); /* freed before the result check so neither branch leaks it */ + if (rcm < 0) + goto out; + size_t config_len = 0; + if (load_blob(oci_store_blobs(store), mf.config.algo, mf.config.hex, + &config_body, &config_len, err) < 0) + goto out; + const char *cparse_err = NULL; + if (oci_image_config_parse(config_body, config_len, &cfg, &cparse_err) < + 0) { + set_err_fmt(err, "image config parse failed: %s", + cparse_err ? cparse_err : "(no message)"); + goto out; + } + + /* 5. fold runtime + CLI overrides into argv/envp/cwd/uid. Layer + * the unpacked clone-rootfs over the caller's flags so the runspec + * resolver can read /etc/passwd and /etc/group for symbolic User + * (Phase 4 F4.7). The caller-side flags stay const; only the local + * copy points the resolver at the rootfs. + */ + oci_runspec_flags_t spec_flags = opts->spec; + spec_flags.rootfs_for_nss = run_dir; + const char *rs_err = NULL; + if (oci_runspec_build(&cfg.config, &spec_flags, host_environ, &spec, + &rs_err) < 0) { + set_err_fmt(err, "runspec build failed: %s", + rs_err ? rs_err : strerror(errno)); + goto out; + } + + /* 6. materialize spec.cwd under run_dir. 
*/ + cwd_host = path_join_heap(run_dir, spec.cwd); + if (!cwd_host) { + set_err_static(err, "out of memory building cwd host path"); + errno = ENOMEM; + goto out; + } + if (mkdir_p_owned(cwd_host, 0755, spec.uid, spec.gid, spec.has_creds, err) < + 0) + goto out; + + /* 7. PATH-resolve argv[0]. */ + if (!spec.argv || !spec.argv[0]) { + set_err_static(err, "runspec produced empty argv"); + errno = EINVAL; + goto out; + } + const char *path_env = envp_get_path((const char *const *) spec.envp); + const char *pr_err = NULL; + if (oci_path_resolve(run_dir, spec.argv[0], path_env, spec.cwd, &host_argv0, + &guest_argv0, &pr_err) < 0) { + set_err_fmt(err, "%s", pr_err ? pr_err : "argv[0] resolution failed"); + goto out; + } + + /* 8. swap spec.argv[0] for the guest-absolute path the resolver + * handed back. The guest reads /proc/self/exe + argv[0] and expects + * the canonical name it would have seen via execvp. + */ + free(spec.argv[0]); + spec.argv[0] = guest_argv0; + guest_argv0 = NULL; /* ownership transferred to spec */ + + /* 9. chdir into the materialized WorkingDir so the guest inherits + * its OCI cwd. + */ + if (chdir(cwd_host) < 0) { + set_err_fmt(err, "chdir into '%s' failed: %s", cwd_host, + strerror(errno)); + goto out; + } + + /* 10. assemble launch_args and dispatch. */ + launch_args_t la = { + .elf_path = host_argv0, + .sysroot = run_dir, + .guest_argc = strvec_count(spec.argv), + .guest_argv = (const char **) spec.argv, + .envp = (const char **) spec.envp, + .has_creds = spec.has_creds, + .uid = spec.uid, + .gid = spec.gid, + .cwd_guest = spec.cwd, + .gdb_port = 0, + .gdb_stop_on_entry = false, + .timeout_sec = 10, + .fork_child_fd = -1, + .vfork_notify_fd = -1, + .verbose = false, + }; + oci_run_launch_fn_t launch = + g_launch_override ? g_launch_override : elfuse_launch; + rc = launch(&la); /* the backend's return value becomes oci_run's return value */ + + /* 11. restore host cwd before cleanup so subsequent paths + * (clone-rootfs-remove, sysroot detach) operate from a sane place. 
+ */ + if (have_host_cwd && chdir(host_cwd) < 0) { + log_warn("could not restore host cwd to %s: %s", host_cwd, + strerror(errno)); + (void) chdir("/"); + } + +out: + if (run_dir && !opts->keep_rootfs) { /* keep_rootfs leaves the per-run clone on disk */ + const char *rm_err = NULL; + if (oci_clone_rootfs_remove(run_dir, &rm_err) < 0) + log_warn("clone-rootfs cleanup partial: %s", + rm_err ? rm_err : strerror(errno)); + } + free(host_argv0); + free(guest_argv0); /* NULL once step 8 moved it into spec.argv[0] */ + free(cwd_host); + oci_runspec_free(&spec); + oci_image_config_free(&cfg); + oci_manifest_free(&mf); + free(config_body); + free(manifest_body); + free(run_dir); + free(volume_root); + free(image_dir); + return rc; +} + +/* ── CLI entry ─────────────────────────────────────────────────── */ + +static int print_run_usage(FILE *out) +{ + fputs( + "usage: elfuse oci run [OPTIONS] IMAGE [ARG...]\n" + "\n" + "Run a binary from an already-pulled OCI image. The image's\n" + "Entrypoint/Cmd/Env/WorkingDir/User are honored; CLI flags\n" + "override individual fields.\n" + "\n" + "Options:\n" + " --store DIR Override the local store root\n" + " --volume DIR Override the sysroot APFS volume\n" + " --entrypoint PROG Replace the image Entrypoint\n" + " -e, --env KEY=VAL Set or replace env var\n" + " -e, --env KEY Import KEY from host environ\n" + " -w, --workdir DIR Override image WorkingDir\n" + " -u, --user UID[:GID] Override image User (numeric or " + "name[:group])\n" + " --keep Keep the per-run rootfs after exit\n" + " --name NAME Reserved: deterministic clone dir\n" + " (currently ignored)\n" + "\n" + "IMAGE follows the docker/containerd grammar (alpine,\n" + "alpine:3.20, ghcr.io/owner/img:tag, etc.). The image must\n" + "already be pulled; this subcommand does not auto-pull.\n", + out); + return out == stderr ? 
2 : 0; +} + +extern char **environ; + +int oci_cli_run(int argc, char **argv) +{ + oci_run_options_t opts = {0}; + const char **env_overrides = NULL; /* entries borrow argv pointers; only the array is heap-owned */ + size_t n_overrides = 0; + size_t cap_overrides = 0; + + int i = 1; /* argv[0] is skipped; option scanning starts at argv[1] */ + while (i < argc) { + const char *a = argv[i]; + if (a[0] != '-') + break; /* first non-option token is IMAGE */ + if (!strcmp(a, "--")) { + i++; + break; + } + if (!strcmp(a, "-h") || !strcmp(a, "--help")) { + free(env_overrides); + return print_run_usage(stdout); + } + if (!strcmp(a, "--store")) { + if (++i >= argc) { + fputs("error: --store needs an argument\n", stderr); + free(env_overrides); + return 2; + } + opts.store_dir = argv[i]; + } else if (!strcmp(a, "--volume")) { + if (++i >= argc) { + fputs("error: --volume needs an argument\n", stderr); + free(env_overrides); + return 2; + } + opts.volume_dir = argv[i]; + } else if (!strcmp(a, "--entrypoint")) { + if (++i >= argc) { + fputs("error: --entrypoint needs an argument\n", stderr); + free(env_overrides); + return 2; + } + opts.spec.entrypoint_override = argv[i]; + } else if (!strcmp(a, "-e") || !strcmp(a, "--env")) { + if (++i >= argc) { + fputs("error: -e/--env needs an argument\n", stderr); + free(env_overrides); + return 2; + } + if (n_overrides + 1 > cap_overrides) { + size_t newcap = cap_overrides ? 
cap_overrides * 2 : 8; + const char **np = realloc(env_overrides, newcap * sizeof(*np)); + if (!np) { + fputs("error: out of memory\n", stderr); + free(env_overrides); + return 1; + } + env_overrides = np; + cap_overrides = newcap; + } + env_overrides[n_overrides++] = argv[i]; + } else if (!strcmp(a, "-w") || !strcmp(a, "--workdir")) { + if (++i >= argc) { + fputs("error: -w/--workdir needs an argument\n", stderr); + free(env_overrides); + return 2; + } + opts.spec.workdir_override = argv[i]; + } else if (!strcmp(a, "-u") || !strcmp(a, "--user")) { + if (++i >= argc) { + fputs("error: -u/--user needs an argument\n", stderr); + free(env_overrides); + return 2; + } + opts.spec.user_override = argv[i]; + } else if (!strcmp(a, "--keep")) { + opts.keep_rootfs = true; + } else if (!strcmp(a, "--name")) { + if (++i >= argc) { + fputs("error: --name needs an argument\n", stderr); + free(env_overrides); + return 2; + } + opts.clone_name = argv[i]; + } else { + fprintf(stderr, "error: unknown option: %s\n", a); + free(env_overrides); + return 2; + } + i++; + } + if (i >= argc) { + fputs("error: oci run needs IMAGE\n", stderr); + free(env_overrides); + return 2; + } + const char *ref_str = argv[i]; + i++; + opts.spec.env_overrides = env_overrides; + opts.spec.nenv_overrides = n_overrides; + opts.spec.positional_argc = argc - i; + opts.spec.positional_argv = (const char *const *) (argv + i); /* everything after IMAGE is the guest ARG tail */ + + oci_ref_t ref = {0}; + const char *parse_err = NULL; + if (oci_ref_parse(ref_str, &ref, &parse_err) < 0) { + fprintf(stderr, "error: invalid reference: %s\n", + parse_err ? 
parse_err : "(unknown)"); + free(env_overrides); + return 1; + } + + char *default_root = NULL; /* heap-owned only when --store was not given */ + const char *store_root = opts.store_dir; + if (!store_root) { + default_root = oci_store_default_root(); + if (!default_root) { + fputs("error: cannot determine default store root (HOME?)\n", + stderr); + oci_ref_free(&ref); + free(env_overrides); + return 1; + } + store_root = default_root; + } + oci_store_t *store = oci_store_open(store_root); + if (!store) { + fprintf(stderr, "error: cannot open store at %s: %s\n", store_root, + strerror(errno)); + oci_ref_free(&ref); + free(default_root); + free(env_overrides); + return 1; + } + + const char *run_err = NULL; + int rc = + oci_run(store, &ref, &opts, (const char *const *) environ, &run_err); + if (rc < 0) { + fprintf(stderr, "error: oci run failed: %s\n", + run_err ? run_err : strerror(errno)); + rc = 1; /* a negative oci_run result maps to exit status 1 */ + } + + oci_store_close(store); + oci_ref_free(&ref); + free(default_root); + free(env_overrides); + return rc; +} diff --git a/src/oci/run.h b/src/oci/run.h new file mode 100644 index 0000000..0e33770 --- /dev/null +++ b/src/oci/run.h @@ -0,0 +1,114 @@ +/* elfuse oci run -- launch a guest binary from a pulled OCI image + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Closes the Phase 3 loop: unpack + clone-rootfs + image-config parse + + * runspec build + PATH resolve + elfuse_launch under one subcommand. + * The user runs an OCI image directly, with the image's + * Entrypoint/Cmd/Env/WorkingDir/User honored as configured by the + * image producer and overridable by the elfuse CLI. 
+ * + * Dependencies (all already landed in earlier slices): + * + * - oci_unpack (Phase 2) -- layers -> image sysroot under + * the APFS sysroot volume + * - oci_clone_rootfs (Phase 2) -- clonefile-based per-run rootfs + * - oci_image_config_parse (Phase 1) + * - oci_runspec_build (Phase 3 C2) -- folds image config + CLI flags + * into argv/envp/cwd/uid bundle + * - oci_path_resolve (Phase 3 C3) -- argv0 + PATH -> host_path/ + * guest_path with sysroot + * containment + * - elfuse_launch (Phase 3 C4) -- VM bring-up shared with main() + * + * Lifetime / cleanup contract: + * + * - oci_run owns its intermediate state (run_dir, parsed manifest / + * config blobs, oci_runspec_t, resolved host_argv0). It frees + * everything before returning. + * - On success it removes the clone dir unless opts->keep_rootfs is + * set. On launch failure (elfuse_launch returns non-zero) it still + * removes the clone dir by default so a failed run does not leave + * stale clones on the volume. + * - host cwd is saved and restored across the call. Before + * launching, oci_run chdir's into the WorkingDir it materialized + * under the per-run rootfs so the guest sees its WorkingDir as cwd. + */ + +#pragma once + +#include +#include + +#include "core/launch.h" + +#include "manifest.h" +#include "ref.h" +#include "runspec.h" +#include "store.h" + +/* Flags assembled by oci_cli_run from the elfuse oci run command line. + * store_dir / volume_dir override the default Library/Application + * Support paths; clone_name reserves a slot for a future deterministic + * run-dir naming option that Phase 2's oci_clone_rootfs does not yet + * support, so the field is currently ignored. spec carries every + * runspec-relevant override (Entrypoint, -e, -w, -u, IMAGE, ARGV tail); + * oci_run hands oci_runspec_build a local copy of it whose + * rootfs_for_nss field points at the per-run rootfs, leaving the + * caller's struct unmodified. + */ +typedef struct { + const char *store_dir; /* --store; NULL selects the default store root */ + const char *volume_dir; /* --volume; passed to oci_unpack and oci_volume_ensure */ + oci_runspec_flags_t spec; + bool keep_rootfs; /* --keep; when set oci_run skips the clone-dir removal */ + const char *clone_name; /* --name; accepted but not used yet */ +} oci_run_options_t; + +/* `elfuse oci run` subcommand entry. 
Argument parsing, ref parse, store + * open, oci_run dispatch. Returns a process exit code (0 success, 1 on + * runtime failure, 2 on usage / argument error to match the rest of + * src/oci/cli.c). + */ +int oci_cli_run(int argc, char **argv); + +/* Programmatic entry: drive the full unpack -> clone -> runspec -> path + * resolve -> elfuse_launch pipeline against an already-opened store and + * parsed ref. host_environ is forwarded to oci_runspec_build for the + * Env merge policy; pass the process environ. *err is populated with a + * static diagnostic on failure; the pointer is valid until the next + * call (or until oci_runspec_build / oci_path_resolve overwrite their + * own thread-local buffer for a different diagnostic class). + * + * Returns: + * >= 0 exit code of the guest binary + * -1 pre-launch failure (unpack, clone, parse, runspec, path + * resolve, or directory materialization) + */ +int oci_run(oci_store_t *store, + const oci_ref_t *ref, + const oci_run_options_t *opts, + const char *const *host_environ, + const char **err); + +/* Test hook: swap the underlying launch backend. Pass NULL to restore + * the default (elfuse_launch). The override is process-global and + * exists only so unit tests can run the orchestrator without spinning + * up a real HVF VM. Production code must never call this. + */ +typedef int (*oci_run_launch_fn_t)(const launch_args_t *args); +void oci_run_set_launch_for_testing(oci_run_launch_fn_t fn); + +/* Test hook: drive the manifest-resolution step (load blob, classify + * index vs leaf, drill linux/arm64 on index, parse) in isolation. The + * production caller is oci_run; this hook exists so unit tests can + * verify the multi-arch index-walk without spinning up an APFS + * sysroot volume. Output ownership matches the production internal + * helper: caller frees *out_body and *out_mf via free() and + * oci_manifest_free() respectively. Production code must use oci_run. 
+ */ +int oci_run_resolve_image_manifest_for_testing(oci_store_t *store, + const char *digest_str, + char **out_body, + size_t *out_len, + oci_manifest_t *out_mf, + const char **err); diff --git a/src/oci/runspec.c b/src/oci/runspec.c new file mode 100644 index 0000000..3e01cdd --- /dev/null +++ b/src/oci/runspec.c @@ -0,0 +1,520 @@ +/* OCI launch-spec resolver: image runtime + CLI overrides -> argv/envp/... + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Pure-data merge. No filesystem touches of its own; the one exception + * is symbolic User resolution, which goes through oci_user_lookup and + * reads /etc/passwd and /etc/group under flags->rootfs_for_nss when + * that field is set. The translation unit deliberately + * carries its own small string-vector helpers instead of pulling in + * src/utils.h: the rest of elfuse uses arena-style allocation patterns + * tied to the guest VM lifetime, but a runspec lives across an + * elfuse oci run invocation and ships back to the caller via + * oci_runspec_t. Owning every char* with plain malloc/free makes that + * lifetime contract auditable. + */ + +#include "runspec.h" + +#include +#include +#include +#include +#include +#include + +#include "user-lookup.h" + +/* Diagnostic scratch. Thread-local so concurrent oci_runspec_build calls + * (one per --keep run dir, in a future multiplexed oci run) do not clobber + * each other's err pointer. Buffer size is generous enough for the + * longest dynamic message: the rejected User value plus the fixed Phase 4 + * pointer text. + */ +static _Thread_local char runspec_err_buf[512]; + +static const char *set_err_static(const char **err, const char *msg) +{ + if (err) + *err = msg; /* msg is stored as-is, not copied */ + return msg; +} + +static const char *set_err_fmt(const char **err, const char *fmt, ...) + __attribute__((format(printf, 2, 3))); + +static const char *set_err_fmt(const char **err, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vsnprintf(runspec_err_buf, sizeof(runspec_err_buf), fmt, ap); /* truncates to the 512-byte buffer */ + va_end(ap); + if (err) + *err = runspec_err_buf; /* valid until the next set_err_fmt call on this thread */ + return runspec_err_buf; +} + +/* Min-grow string vector. 
Always NUL-terminates the backing array when + * finalized so the result is environ/argv-shaped. + */ +typedef struct { + char **items; + size_t n; + size_t cap; +} strvec_t; + +static int strvec_reserve(strvec_t *v, size_t need) +{ + if (need <= v->cap) + return 0; + size_t newcap = v->cap ? v->cap : 8; + while (newcap < need) + newcap *= 2; + char **np = realloc(v->items, newcap * sizeof(*np)); + if (!np) + return -1; + v->items = np; + v->cap = newcap; + return 0; +} + +/* Append a heap string to the vector, taking ownership. On allocation + * failure the input pointer stays unfreed (caller frees on cleanup). + */ +static int strvec_push_take(strvec_t *v, char *s_owned) +{ + if (strvec_reserve(v, v->n + 1) < 0) + return -1; + v->items[v->n++] = s_owned; + return 0; +} + +static int strvec_push_strdup(strvec_t *v, const char *s) +{ + char *d = strdup(s); + if (!d) + return -1; + if (strvec_push_take(v, d) < 0) { + free(d); + return -1; + } + return 0; +} + +/* Hand the items array to the caller as a NULL-terminated argv/envp. + * Resets the vector so subsequent free is a no-op. + */ +static int strvec_finalize(strvec_t *v, char ***out) +{ + char **arr = realloc(v->items, (v->n + 1) * sizeof(*arr)); + if (!arr) + return -1; + arr[v->n] = NULL; + *out = arr; + v->items = NULL; + v->n = 0; + v->cap = 0; + return 0; +} + +static void strvec_free(strvec_t *v) +{ + if (!v || !v->items) + return; + for (size_t i = 0; i < v->n; i++) + free(v->items[i]); + free(v->items); + v->items = NULL; + v->n = 0; + v->cap = 0; +} + +/* Return length of the KEY portion of a "KEY=VAL" or bare "KEY" string. */ +static size_t env_key_len(const char *kv) +{ + const char *eq = strchr(kv, '='); + return eq ? 
(size_t) (eq - kv) : strlen(kv); +} + +static ssize_t env_lookup_idx(const strvec_t *v, + const char *key, + size_t key_len) +{ + for (size_t i = 0; i < v->n; i++) { + const char *e = v->items[i]; + size_t elen = env_key_len(e); + if (elen == key_len && memcmp(e, key, key_len) == 0) + return (ssize_t) i; + } + return -1; +} + +/* Set-or-replace env entry by KEY (computed from kv_owned's pre-'=' prefix). + * Takes ownership of kv_owned. On replace, the old entry is freed. On + * allocation failure, kv_owned is freed and -1 is returned. + */ +static int env_set_take(strvec_t *v, char *kv_owned) +{ + size_t klen = env_key_len(kv_owned); + ssize_t idx = env_lookup_idx(v, kv_owned, klen); + if (idx >= 0) { + free(v->items[idx]); + v->items[idx] = kv_owned; + return 0; + } + if (strvec_push_take(v, kv_owned) < 0) { + free(kv_owned); + return -1; + } + return 0; +} + +static int env_set_strdup(strvec_t *v, const char *kv) +{ + char *d = strdup(kv); + if (!d) + return -1; + return env_set_take(v, d); +} + +/* Build "KEY=VAL" on the heap from the two halves and feed it through + * env_set_take. Used for host-import and TERM auto-import paths where + * the source is not already a single KEY=VAL string. 
+ */ +static int env_set_pair(strvec_t *v, const char *key, const char *val) +{ + const size_t klen = strlen(key); + const size_t vlen = strlen(val); + char *joined = malloc(klen + 1 + vlen + 1); + if (joined == NULL) + return -1; + char *cursor = joined; + memcpy(cursor, key, klen); + cursor += klen; + *cursor++ = '='; + memcpy(cursor, val, vlen); + cursor += vlen; + *cursor = '\0'; + return env_set_take(v, joined); +} + +/* Find KEY in a NULL-terminated "KEY=VAL" array. Returns a pointer to + * the VAL part inside the matching entry (not a copy), or NULL when + * the array is NULL or holds no "KEY=" entry. Entries without '=' never + * match. + */ +static const char *host_env_lookup(const char *const *host_environ, + const char *key, + size_t key_len) +{ + if (host_environ == NULL) + return NULL; + for (const char *const *it = host_environ; *it != NULL; it++) { + const char *entry = *it; + size_t elen = env_key_len(entry); + if (elen != key_len) + continue; + if (entry[elen] != '=') + continue; + if (memcmp(entry, key, key_len) != 0) + continue; + return entry + elen + 1; + } + return NULL; +} + +/* An Entrypoint/Cmd array counts as "set" only when it is non-NULL and + * has at least one element. An explicit empty array ([]) is treated + * like an absent one, so Cmd=[] paired with an Entrypoint cannot yield + * a zero-length argv. + */ +static bool runtime_arr_is_set(char *const *arr) +{ + if (arr == NULL) + return false; + return arr[0] != NULL; +} + +/* Accept only absolute paths with no ".." component. Empty components + * (doubled or trailing slashes) pass; normalizing them is left to the + * code that materializes the directory. Returns 0 when acceptable, + * -1 otherwise. + */ +static int validate_workdir(const char *s) +{ + if (s == NULL || s[0] != '/') + return -1; + for (const char *seg = s + 1; *seg != '\0';) { + const char *slash = strchr(seg, '/'); + size_t seg_len = slash != NULL ? (size_t) (slash - seg) : strlen(seg); + if (seg_len == 2 && memcmp(seg, "..", 2) == 0) + return -1; + if (slash == NULL) + break; + seg = slash + 1; + } + return 0; +} + +/* Resolve credentials from CLI --user override, then image User, then + * host inheritance. Both sources route through oci_user_lookup so the + * numeric / symbolic / mixed shapes are handled uniformly; the only + * difference between the two paths is the diagnostic prefix so a caller + * can tell whether the bad value came from their CLI flag or from the + * pulled image. 
Symbolic resolution requires flags->rootfs_for_nss; a + * NULL rootfs causes the helper to reject any symbolic token, preserving + * the "pure data" contract for callers that have not unpacked a rootfs. + */ +static int resolve_user(const oci_image_runtime_t *cfg, + const oci_runspec_flags_t *flags, + oci_runspec_t *out, + const char **err) +{ + /* Pick the source: a non-NULL CLI override always wins, otherwise a + * non-empty image User, otherwise nothing (inherit from the host). + */ + const bool from_cli = flags->user_override != NULL; + const char *wanted = NULL; + if (from_cli) + wanted = flags->user_override; + else if (cfg && cfg->user && *cfg->user) + wanted = cfg->user; + + if (wanted == NULL) { + out->has_creds = false; + return 0; + } + + const char *why = NULL; + if (oci_user_lookup(flags->rootfs_for_nss, wanted, &out->uid, &out->gid, + &why) >= 0) { + out->has_creds = true; + return 0; + } + + /* Keep the lookup's errno across message formatting; fall back to + * EINVAL when the lookup left it at zero. + */ + int saved = errno; + if (why == NULL) + why = "lookup failed"; + if (from_cli) + set_err_fmt(err, "--user '%s': %s", wanted, why); + else + set_err_fmt(err, "User '%s': %s", wanted, why); + errno = saved ? saved : EINVAL; + return -1; +} + +/* Choose the guest working directory: CLI override first, then a + * non-empty image WorkingDir, then "/". The choice must pass + * validate_workdir; a heap copy is stored in out->cwd. Returns 0 on + * success, -1 with errno and *err set on rejection or allocation + * failure. + */ +static int resolve_workdir(const oci_image_runtime_t *cfg, + const oci_runspec_flags_t *flags, + oci_runspec_t *out, + const char **err) +{ + const char *chosen = "/"; + if (flags->workdir_override) + chosen = flags->workdir_override; + else if (cfg && cfg->working_dir && *cfg->working_dir) + chosen = cfg->working_dir; + + if (validate_workdir(chosen) < 0) { + set_err_fmt(err, "WorkingDir must be absolute and not contain '..': %s", + chosen); + errno = EINVAL; + return -1; + } + + out->cwd = strdup(chosen); + if (out->cwd != NULL) + return 0; + set_err_static(err, "out of memory allocating cwd"); + errno = ENOMEM; + return -1; +} + +/* Argv build follows the override matrix from the Phase 3 plan. 
The + * matrix collapses to: --entrypoint clobbers everything (Cmd dropped, + * image Entrypoint dropped, [override] ++ CLI args); otherwise image + * Entrypoint and Cmd combine with the CLI tail using "CLI args drop Cmd + * but keep Entrypoint" precedence. + */ + +/* Append a copy of every CLI positional argument. Returns 0 on + * success, -1 on allocation failure. + */ +static int argv_push_positional(strvec_t *argv, + const oci_runspec_flags_t *flags) +{ + for (int i = 0; i < flags->positional_argc; i++) { + if (strvec_push_strdup(argv, flags->positional_argv[i]) < 0) + return -1; + } + return 0; +} + +/* Append a copy of every entry of a NULL-terminated vector. Returns 0 + * on success, -1 on allocation failure. + */ +static int argv_push_vec(strvec_t *argv, char *const *vec) +{ + for (char *const *p = vec; *p; p++) { + if (strvec_push_strdup(argv, *p) < 0) + return -1; + } + return 0; +} + +/* Assemble out->argv. Returns 0 on success; -1 with errno and *err set + * when allocation fails or when neither the image nor the CLI supplies + * anything to run. + */ +static int build_argv(const oci_image_runtime_t *cfg, + const oci_runspec_flags_t *flags, + oci_runspec_t *out, + const char **err) +{ + strvec_t argv = {0}; + int rc = -1; + const bool have_cli_args = flags->positional_argc > 0; + const bool have_entrypoint = cfg && runtime_arr_is_set(cfg->entrypoint); + const bool have_cmd = cfg && runtime_arr_is_set(cfg->cmd); + + if (flags->entrypoint_override) { + /* [override] ++ CLI args; image Entrypoint and Cmd are ignored. */ + if (strvec_push_strdup(&argv, flags->entrypoint_override) < 0) + goto oom; + if (argv_push_positional(&argv, flags) < 0) + goto oom; + } else { + if (have_entrypoint && argv_push_vec(&argv, cfg->entrypoint) < 0) + goto oom; + if (have_cli_args) { + /* CLI args replace the image Cmd. */ + if (argv_push_positional(&argv, flags) < 0) + goto oom; + } else if (have_cmd) { + if (argv_push_vec(&argv, cfg->cmd) < 0) + goto oom; + } else if (!have_entrypoint) { + set_err_static(err, + "image has no entrypoint or cmd; pass one on" + " the CLI"); + errno = EINVAL; + goto done; + } + } + + if (strvec_finalize(&argv, &out->argv) < 0) + goto oom; + rc = 0; + goto done; + +oom: + set_err_static(err, "out of memory building argv"); + errno = ENOMEM; +done: + strvec_free(&argv); + return rc; +} + +#define LINUX_DEFAULT_PATH \ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + +/* Apply the Env merge policy 
described in runspec.h. The build proceeds + * in stages so the merge order matches the spec: image Env first, CLI + * overrides second, then defaults (TERM, PATH, container). DYLD_ rejection + * runs before any allocation per CLI override so the failing key is + * reported with no half-applied state. + */ +static int build_envp(const oci_image_runtime_t *cfg, + const oci_runspec_flags_t *flags, + const char *const *host_environ, + oci_runspec_t *out, + const char **err) +{ + strvec_t env = {0}; + int rc = -1; + + if (cfg && cfg->env) { + for (char *const *p = cfg->env; *p; p++) { + if (env_set_strdup(&env, *p) < 0) /* a repeated KEY replaces the earlier entry */ + goto oom; + } + } + + for (size_t i = 0; i < flags->nenv_overrides; i++) { + const char *kv = flags->env_overrides[i]; + if (!kv) + continue; /* NULL slots in the override array are skipped */ + if (strncmp(kv, "DYLD_", 5) == 0) { + size_t klen = env_key_len(kv); + char key_only[64]; /* only the KEY is echoed; longer keys are truncated */ + size_t copy = + klen < sizeof(key_only) - 1 ? klen : sizeof(key_only) - 1; + memcpy(key_only, kv, copy); + key_only[copy] = '\0'; + set_err_fmt(err, + "--env: refusing to set DYLD_* (macOS-only ABI):" + " %s", + key_only); + errno = EINVAL; + goto done; + } + const char *eq = strchr(kv, '='); + if (eq) { + if (env_set_strdup(&env, kv) < 0) + goto oom; + } else { + size_t klen = strlen(kv); + const char *hv = host_env_lookup(host_environ, kv, klen); + if (hv) { /* bare KEY with no host value is dropped without error */ + if (env_set_pair(&env, kv, hv) < 0) + goto oom; + } + } + } + + if (env_lookup_idx(&env, "TERM", 4) < 0) { + const char *host_term = host_env_lookup(host_environ, "TERM", 4); + if (host_term) { + if (env_set_pair(&env, "TERM", host_term) < 0) + goto oom; + } + } + if (env_lookup_idx(&env, "PATH", 4) < 0) { + if (env_set_strdup(&env, LINUX_DEFAULT_PATH) < 0) + goto oom; + } + if (env_set_strdup(&env, "container=elfuse") < 0) /* unconditional: replaces any earlier container= entry */ + goto oom; + + if (strvec_finalize(&env, &out->envp) < 0) + goto oom; + rc = 0; + goto done; + +oom: + set_err_static(err, "out of memory building envp"); + errno = ENOMEM; +done: + strvec_free(&env); /* no-op after a successful finalize, which empties the vector */ + return rc; +} + +int 
oci_runspec_build(const oci_image_runtime_t *cfg, + const oci_runspec_flags_t *flags, + const char *const *host_environ, + oci_runspec_t *out, + const char **err) +{ + if (err) + *err = NULL; + if (!flags || !out) { + set_err_static(err, "oci_runspec_build: NULL flags or out"); + errno = EINVAL; + return -1; + } + memset(out, 0, sizeof(*out)); /* clean slate so the fail path can free unconditionally */ + + if (resolve_user(cfg, flags, out, err) < 0) + goto fail; + if (resolve_workdir(cfg, flags, out, err) < 0) + goto fail; + if (build_argv(cfg, flags, out, err) < 0) + goto fail; + if (build_envp(cfg, flags, host_environ, out, err) < 0) + goto fail; + return 0; + +fail: + oci_runspec_free(out); /* releases whatever the earlier stages already stored */ + memset(out, 0, sizeof(*out)); + return -1; +} + +void oci_runspec_free(oci_runspec_t *spec) +{ + if (!spec) + return; + free(spec->cwd); + spec->cwd = NULL; + if (spec->argv) { + for (char **p = spec->argv; *p; p++) + free(*p); + free(spec->argv); + spec->argv = NULL; + } + if (spec->envp) { + for (char **p = spec->envp; *p; p++) + free(*p); + free(spec->envp); + spec->envp = NULL; + } + spec->uid = 0; /* scalar fields are reset too, so a freed spec reads as empty */ + spec->gid = 0; + spec->has_creds = false; +} diff --git a/src/oci/runspec.h b/src/oci/runspec.h new file mode 100644 index 0000000..565f4bb --- /dev/null +++ b/src/oci/runspec.h @@ -0,0 +1,123 @@ +/* OCI launch-spec resolver for elfuse oci run + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Folds an image-config runtime block (User, WorkingDir, Entrypoint, Cmd, + * Env) together with elfuse oci run CLI overrides into the concrete launch + * bundle: guest cwd, argv, envp, and optional uid/gid credentials. The + * module is pure data: no filesystem access, no PATH lookup, no syscalls. + * That keeps the override matrix and the Env merge policy verifiable by a + * unit test that constructs an oci_image_runtime_t literal directly. + * + * PATH search and rootfs materialization happen in later Phase 3 modules + * (path-resolve, run). 
This builder treats argv[0] as opaque; the caller + * downstream is responsible for resolving the host path the guest will + * actually load. + * + * Override matrix (issue #31 acceptance 2) follows the table documented in + * the Phase 3 plan: --entrypoint replaces the image Entrypoint and the + * image Cmd is dropped whenever CLI positional arguments are provided or + * --entrypoint is set. The "image has neither Entrypoint nor Cmd and the + * CLI supplied no positional arguments" case is the only hard-fail in the + * argv assembly path. + * + * Env merge (issue #31 acceptance 3) starts from the image Env array, + * applies CLI -e overrides (KEY=VAL set-or-replace; bare KEY imports the + * matching host environ value when present and otherwise drops silently), + * auto-imports TERM from the host when the merged Env has no TERM, injects + * the Linux PAM-default PATH when no PATH key has been set, and finally + * forces container=elfuse so systemd-style sandbox detection works + * regardless of what the image declared. CLI overrides whose KEY starts + * with DYLD_ hard-fail with EINVAL: DYLD_* is a macOS-only loader contract + * and has no meaning inside the guest. Image-provided DYLD_* entries pass + * through (aarch64 Linux ignores them); reviewers can escalate to strip + * if needed. + * + * WorkingDir defaults to "/" when neither the image nor the CLI sets it. + * Relative paths and any path containing a ".." segment hard-fail with + * EINVAL; sysroot containment is enforced later by the path-resolve module + * and the syscall layer. + * + * User accepts the seven shapes the OCI image-spec defines: empty (no + * override), "uid", "uid:gid", "name", "name:group", "uid:group", and + * "name:gid". Symbolic forms (anything other than the two pure-numeric + * shapes) require flags->rootfs_for_nss; the resolver reads + * /etc/passwd and /etc/group through oci_user_lookup + * (see src/oci/user-lookup.h). 
When rootfs_for_nss is NULL the resolver + * still accepts the pure-numeric forms; a symbolic token then returns + * EINVAL so unit tests can hold runspec to its "pure data" contract by + * passing NULL. CLI --user takes precedence over the image User; both + * shapes go through the same lookup. + * + * Error reporting: on failure, the function writes a pointer into *err + * that names what went wrong. The pointer is valid until the next call + * from this thread. Static messages (the fixed strings) and dynamic + * messages (those that quote the bad value) share the same lifetime + * contract so the caller does not need to branch. + */ + +#pragma once + +#include +#include +#include + +#include "manifest.h" + +/* CLI overrides assembled by the oci run argument parser. NULL fields mean + * "no override; defer to the image config". env_overrides is the literal + * sequence of -e arguments (KEY=VAL or bare KEY); the merge logic below + * walks it in order. positional_argv is the IMAGE-trailing argv tail + * (everything after the image reference on the command line). + */ +typedef struct { + const char *entrypoint_override; + const char *const *env_overrides; + size_t nenv_overrides; + const char *workdir_override; + const char *user_override; + int positional_argc; + const char *const *positional_argv; + /* Directory the symbolic User resolver treats as "/" when reading + * /etc/passwd and /etc/group. NULL keeps the builder pure-data: any + * symbolic User token then returns EINVAL. + */ + const char *rootfs_for_nss; +} oci_runspec_flags_t; + +/* Resolved launch bundle. cwd is always set (defaults to "/"). argv and + * envp are NULL-terminated heap arrays of heap strings. has_creds is + * false when neither the image nor the CLI named a User; in that case the + * caller leaves the host uid/gid untouched and uid/gid in this struct are + * meaningless. 
+ */ +typedef struct { + char *cwd; + char **argv; + char **envp; + uint32_t uid; + uint32_t gid; + bool has_creds; +} oci_runspec_t; + +/* Resolve image runtime config + CLI overrides into a launch bundle. + * + * cfg may be NULL (treated as "image config has no runtime block": all + * fields absent). flags must be non-NULL but may be zero-initialised. + * host_environ is a NULL-terminated environ-shaped pointer for KEY=VAL + * lookups; pass the process environ. out must be non-NULL; on entry it + * is overwritten verbatim. On success returns 0 and out owns all heap + * storage (call oci_runspec_free to release). On failure returns -1 + * with errno set (EINVAL for policy violations, ENOMEM for allocation + * failures), *out left zeroed (safe to pass to oci_runspec_free), and + * *err pointing at the diagnostic message. + */ +int oci_runspec_build(const oci_image_runtime_t *cfg, + const oci_runspec_flags_t *flags, + const char *const *host_environ, + oci_runspec_t *out, + const char **err); + +/* Release any heap fields. Safe on zero-initialised structs and on NULL. */ +void oci_runspec_free(oci_runspec_t *spec); diff --git a/src/oci/runtime-files.c b/src/oci/runtime-files.c new file mode 100644 index 0000000..f7baa8a --- /dev/null +++ b/src/oci/runtime-files.c @@ -0,0 +1,459 @@ +/* OCI per-run /etc host-truth injection + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * See runtime-files.h for the contract. Implementation walks the + * three target leaves under /etc/ in turn, unlinking any + * pre-existing inode first so a symlink left by the image (the + * common case for /etc/resolv.conf) cannot dangle past the + * injection. + * + * The nameserver list for resolv.conf is harvested by spawning + * /usr/sbin/scutil --dns and line-walking its stdout for the + * " nameserver[N] : " pattern. 
The reader uses posix_spawnp + * with a pipe, mirroring src/core/sysroot.c's spawn_capture_stdout + * helper; duplicating the ~40 lines here keeps the oci module's + * lifetime story self-contained instead of hoisting a host-only + * helper into a public header. + */ + +#include "runtime-files.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char **environ; + +static _Thread_local char rf_err_buf[512]; + +static void set_err_static(const char **err, const char *msg) +{ + if (err) + *err = msg; +} + +static void set_err_fmt(const char **err, const char *fmt, ...) + __attribute__((format(printf, 2, 3))); + +#include + +static void set_err_fmt(const char **err, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vsnprintf(rf_err_buf, sizeof(rf_err_buf), fmt, ap); + va_end(ap); + if (err) + *err = rf_err_buf; +} + +/* Loop-write the full payload; EINTR retries and short writes are + * folded back into another write call. Returns 0 on success, -1 with + * errno preserved on the failing call. + */ +static int write_full(int fd, const char *buf, size_t n) +{ + size_t off = 0; + while (off < n) { + ssize_t w = write(fd, buf + off, n - off); + if (w < 0) { + if (errno == EINTR) + continue; + return -1; + } + off += (size_t) w; + } + return 0; +} + +/* mkdir /etc 0755; tolerate EEXIST iff it is a directory. 
*/
+static int ensure_etc_dir(const char *run_dir, const char **err)
+{
+    char etc_path[4096];
+    int n = snprintf(etc_path, sizeof(etc_path), "%s/etc", run_dir);
+    if (n < 0 || (size_t) n >= sizeof(etc_path)) {
+        set_err_static(err, "runtime-files: /etc path overflow");
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    if (mkdir(etc_path, 0755) == 0)
+        return 0;
+    if (errno != EEXIST) {
+        int saved = errno; /* strerror / set_err_fmt may clobber errno */
+        set_err_fmt(err, "runtime-files: mkdir %s failed: %s", etc_path,
+                    strerror(saved));
+        errno = saved;
+        return -1;
+    }
+    /* EEXIST: accept only a real directory. lstat (not stat) means a
+     * symlink named etc is reported as "not a directory" instead of being
+     * followed.
+     */
+    struct stat st;
+    if (lstat(etc_path, &st) < 0) {
+        int saved = errno;
+        set_err_fmt(err, "runtime-files: lstat %s failed: %s", etc_path,
+                    strerror(saved));
+        errno = saved;
+        return -1;
+    }
+    if (!S_ISDIR(st.st_mode)) {
+        set_err_fmt(err, "runtime-files: %s exists but is not a directory",
+                    etc_path);
+        errno = ENOTDIR;
+        return -1;
+    }
+    return 0;
+}
+
+/* Build <run_dir>/etc/<leaf>, remove any pre-existing inode (file or
+ * symlink) at that path, then create a fresh file for write. Returns
+ * the new fd on success, -1 with *err set and errno preserved on
+ * failure.
+ *
+ * The file is created with O_EXCL so that a path re-created between the
+ * unlink and the open (for example a symlink planted by a concurrent
+ * writer) makes the open fail with EEXIST instead of being followed or
+ * truncated. O_CLOEXEC keeps the descriptor out of child processes:
+ * write_resolv_conf holds this fd open while scutil is spawned.
+ */
+static int open_etc_overwrite(const char *run_dir,
+                              const char *leaf,
+                              const char **err)
+{
+    char path[4096];
+    int n = snprintf(path, sizeof(path), "%s/etc/%s", run_dir, leaf);
+    if (n < 0 || (size_t) n >= sizeof(path)) {
+        set_err_fmt(err, "runtime-files: path for /etc/%s too long", leaf);
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    /* A missing target is the normal case, not an error. */
+    if (unlink(path) < 0 && errno != ENOENT) {
+        int saved = errno;
+        set_err_fmt(err, "runtime-files: unlink %s failed: %s", path,
+                    strerror(saved));
+        errno = saved;
+        return -1;
+    }
+    int fd = open(path, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0644);
+    if (fd < 0) {
+        int saved = errno;
+        set_err_fmt(err, "runtime-files: open %s failed: %s", path,
+                    strerror(saved));
+        errno = saved;
+        return -1;
+    }
+    return fd;
+}
+
+/* Append a nameserver string to the dedup list. Returns 0 on success,
+ * -1 with errno=ENOMEM on allocation failure.
Duplicates are
+ * silently dropped so resolver #1's primary appears once even when
+ * scutil prints "nameserver[0]" twice for IPv4 + IPv6 of the same
+ * interface.
+ */
+static int push_nameserver(char ***list,
+                           size_t *n,
+                           size_t *cap,
+                           const char *ip,
+                           size_t iplen)
+{
+    char **items = *list;
+    const size_t count = *n;
+
+    /* Dedup pass: an entry matches only when both length and bytes agree
+     * (ip is a length-delimited slice, not a NUL-terminated string).
+     */
+    for (size_t k = 0; k != count; k++) {
+        const char *have = items[k];
+        if (strlen(have) != iplen)
+            continue;
+        if (memcmp(have, ip, iplen) == 0)
+            return 0;
+    }
+
+    /* Grow the pointer array geometrically, starting at four slots. The
+     * caller's list / cap are only updated once realloc has succeeded.
+     */
+    if (count + 1 > *cap) {
+        const size_t grown = (*cap == 0) ? 4 : *cap * 2;
+        char **bigger = realloc(items, grown * sizeof(*bigger));
+        if (bigger == NULL) {
+            errno = ENOMEM;
+            return -1;
+        }
+        items = bigger;
+        *list = bigger;
+        *cap = grown;
+    }
+
+    /* Store a private NUL-terminated copy of the slice. */
+    char *copy = malloc(iplen + 1);
+    if (copy == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+    memcpy(copy, ip, iplen);
+    copy[iplen] = '\0';
+    items[count] = copy;
+    *n = count + 1;
+    return 0;
+}
+
+/* Parse one line of scutil --dns output. The relevant shape is:
+ *
+ * " nameserver[0] : 192.168.1.1"
+ * " nameserver[1] : 2001:db8::1"
+ *
+ * Anything else is ignored. The IP literal is the trailing token
+ * after the colon, with surrounding whitespace stripped. Validation
+ * is intentionally loose: scutil is the source of truth, so any
+ * non-empty token after "nameserver[N] :" is taken at face value.
+ */
+static void parse_scutil_line(const char *line,
+                              size_t len,
+                              char ***list,
+                              size_t *n,
+                              size_t *cap)
+{
+    const char *p = line;
+    const char *end = line + len;
+    while (p < end && isspace((unsigned char) *p))
+        p++;
+    static const char prefix[] = "nameserver[";
+    size_t plen = sizeof(prefix) - 1;
+    if ((size_t) (end - p) < plen)
+        return;
+    if (memcmp(p, prefix, plen) != 0)
+        return;
+    p += plen;
+    /* Skip the index; its value is not inspected. */
+    while (p < end && *p != ']')
+        p++;
+    if (p >= end || *p != ']')
+        return;
+    p++;
+    while (p < end && isspace((unsigned char) *p))
+        p++;
+    if (p >= end || *p != ':')
+        return;
+    p++;
+    while (p < end && isspace((unsigned char) *p))
+        p++;
+    const char *ip = p;
+    while (p < end && !isspace((unsigned char) *p))
+        p++;
+    if (p == ip)
+        return;
+    /* Deliberately best-effort: an allocation failure drops this entry. */
+    (void) push_nameserver(list, n, cap, ip, (size_t) (p - ip));
+}
+
+/* Run /usr/sbin/scutil --dns, capture stdout, harvest nameservers.
+ * Returns 0 on success with *out_list / *out_n populated (out_list
+ * may be NULL when out_n == 0). Returns -1 on any spawn / read /
+ * wait failure so the caller can fall back to a known-good set; on
+ * every -1 path *out_list stays NULL and *out_n stays 0, so there is
+ * nothing for the caller to free.
+ */
+static int scutil_collect_nameservers(char ***out_list, size_t *out_n)
+{
+    *out_list = NULL;
+    *out_n = 0;
+
+    int pipefd[2] = {-1, -1};
+    if (pipe(pipefd) < 0)
+        return -1;
+
+    posix_spawn_file_actions_t actions;
+    if (posix_spawn_file_actions_init(&actions) != 0) {
+        close(pipefd[0]);
+        close(pipefd[1]);
+        return -1;
+    }
+    /* Child: stdout becomes the pipe's write end and both original pipe
+     * descriptors are closed. Each add call returns an error number, so
+     * spawning with a half-built action list is refused.
+     */
+    int act_rc =
+        posix_spawn_file_actions_adddup2(&actions, pipefd[1], STDOUT_FILENO);
+    if (act_rc == 0)
+        act_rc = posix_spawn_file_actions_addclose(&actions, pipefd[0]);
+    if (act_rc == 0)
+        act_rc = posix_spawn_file_actions_addclose(&actions, pipefd[1]);
+    if (act_rc != 0) {
+        posix_spawn_file_actions_destroy(&actions);
+        close(pipefd[0]);
+        close(pipefd[1]);
+        errno = act_rc;
+        return -1;
+    }
+
+    char *const argv[] = {(char *) "/usr/sbin/scutil", (char *) "--dns", NULL};
+    pid_t pid = -1;
+    int spawn_ret = posix_spawn(&pid, argv[0], &actions, NULL, argv, environ);
+    posix_spawn_file_actions_destroy(&actions);
+    /* Drop the parent's write end so read() sees EOF once the child exits. */
+    close(pipefd[1]);
+    if (spawn_ret != 0) {
+        close(pipefd[0]);
+        errno = spawn_ret;
+        return -1;
+    }
+
+    /* Drain stdout. scutil --dns output is bounded by the number of
+     * resolver entries macOS keeps; 16 KiB is comfortable for any
+     * realistic host.
+     */
+    char buf[16384];
+    size_t off = 0;
+    bool drained = true;
+    while (off + 1 < sizeof(buf)) {
+        ssize_t r = read(pipefd[0], buf + off, sizeof(buf) - 1 - off);
+        if (r < 0) {
+            if (errno == EINTR)
+                continue;
+            drained = false;
+            break;
+        }
+        if (r == 0)
+            break;
+        off += (size_t) r;
+    }
+    /* The capture loop only stops without EOF or an error when the buffer
+     * is full; the last line may then be cut mid-token.
+     */
+    const bool truncated = drained && off + 1 >= sizeof(buf);
+    /* Best-effort drain past the cap so the child neither blocks on a
+     * full pipe nor takes SIGPIPE when the read end is closed; the
+     * parser only sees the first 16 KiB anyway.
+     */
+    if (drained) {
+        char dump[4096];
+        for (;;) {
+            ssize_t r = read(pipefd[0], dump, sizeof(dump));
+            if (r > 0)
+                continue;
+            if (r < 0 && errno == EINTR)
+                continue;
+            break;
+        }
+    }
+    close(pipefd[0]);
+
+    /* Always reap the child, even after a read error, to avoid a zombie. */
+    int status = 0;
+    while (waitpid(pid, &status, 0) < 0) {
+        if (errno != EINTR)
+            return -1;
+    }
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+        return -1;
+    /* A hard read error means the capture is incomplete; report failure
+     * rather than parsing a partial nameserver list.
+     */
+    if (!drained)
+        return -1;
+
+    char **list = NULL;
+    size_t n = 0;
+    size_t cap = 0;
+    buf[off] = '\0';
+    char *line = buf;
+    char *p = buf;
+    while (p < buf + off) {
+        if (*p == '\n') {
+            parse_scutil_line(line, (size_t) (p - line), &list, &n, &cap);
+            line = p + 1;
+        }
+        p++;
+    }
+    /* Unterminated tail: parse it unless the capture hit the cap, where a
+     * cut-off address would otherwise be recorded as a nameserver.
+     */
+    if (line < buf + off && !truncated)
+        parse_scutil_line(line, (size_t) ((buf + off) - line), &list, &n, &cap);
+
+    *out_list = list;
+    *out_n = n;
+    return 0;
+}
+
+static int write_resolv_conf(const char *run_dir, const char **err)
+{
+    int fd = open_etc_overwrite(run_dir, "resolv.conf", err);
+    if (fd < 0)
+        return -1;
+
+    char **ns_list = NULL;
+    size_t ns_n = 0;
+    bool used_fallback = false;
+    if (scutil_collect_nameservers(&ns_list, &ns_n) < 0 || ns_n == 0)
+        used_fallback = true;
+
+    int rc = 0;
+    if (used_fallback) {
+        static const char fallback[] =
+            "nameserver 8.8.8.8\n"
+            "nameserver 1.1.1.1\n";
+        if (write_full(fd, fallback, sizeof(fallback) - 1) < 0)
+            rc = -1;
+    } else {
+        for (size_t i = 0; i < ns_n; i++) {
+            char line[128];
+            int n = snprintf(line, sizeof(line), "nameserver %s\n", ns_list[i]);
+            if (n < 0 || (size_t) n >= sizeof(line)) {
+                /* Skip oversized entries silently; scutil should never
+                 * print one but the loop must not abort on a bad
+                 * resolver string.
+                 */
+                continue;
+            }
+            if (write_full(fd, line, (size_t) n) < 0) {
+                rc = -1;
+                break;
+            }
+        }
+    }
+
+    /* Capture errno before the frees below; it is only used when rc < 0. */
+    int saved = errno;
+    for (size_t i = 0; i < ns_n; i++)
+        free(ns_list[i]);
+    free(ns_list);
+
+    if (rc < 0) {
+        set_err_fmt(err, "runtime-files: write resolv.conf failed: %s",
+                    strerror(saved));
+        close(fd);
+        errno = saved;
+        return -1;
+    }
+    if (close(fd) < 0) {
+        int e = errno;
+        set_err_fmt(err, "runtime-files: close resolv.conf failed: %s",
+                    strerror(e));
+        errno = e;
+        return -1;
+    }
+    return 0;
+}
+
+static int write_hosts(const char *run_dir, const char **err)
+{
+    static const char body[] =
+        "127.0.0.1 localhost\n"
+        "::1 localhost ip6-localhost "
+        "ip6-loopback\n"
+        "ff02::1 ip6-allnodes\n"
+        "ff02::2 ip6-allrouters\n"
+        "127.0.0.1 host.elfuse.internal\n";
+    int fd = open_etc_overwrite(run_dir, "hosts", err);
+    if (fd < 0)
+        return -1;
+    if (write_full(fd, body, sizeof(body) - 1) < 0) {
+        int saved = errno;
+        set_err_fmt(err, "runtime-files: write hosts failed: %s",
+                    strerror(saved));
+        close(fd);
+        errno = saved;
+        return -1;
+    }
+    if (close(fd) < 0) {
+        int e = errno;
+        set_err_fmt(err, "runtime-files: close hosts failed: %s", strerror(e));
+        errno = e;
+        return -1;
+    }
+    return 0;
+}
+
+static int write_hostname(const char *run_dir, const char **err)
+{
+    static const char body[] = "elfuse\n";
+    int fd = open_etc_overwrite(run_dir, "hostname", err);
+    if (fd < 0)
+        return -1;
+    if (write_full(fd, body, sizeof(body) - 1) < 0) {
+        int saved = errno;
+        set_err_fmt(err, "runtime-files: write hostname failed: %s",
+                    strerror(saved));
+        close(fd);
+        errno = saved;
+        return -1;
+    }
+    if (close(fd) < 0) {
+        int e = errno;
+        set_err_fmt(err, "runtime-files: close hostname failed: %s",
+                    strerror(e));
+        errno = e;
+        return -1;
+    }
+    return 0;
+}
+
+int oci_runtime_files_inject(const char *run_dir, const char **err)
+{
+    if (err)
+        *err = NULL;
+    if (!run_dir || !*run_dir) {
+        set_err_static(err, "runtime-files: NULL/empty run_dir");
+        errno = EINVAL;
+        return -1;
+    }
+    if (ensure_etc_dir(run_dir, err) < 0)
+        return -1;
+    if (write_resolv_conf(run_dir, err) < 0)
+        return -1;
+    if (write_hosts(run_dir, err) < 0)
+        return -1;
+    if (write_hostname(run_dir, err) < 0)
+        return -1;
+    return 0;
+}
diff --git a/src/oci/runtime-files.h b/src/oci/runtime-files.h
new file mode 100644
index 0000000..23c707c
--- /dev/null
+++ b/src/oci/runtime-files.h
@@ -0,0 +1,46 @@
+/* OCI per-run /etc host-truth injection
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Phase 4 F4.2 (/etc/resolv.conf) and F4.3 (/etc/hosts, /etc/hostname)
+ * ask the runtime to overlay three files on the per-run clone-rootfs
+ * so guest libc lookups (getaddrinfo, gethostname, /etc/hosts walks)
+ * see values matching the macOS host rather than the image's
+ * containerd defaults. The entry point is invoked from src/oci/run.c
+ * after oci_clone_rootfs and before the manifest parse.
+ */
+
+#ifndef ELFUSE_OCI_RUNTIME_FILES_H
+#define ELFUSE_OCI_RUNTIME_FILES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Write /etc/{resolv.conf,hosts,hostname} into <run_dir>/etc/.
+ *
+ * Creates the <run_dir>/etc/ directory at mode 0755 if it is missing.
+ * Overwrites any pre-existing file or symlink at the three target
+ * paths (image distros often ship /etc/resolv.conf as a symlink to
+ * /run/systemd/resolve/stub-resolv.conf, which would otherwise
+ * dangle inside the guest).
+ *
+ * /etc/resolv.conf is built from "nameserver <ip>" entries reported
+ * by scutil --dns; on any scutil failure or zero hits the helper
+ * falls back to 8.8.8.8 / 1.1.1.1. /etc/hosts is a fixed five-line
+ * block with localhost, ip6-loopback aliases, link-local multicast
+ * names, and host.elfuse.internal. /etc/hostname is the literal
+ * string "elfuse\n".
+ *
+ * Returns 0 on success, -1 on the first irrecoverable error with
+ * *err pointing at a static diagnostic.
errno is preserved on the + * failing syscall. + */ +int oci_runtime_files_inject(const char *run_dir, const char **err); + +#ifdef __cplusplus +} +#endif + +#endif /* ELFUSE_OCI_RUNTIME_FILES_H */ diff --git a/src/oci/status.c b/src/oci/status.c new file mode 100644 index 0000000..3c8ae46 --- /dev/null +++ b/src/oci/status.c @@ -0,0 +1,807 @@ +/* Store-wide OCI status report + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Single walker over both store sources (pins via index.json + optional + * unpacked sysroots) plus three disk sweeps (blobs/, layers/, layers/stacks/). + * The pin walker resolves each manifest down to its image-config diff_ids, + * accumulates reachable diff_id and ChainID prefix sets into shared + * oci_digest_set_t accumulators, and records per-pin status / sizes / mtime + * for the CLI render. Unpacked sysroots feed the same accumulators from + * .elfuse-origin.json so no blob read is needed. + * + * After the walk, every diff_id in the reachable set is probed against + * /layers/// to compute the raw cache populate ratio; the + * same probe runs against /layers/stacks/// for the ChainID + * accumulator. The store sweeps under blobs//, layers//, and + * layers/stacks// count every entry (regardless of reachability) so the + * STORE TOTALS section can report the disk footprint. + * + * Failure policy: any per-entry error (missing manifest blob, malformed + * origin sidecar, bad image-config JSON) is recorded as the entry's status + * code rather than aborting. Fatal cases are reserved for the few states + * where no snapshot is possible: a NULL store, a failure walking + * /blobs/, or any other IO error during the directory sweeps. + * + * Duplication note: slurp_blob / sum_tree_size / resolve_config_digest / + * load_diff_ids / accumulate_chain are pattern-duplicates of helpers in + * src/oci/dedup-metrics.c and src/oci/store.c. 
The Plan 3 memory documents + * the lift threshold (deferred until a fourth caller of the diff_id walker + * pattern appears); status.c is the third caller, still below the cutoff. + */ + +#include "status.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blob-store.h" +#include "digest-set.h" +#include "digest.h" +#include "manifest.h" +#include "origin-meta.h" +#include "volume.h" + +#define STATUS_PATH_MAX 4096 + +/* Largest blob this helper will read into a heap buffer. Mirrors the + * 64 MiB cap used by dedup-metrics.c so manifest / image-config parse + * failure modes stay uniform across the OCI module set. + */ +#define STATUS_BLOB_MAX ((size_t) 64 * 1024 * 1024) + +/* ── Helpers duplicated from dedup-metrics.c / store.c ───────────────── */ + +/* Slurp a blob into a fresh heap buffer, NUL-terminated for parser ergonomics. + * Returns 0 on success and writes the body + length; -1 with errno preserved + * on failure. Caller frees *out_body. 
+ */ +static int slurp_blob(oci_blob_store_t *blobs, + const char *digest_str, + char **out_body, + size_t *out_len) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) { + errno = EINVAL; + return -1; + } + char path[STATUS_PATH_MAX]; + int n = oci_blob_store_path(blobs, algo, hex, path, sizeof(path)); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -1; + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + errno = saved; + return -1; + } + if (st.st_size < 0 || (uintmax_t) st.st_size > STATUS_BLOB_MAX) { + close(fd); + errno = EFBIG; + return -1; + } + size_t want = (size_t) st.st_size; + char *buf = malloc(want + 1); + if (!buf) { + close(fd); + errno = ENOMEM; + return -1; + } + size_t off = 0; + while (off < want) { + ssize_t r = read(fd, buf + off, want - off); + if (r < 0) { + if (errno == EINTR) + continue; + int saved = errno; + free(buf); + close(fd); + errno = saved; + return -1; + } + if (r == 0) + break; + off += (size_t) r; + } + close(fd); + if (off != want) { + free(buf); + errno = EIO; + return -1; + } + buf[want] = '\0'; + *out_body = buf; + *out_len = want; + return 0; +} + +/* Stat manifest blob to capture size + mtime for the pin row. Returns 0 on + * hit; -1 with errno=ENOENT on miss (caller treats as MISSING_MANIFEST) or + * with errno preserved on other failures. 
+ */ +static int stat_blob(oci_blob_store_t *blobs, + const char *digest_str, + uint64_t *out_size, + int64_t *out_mtime) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) { + errno = EINVAL; + return -1; + } + char path[STATUS_PATH_MAX]; + int n = oci_blob_store_path(blobs, algo, hex, path, sizeof(path)); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + struct stat st; + if (stat(path, &st) < 0) + return -1; + if (out_size) + *out_size = (uint64_t) (st.st_size < 0 ? 0 : st.st_size); + if (out_mtime) + *out_mtime = (int64_t) st.st_mtime; + return 0; +} + +/* Recursive st_size sum over a path tree. Returns the accumulated total on + * success or 0 when the entry is absent / unreadable (a missing layer cache + * entry simply contributes zero bytes). Symlinks are skipped via lstat so a + * stray symlink can never inflate the count. + */ +static uint64_t sum_tree_size(const char *path) +{ + struct stat st; + if (lstat(path, &st) < 0) + return 0; + if (S_ISREG(st.st_mode)) + return (uint64_t) st.st_size; + if (!S_ISDIR(st.st_mode)) + return 0; + DIR *d = opendir(path); + if (!d) + return 0; + uint64_t total = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[STATUS_PATH_MAX]; + int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name); + if (n < 0 || (size_t) n >= sizeof(child)) + continue; + total += sum_tree_size(child); + } + closedir(d); + return total; +} + +/* Free a NULL-terminated char ** array (strdup'd entries plus the array). */ +static void free_strv(char **v) +{ + if (!v) + return; + for (size_t i = 0; v[i]; i++) + free(v[i]); + free((void *) v); +} + +/* Resolve a manifest blob to its image-config digest. Walks one level of + * image-index indirection (linux/arm64 pick). 
Returns a heap-allocated + * ":" config digest on success (caller frees) or NULL with errno + * set on failure: + * ENOENT - manifest blob missing OR index has no linux/arm64 entry + * EINVAL - manifest blob neither parseable as manifest nor index + * other - propagated from slurp_blob + * + * out_status_code, when non-NULL, receives a hint matching the pin status + * enum so the caller can map ENOENT-from-missing-sub-manifest to + * MISSING_MANIFEST (rather than INDEX_NO_ARM64) and so on. + */ +static char *resolve_config_digest(oci_store_t *store, + const char *manifest_digest, + oci_status_pin_code_t *out_status_code) +{ + oci_blob_store_t *blobs = oci_store_blobs(store); + char *body = NULL; + size_t body_len = 0; + if (slurp_blob(blobs, manifest_digest, &body, &body_len) < 0) { + if (out_status_code) + *out_status_code = (errno == ENOENT) + ? OCI_STATUS_PIN_MISSING_MANIFEST + : OCI_STATUS_PIN_CORRUPT_MANIFEST; + return NULL; + } + + oci_manifest_t mf = {0}; + if (oci_manifest_parse(body, body_len, &mf, NULL) == 0) { + char *cfg = strdup(mf.config.digest_str); + oci_manifest_free(&mf); + free(body); + if (!cfg) { + errno = ENOMEM; + if (out_status_code) + *out_status_code = OCI_STATUS_PIN_CORRUPT_CONFIG; + return NULL; + } + return cfg; + } + + oci_index_t idx = {0}; + if (oci_index_parse(body, body_len, &idx, NULL) < 0) { + free(body); + errno = EINVAL; + if (out_status_code) + *out_status_code = OCI_STATUS_PIN_CORRUPT_MANIFEST; + return NULL; + } + free(body); + + const oci_index_entry_t *picked = oci_index_pick_linux_arm64(&idx); + if (!picked) { + oci_index_free(&idx); + errno = ENOENT; + if (out_status_code) + *out_status_code = OCI_STATUS_PIN_INDEX_NO_ARM64; + return NULL; + } + char *sub_digest = strdup(picked->desc.digest_str); + oci_index_free(&idx); + if (!sub_digest) { + errno = ENOMEM; + if (out_status_code) + *out_status_code = OCI_STATUS_PIN_CORRUPT_CONFIG; + return NULL; + } + /* Recurse on the picked sub-manifest. 
Status code on a sub-manifest miss + * is MISSING_MANIFEST rather than INDEX_NO_ARM64: the index itself was + * fine, the sub-manifest blob just is not on disk. + */ + oci_status_pin_code_t sub_code = OCI_STATUS_PIN_OK; + char *cfg = resolve_config_digest(store, sub_digest, &sub_code); + free(sub_digest); + if (!cfg && out_status_code) + *out_status_code = sub_code; + return cfg; +} + +/* Load the rootfs.diff_ids array from an image-config blob. Returns a fresh + * NULL-terminated char ** on success (free via free_strv); NULL with errno + * set on missing / unparseable / OOM. The caller maps the failure to the + * CORRUPT_CONFIG pin status. + */ +static char **load_diff_ids(oci_store_t *store, const char *config_digest) +{ + oci_blob_store_t *blobs = oci_store_blobs(store); + char *body = NULL; + size_t body_len = 0; + if (slurp_blob(blobs, config_digest, &body, &body_len) < 0) + return NULL; + + oci_image_config_t cfg = {0}; + if (oci_image_config_parse(body, body_len, &cfg, NULL) < 0) { + free(body); + errno = EINVAL; + return NULL; + } + free(body); + + size_t n = 0; + while (cfg.rootfs_diff_ids[n]) + n++; + char **copy = (char **) calloc(n + 1, sizeof(*copy)); + if (!copy) { + oci_image_config_free(&cfg); + errno = ENOMEM; + return NULL; + } + for (size_t i = 0; i < n; i++) { + copy[i] = strdup(cfg.rootfs_diff_ids[i]); + if (!copy[i]) { + free_strv(copy); + oci_image_config_free(&cfg); + errno = ENOMEM; + return NULL; + } + } + oci_image_config_free(&cfg); + return copy; +} + +/* Add every diff_id and the per-layer ChainID prefix into the accumulators. + * Returns 0 on success, -1 with errno set on chainid or set-add failure. + * Mid-walk failures leave partial entries in the sets; status.c treats the + * caller as "this image's contribution is corrupt" and lets the partial + * entries stand because they cannot inflate the populated-count beyond + * reality (the union shape of digest_set means later images naturally dedup). 
+ */
+static int accumulate_chain(char *const *diff_ids,
+                            oci_digest_set_t *diff_acc,
+                            oci_digest_set_t *chain_acc)
+{
+    char parent[OCI_DIGEST_HEX_MAX + 16] = "";
+    /* NULL for the first layer; afterwards the chain value produced by
+     * the previous iteration.
+     */
+    const char *parent_arg = NULL;
+
+    for (char *const *layer = diff_ids; *layer != NULL; layer++) {
+        char current[OCI_DIGEST_HEX_MAX + 16];
+
+        if (oci_digest_set_add(diff_acc, *layer) < 0)
+            return -1;
+        if (oci_chainid_compute(parent_arg, *layer, current,
+                                sizeof(current)) < 0)
+            return -1;
+        /* Remember this layer's chain value as the next layer's parent
+         * before recording it in the accumulator.
+         */
+        memcpy(parent, current, strlen(current) + 1);
+        parent_arg = parent;
+        if (oci_digest_set_add(chain_acc, current) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+/* ── Disk sweeps for STORE TOTALS ─────────────────────────────────────── */
+
+/* True when name has exactly the hex length of the supplied algorithm and
+ * passes its hex validation; NULL is never a digest name. Mirrors the
+ * filter store.c uses to reject foreign state under blobs/<algo>/.
+ */
+static bool entry_name_is_digest_hex(oci_digest_algo_t algo, const char *name)
+{
+    if (name == NULL)
+        return false;
+    const size_t expected = oci_digest_hex_len(algo);
+    return strlen(name) == expected && oci_digest_hex_valid(algo, name);
+}
+
+/* Sweep <root>/blobs/<algo>/ and increment count + bytes for every well-formed
+ * blob entry. Returns 0 on success; -1 with errno set on opendir failure
+ * (except ENOENT, which is the empty-store case). When opts->skip_disk_usage
+ * is true the byte total is left at zero but the count still accumulates.
+ */ +static int sweep_blob_algo(const char *root, + oci_digest_algo_t algo, + bool skip_bytes, + size_t *count, + uint64_t *bytes) +{ + const char *algo_name = oci_digest_algo_name(algo); + if (!algo_name) + return 0; + char dir[STATUS_PATH_MAX]; + int n = snprintf(dir, sizeof(dir), "%s/blobs/%s", root, algo_name); + if (n < 0 || (size_t) n >= sizeof(dir)) { + errno = ENAMETOOLONG; + return -1; + } + DIR *d = opendir(dir); + if (!d) { + if (errno == ENOENT) + return 0; + return -1; + } + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (de->d_name[0] == '.') + continue; + if (!entry_name_is_digest_hex(algo, de->d_name)) + continue; + char child[STATUS_PATH_MAX]; + int cn = snprintf(child, sizeof(child), "%s/%s", dir, de->d_name); + if (cn < 0 || (size_t) cn >= sizeof(child)) + continue; + struct stat st; + if (lstat(child, &st) < 0) + continue; + if (!S_ISREG(st.st_mode)) + continue; + (*count)++; + if (!skip_bytes) + *bytes += (uint64_t) (st.st_size < 0 ? 0 : st.st_size); + } + closedir(d); + return 0; +} + +/* Sweep a content-addressed directory family rooted at /// + * where each well-formed child is itself a directory whose recursive + * st_size sum contributes to bytes. Used for layers// and + * layers/stacks//. Missing /// is the empty case + * (count == 0), not an error. 
+ */ +static int sweep_tree_family(const char *root, + const char *base, + oci_digest_algo_t algo, + bool skip_bytes, + size_t *count, + uint64_t *bytes) +{ + const char *algo_name = oci_digest_algo_name(algo); + if (!algo_name) + return 0; + char dir[STATUS_PATH_MAX]; + int n = snprintf(dir, sizeof(dir), "%s/%s/%s", root, base, algo_name); + if (n < 0 || (size_t) n >= sizeof(dir)) { + errno = ENAMETOOLONG; + return -1; + } + DIR *d = opendir(dir); + if (!d) { + if (errno == ENOENT) + return 0; + return -1; + } + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (de->d_name[0] == '.') + continue; + if (!entry_name_is_digest_hex(algo, de->d_name)) + continue; + char child[STATUS_PATH_MAX]; + int cn = snprintf(child, sizeof(child), "%s/%s", dir, de->d_name); + if (cn < 0 || (size_t) cn >= sizeof(child)) + continue; + struct stat st; + if (lstat(child, &st) < 0) + continue; + if (!S_ISDIR(st.st_mode)) + continue; + (*count)++; + if (!skip_bytes) + *bytes += sum_tree_size(child); + } + closedir(d); + return 0; +} + +/* Algorithms swept by the disk pass. Mirrors PRUNE_ALGOS in store.c so the + * sweep stays consistent with what the GC walker considers a candidate. + */ +static const oci_digest_algo_t STATUS_ALGOS[] = { + OCI_DIGEST_SHA256, + OCI_DIGEST_SHA512, +}; +#define STATUS_ALGOS_LEN (sizeof(STATUS_ALGOS) / sizeof(STATUS_ALGOS[0])) + +/* Probe whether //// exists as a directory. Used to + * compute the populate ratios over the reachable digest sets. 
+ */ +static bool tree_entry_exists(const char *root, + const char *base, + const char *digest_str) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) + return false; + const char *algo_name = oci_digest_algo_name(algo); + if (!algo_name) + return false; + char path[STATUS_PATH_MAX]; + int n = + snprintf(path, sizeof(path), "%s/%s/%s/%s", root, base, algo_name, hex); + if (n < 0 || (size_t) n >= sizeof(path)) + return false; + struct stat st; + if (stat(path, &st) < 0) + return false; + return S_ISDIR(st.st_mode); +} + +/* ── Per-pin walk ─────────────────────────────────────────────────────── */ + +/* Populate one pin entry. Returns 0 on every code path (pin entries record + * their own status; a per-pin failure is never fatal). diff_acc / chain_acc + * accumulate the reachable layer / chain sets when the pin resolves cleanly. + */ +static void walk_pin_entry(oci_store_t *store, + const oci_pin_entry_t *pin, + oci_status_pin_entry_t *out, + oci_digest_set_t *diff_acc, + oci_digest_set_t *chain_acc) +{ + memset(out, 0, sizeof(*out)); + out->last_seen_mtime = -1; + out->name = strdup(pin->name ? pin->name : ""); + out->digest = strdup(pin->digest ? pin->digest : ""); + if (!out->name || !out->digest) { + /* Allocation failure for the row identity strings is rare enough + * that surfacing it as MISSING_MANIFEST gives the operator a hint + * without complicating the failure model. + */ + out->status = OCI_STATUS_PIN_MISSING_MANIFEST; + return; + } + + oci_blob_store_t *blobs = oci_store_blobs(store); + + /* Manifest blob size / mtime first. A miss here short-circuits all later + * steps to MISSING_MANIFEST without polluting the per-pin row. 
+ */ + if (stat_blob(blobs, out->digest, &out->manifest_size, + &out->last_seen_mtime) < 0) { + out->manifest_size = 0; + out->last_seen_mtime = -1; + out->status = OCI_STATUS_PIN_MISSING_MANIFEST; + return; + } + + oci_status_pin_code_t resolve_code = OCI_STATUS_PIN_OK; + char *config_digest = + resolve_config_digest(store, out->digest, &resolve_code); + if (!config_digest) { + out->status = resolve_code; + return; + } + + /* Capture image-config blob size when present. A missing config blob is + * the CORRUPT_CONFIG sentinel: the manifest was readable but the image + * is structurally incomplete. */ + uint64_t cfg_size = 0; + if (stat_blob(blobs, config_digest, &cfg_size, NULL) < 0) { + free(config_digest); + out->status = OCI_STATUS_PIN_CORRUPT_CONFIG; + return; + } + out->config_size = cfg_size; + + char **diff_ids = load_diff_ids(store, config_digest); + free(config_digest); + if (!diff_ids) { + out->status = OCI_STATUS_PIN_CORRUPT_CONFIG; + return; + } + size_t layer_count = 0; + while (diff_ids[layer_count]) + layer_count++; + out->layer_count = layer_count; + + /* Accumulate union sets so the populate ratios can run later. A failure + * inside accumulate_chain is treated as CORRUPT_CONFIG for the row but + * the partial entries stay in the accumulators: the union shape means + * a partial walk only ever inflates "reachable" honestly without + * over-reporting "populated". + */ + if (accumulate_chain(diff_ids, diff_acc, chain_acc) < 0) { + free_strv(diff_ids); + out->status = OCI_STATUS_PIN_CORRUPT_CONFIG; + return; + } + free_strv(diff_ids); + out->status = OCI_STATUS_PIN_OK; +} + +/* ── Per-unpacked-tree walk ───────────────────────────────────────────── */ + +static void walk_unpacked_entry(const char *tree_path, + bool skip_disk_usage, + oci_status_unpacked_entry_t *out, + oci_digest_set_t *diff_acc, + oci_digest_set_t *chain_acc) +{ + memset(out, 0, sizeof(*out)); + out->path = strdup(tree_path ? 
tree_path : ""); + if (!out->path) { + out->status = OCI_STATUS_UNPACKED_MISSING_ORIGIN; + return; + } + if (!skip_disk_usage) + out->tree_bytes = sum_tree_size(tree_path); + + oci_origin_t origin = {0}; + if (oci_origin_read(tree_path, &origin, NULL) < 0) { + out->status = (errno == ENOENT) ? OCI_STATUS_UNPACKED_MISSING_ORIGIN + : OCI_STATUS_UNPACKED_CORRUPT_ORIGIN; + return; + } + if (origin.manifest_digest) { + out->manifest_digest = strdup(origin.manifest_digest); + /* strdup failure for the digest is rare; surface as corrupt-origin + * so the row is still listed. + */ + if (!out->manifest_digest) { + oci_origin_free(&origin); + out->status = OCI_STATUS_UNPACKED_CORRUPT_ORIGIN; + return; + } + } + if (origin.layer_diffids) { + size_t n = 0; + while (origin.layer_diffids[n]) + n++; + out->layer_count = n; + if (n > 0 && + accumulate_chain(origin.layer_diffids, diff_acc, chain_acc) < 0) { + oci_origin_free(&origin); + out->status = OCI_STATUS_UNPACKED_CORRUPT_ORIGIN; + return; + } + } + oci_origin_free(&origin); + out->status = OCI_STATUS_UNPACKED_OK; +} + +/* ── Public entry point ───────────────────────────────────────────────── */ + +void oci_status_free(oci_status_t *out) +{ + if (!out) + return; + if (out->pins) { + for (size_t i = 0; i < out->pin_count; i++) { + free(out->pins[i].name); + free(out->pins[i].digest); + } + free(out->pins); + } + if (out->unpacked) { + for (size_t i = 0; i < out->unpacked_count; i++) { + free(out->unpacked[i].path); + free(out->unpacked[i].manifest_digest); + } + free(out->unpacked); + } + memset(out, 0, sizeof(*out)); +} + +int oci_status_compute(oci_store_t *store, + const oci_status_options_t *opts, + oci_status_t *out, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + + if (!store || !out) { + *err = "status_compute: NULL argument"; + errno = EINVAL; + return -1; + } + memset(out, 0, sizeof(*out)); + + bool skip_du = opts && opts->skip_disk_usage; + 
out->disk_usage_skipped = skip_du; + const char *volume_root = opts ? opts->volume_root : NULL; + + oci_digest_set_t diff_acc = {0}; + oci_digest_set_t chain_acc = {0}; + + /* 1. Pin walk. An unreadable index.json (or a missing one) is treated + * as the empty case here: the rest of the snapshot still has value + * (store totals, layer caches) and surfacing the failure would be a + * regression vs the rest of the OCI CLI which all treat missing + * pins as empty. + */ + oci_pin_list_t pins = {0}; + const char *list_err = NULL; + if (oci_store_list_refs(store, &pins, &list_err) == 0 && pins.count > 0) { + out->pins = calloc(pins.count, sizeof(*out->pins)); + if (!out->pins) { + oci_pin_list_free(&pins); + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + *err = "status_compute: out of memory allocating pin rows"; + errno = ENOMEM; + return -1; + } + out->pin_count = pins.count; + for (size_t i = 0; i < pins.count; i++) { + walk_pin_entry(store, &pins.items[i], &out->pins[i], &diff_acc, + &chain_acc); + } + } + oci_pin_list_free(&pins); + + /* 2. Unpacked sysroots when volume_root is provided. */ + if (volume_root) { + oci_volume_list_t trees = {0}; + if (oci_volume_list_unpacked(volume_root, &trees, NULL) == 0 && + trees.count > 0) { + out->unpacked = calloc(trees.count, sizeof(*out->unpacked)); + if (!out->unpacked) { + oci_volume_list_free(&trees); + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + oci_status_free(out); + *err = "status_compute: out of memory allocating unpacked rows"; + errno = ENOMEM; + return -1; + } + out->unpacked_count = trees.count; + for (size_t i = 0; i < trees.count; i++) { + walk_unpacked_entry(trees.items[i], skip_du, &out->unpacked[i], + &diff_acc, &chain_acc); + } + } + oci_volume_list_free(&trees); + } + + /* 3. Store totals (counts always; bytes only when not skipped). 
*/ + const char *root = oci_store_root(store); + if (!root) { + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + oci_status_free(out); + *err = "status_compute: store has no root path"; + errno = EINVAL; + return -1; + } + + for (size_t i = 0; i < STATUS_ALGOS_LEN; i++) { + if (sweep_blob_algo(root, STATUS_ALGOS[i], skip_du, &out->blob_count, + &out->blob_bytes_total) < 0) { + int saved = errno; + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + oci_status_free(out); + *err = "status_compute: blob sweep failed"; + errno = saved; + return -1; + } + } + for (size_t i = 0; i < STATUS_ALGOS_LEN; i++) { + if (sweep_tree_family(root, "layers", STATUS_ALGOS[i], skip_du, + &out->layer_cache_count, + &out->layer_cache_bytes_total) < 0) { + int saved = errno; + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + oci_status_free(out); + *err = "status_compute: layers/ sweep failed"; + errno = saved; + return -1; + } + } + for (size_t i = 0; i < STATUS_ALGOS_LEN; i++) { + if (sweep_tree_family(root, "layers/stacks", STATUS_ALGOS[i], skip_du, + &out->stack_cache_count, + &out->stack_cache_bytes_total) < 0) { + int saved = errno; + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + oci_status_free(out); + *err = "status_compute: layers/stacks/ sweep failed"; + errno = saved; + return -1; + } + } + + /* 4. Populate ratios. Iterate the reachable diff_id and ChainID sets + * and probe each against its respective cache family. Each entry is at + * most one stat(2) so the cost is O(R) where R = reachable count. 
+ */ + out->diff_ids_reachable = oci_digest_set_size(&diff_acc); + for (size_t i = 0; i < out->diff_ids_reachable; i++) { + const char *d = oci_digest_set_at(&diff_acc, i); + if (d && tree_entry_exists(root, "layers", d)) + out->diff_ids_populated++; + } + out->chain_ids_reachable = oci_digest_set_size(&chain_acc); + for (size_t i = 0; i < out->chain_ids_reachable; i++) { + const char *c = oci_digest_set_at(&chain_acc, i); + if (c && tree_entry_exists(root, "layers/stacks", c)) + out->chain_ids_populated++; + } + + oci_digest_set_free(&diff_acc); + oci_digest_set_free(&chain_acc); + (void) list_err; /* swallowed: missing index.json is the empty case */ + return 0; +} diff --git a/src/oci/status.h b/src/oci/status.h new file mode 100644 index 0000000..39d113b --- /dev/null +++ b/src/oci/status.h @@ -0,0 +1,160 @@ +/* Store-wide OCI status report + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Aggregates a single point-in-time snapshot of every pin in index.json plus + * every unpacked sysroot under volume_root, into a flat struct the CLI can + * render or serialise to JSON. The walker is informational rather than a GC + * keep-set: a malformed manifest or unreadable origin sidecar surfaces as a + * per-entry status code without aborting the rest of the walk, so an operator + * with one corrupt pin still sees the healthy rows. + * + * Layer-cache populate ratios (raw + stack) drill into reachable diff_ids and + * ChainID prefixes the same way oci_dedup_metrics_compute does, except the + * aggregation is store-wide rather than per-target. The two values are + * separate metrics (raw vs stack) because the Plan 3 caches dedup along + * different axes; collapsing them into one number would hide which family + * needs operator attention. + * + * Plan 4 C4.1 first consumer. 
Reusable bits to revisit if Plan 4 C4.2 or any + * later store-wide reporter lands a fourth caller: the diff_id / ChainID + * walker duplication this file inherits from dedup-metrics.c and store.c + * crosses the lift threshold once a fourth caller appears; see + * project_oci_plan3 memory for the pattern. + */ + +#pragma once + +#include +#include +#include + +#include "store.h" + +/* Per-pin entry status. ok == 0 means every field below is populated; + * non-zero codes name which step failed so the renderer can print a sentinel + * row instead of suppressing the pin entirely. + */ +typedef enum { + OCI_STATUS_PIN_OK = 0, + OCI_STATUS_PIN_MISSING_MANIFEST = 1, /* manifest blob not on disk */ + OCI_STATUS_PIN_CORRUPT_MANIFEST = + 2, /* blob unparseable as manifest or index */ + OCI_STATUS_PIN_CORRUPT_CONFIG = + 3, /* image-config blob missing or unparseable */ + OCI_STATUS_PIN_INDEX_NO_ARM64 = + 4, /* image-index has no linux/arm64 entry */ +} oci_status_pin_code_t; + +typedef struct { + char *name; /* canonical "/:" */ + char *digest; /* pinned manifest digest, ":" */ + uint64_t manifest_size; /* st_size of manifest blob; 0 on missing */ + uint64_t config_size; /* st_size of image-config blob; 0 on missing */ + size_t layer_count; /* rootfs.diff_ids length; 0 on resolve failure */ + int64_t last_seen_mtime; /* st_mtime of manifest blob (epoch sec); -1 if + missing */ + oci_status_pin_code_t status; +} oci_status_pin_entry_t; + +/* Per-unpacked-sysroot entry status. 
+ */
+typedef enum {
+    OCI_STATUS_UNPACKED_OK = 0,
+    OCI_STATUS_UNPACKED_MISSING_ORIGIN = 1, /* oci_origin_read failed with
+                                               ENOENT, or the path string
+                                               could not be duplicated */
+    OCI_STATUS_UNPACKED_CORRUPT_ORIGIN = 2, /* oci_origin_read failed with any
+                                               other errno, or digest copy /
+                                               chain accumulation failed */
+} oci_status_unpacked_code_t;
+
+typedef struct {
+    char *path; /* heap copy of the tree path reported by the volume lister */
+    char *manifest_digest; /* origin.manifest_digest; NULL on read failure or
+                              when the origin carries no digest */
+    size_t layer_count; /* origin.layer_diffids length; 0 on read failure */
+    uint64_t tree_bytes; /* recursive st_size sum; 0 under skip_disk_usage */
+    oci_status_unpacked_code_t status;
+} oci_status_unpacked_entry_t;
+
+/* Inputs to oci_status_compute. A NULL options pointer behaves like a
+ * zero-initialised struct: no volume_root, disk usage included.
+ */
+typedef struct {
+    /* When non-NULL, the unpacked-tree walker lists the unpacked sysroots
+     * under this volume root. NULL skips that source entirely; pins-only
+     * mode still populates every blob and cache total. A volume_root that
+     * cannot be listed is the empty case (unpacked_count = 0), not an error.
+     */
+    const char *volume_root;
+
+    /* When true, every byte counter is left at zero and the rendered output
+     * notes that disk usage was skipped. This is the operator escape hatch
+     * for stores large enough that walking layers/sha256/ and
+     * layers/stacks/sha256/ trees would be too slow. Entry counts are still
+     * collected.
+     */
+    bool skip_disk_usage;
+} oci_status_options_t;
+
+/* The aggregated report. Owned by the caller; release via oci_status_free.
+ * Array fields are NULL with count == 0 when empty so the renderer can
+ * branch on count alone.
+ */
+typedef struct {
+    /* Pins: always populated from index.json (best-effort; an unreadable
+     * index.json is the same as an empty store, not a fatal error so the
+     * renderer can still print store totals).
+     */
+    oci_status_pin_entry_t *pins;
+    size_t pin_count;
+
+    /* Unpacked sysroots: empty when volume_root was NULL or listing the
+     * unpacked trees under it failed or found nothing.
+     */
+    oci_status_unpacked_entry_t *unpacked;
+    size_t unpacked_count;
+
+    /* Store-wide disk totals. Counts are always populated. Byte totals are
+     * zero when skip_disk_usage is true.
+     */
+    size_t blob_count;
+    uint64_t blob_bytes_total;
+    size_t layer_cache_count;
+    uint64_t layer_cache_bytes_total;
+    size_t stack_cache_count;
+    uint64_t stack_cache_bytes_total;
+
+    /* Reachable-set populate ratios. The *_reachable fields are the sizes of
+     * the union of diff_ids / ChainIDs named by pins and unpacked sysroots;
+     * the *_populated fields count how many of those exist as directories
+     * under ROOT/layers/ALGO/HEX or ROOT/layers/stacks/ALGO/HEX.
+     * A pin whose manifest or config cannot be resolved contributes nothing
+     * here. A pin or sysroot whose chain accumulation fails part-way keeps
+     * the entries that were already added, so it can raise the reachable
+     * counts even though its own row reports a failure status.
+     */
+    size_t diff_ids_reachable;
+    size_t diff_ids_populated;
+    size_t chain_ids_reachable;
+    size_t chain_ids_populated;
+
+    /* Mirrors options.skip_disk_usage so the renderer does not need the
+     * options struct in scope.
+     */
+    bool disk_usage_skipped;
+} oci_status_t;
+
+/* Compute a store-wide status snapshot.
+ *
+ * Returns 0 on success and -1 on failure with errno set. Failure model:
+ *   - store or out is NULL (EINVAL);
+ *   - the pin or unpacked row array cannot be allocated (ENOMEM);
+ *   - the store reports no root path (EINVAL);
+ *   - a blob, layers/ or layers/stacks/ sweep fails (errno from the sweep).
+ * A failure to list pins or unpacked trees is not fatal: that source is
+ * simply treated as empty. Per-pin and per-tree failures surface as the
+ * entry's status code; the walker keeps going.
+ *
+ * On entry *out is reset; on success it is fully populated and the caller
+ * must release it via oci_status_free. On failure *out is left in the
+ * freed-empty state, except for the NULL-argument failure, which returns
+ * before *out is touched. *err (when non-NULL) is cleared on entry and on
+ * failure points at a static description.
+ *
+ * opts may be NULL for "pin-only, include disk usage" defaults.
+ */
+int oci_status_compute(oci_store_t *store,
+                       const oci_status_options_t *opts,
+                       oci_status_t *out,
+                       const char **err);
+
+/* Release every owned heap field in *out and zero the struct. Safe on a
+ * zero-initialised struct and on NULL.
+ */ +void oci_status_free(oci_status_t *out); diff --git a/src/oci/store.c b/src/oci/store.c new file mode 100644 index 0000000..4055230 --- /dev/null +++ b/src/oci/store.c @@ -0,0 +1,3429 @@ +/* Local OCI image store: blobs + tag-to-digest pinning + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Pin discipline: + * + * - index.json is the only pin store. A pin is one descriptor in + * manifests[] keyed by org.opencontainers.image.ref.name. + * - Writers serialize via flock(/index.json.lock, LOCK_EX) and + * publish via tmp + rename. The lock file is independent of index.json + * itself so that rename(2) replacing the inode does not invalidate the + * advisory lock identity for concurrent writers. + * - Readers parse the file lock-free: rename is atomic on a POSIX + * filesystem and cJSON consumes the document in one shot. + * - Re-pinning the same canonical name replaces the existing manifests[] + * entry in place; pull-by-tag with a moved tag updates rather than + * accumulating duplicates. + * + * Blob store layout: + * + * - The blob layer below this module keeps its link(2) discipline because + * content-addressed blobs are immutable; tag pins use rename(2) because + * pulling alpine:3.20 today may resolve to a different digest tomorrow + * and overwriting the pin is the correct semantic. + * + * Image-layout marker: + * + * - /oci-layout advertises the directory as a standards-compliant + * OCI image-layout so skopeo, umoci, and crane can consume the store + * directly. Writing the marker is idempotent: it is only created when + * missing and existing markers are never rewritten so a third party + * that bumped the imageLayoutVersion is not stomped. + * + * Pre-C2.2 stores wrote pin files under refs/// + * instead of index.json. 
C2.3 migrates older stores on open by recursively + * scanning refs/ and rebuilding index.json under the same flock that + * oci_store_put_ref takes, so a concurrent first-open and first-put cannot + * double-write. refs/ is left in place for one release so a downgrade still + * finds the legacy data. Migration is suppressed when ELFUSE_OCI_NO_MIGRATE + * is set in the environment; in that mode an older store appears empty to + * oci_store_get_ref / oci_store_list_refs until the env var is cleared on a + * subsequent open. + */ + +#include "store.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "digest-set.h" +#include "digest.h" +#include "manifest.h" +#include "origin-meta.h" +#include "volume.h" + +/* Largest path the store materializes. Comfortably above PATH_MAX so snprintf + * truncation surfaces as ENAMETOOLONG instead of a silent corruption. + */ +#define STORE_PATH_MAX 4096 + +/* Conservative ceiling for a single manifest body. Real OCI manifests run + * a few KiB; index.json itself is bounded by O(pin count * descriptor size) + * and stays well under this. Anything larger is treated as a corrupted or + * hostile blob and rejected at parse time. + */ +#define MAX_MANIFEST_BYTES (4 * 1024 * 1024) + +/* OCI annotation key under which pin names live in manifests[] descriptors. */ +static const char ANNOT_REF_NAME[] = "org.opencontainers.image.ref.name"; + +/* OCI media types used when filling the manifests[] descriptor. The actual + * mediaType is read from the manifest blob when present; these constants + * are the fallbacks used when the blob omits the JSON field (an older + * Docker manifest, for instance). 
+ */ +static const char MT_OCI_IMAGE_INDEX[] = + "application/vnd.oci.image.index.v1+json"; +static const char MT_OCI_IMAGE_MANIFEST[] = + "application/vnd.oci.image.manifest.v1+json"; + +struct oci_store { + char *root; + oci_blob_store_t *blobs; +}; + +/* OCI image-layout 1.0.0 marker payload. The spec wants a JSON object with + * exactly one field: imageLayoutVersion = "1.0.0". The trailing newline is + * conventional and matches what umoci / skopeo write. + */ +static const char OCI_LAYOUT_BODY[] = "{\"imageLayoutVersion\":\"1.0.0\"}\n"; + +/* Environment variable that disables C2.3 auto-migration of pre-index.json + * stores. When set to any non-empty value, oci_store_open leaves refs/ and + * the absent index.json alone so a downgrade test or recovery workflow can + * inspect the legacy layout without the daemon helpfully rewriting it. + */ +static const char NO_MIGRATE_ENV[] = "ELFUSE_OCI_NO_MIGRATE"; + +/* Walks /refs/ recursively, rebuilds /index.json with one + * descriptor per discovered pin file, and writes it via tmp + rename. The + * caller must already hold an LOCK_EX on /index.json.lock. Returns 0 + * on success (including the no-pins-found case), -1 with errno preserved on + * an unrecoverable IO error. Individual pins whose manifest blob is missing + * from blobs/ are skipped with a stderr warning so a single dangling pin + * does not block migration for the rest of the store. + */ +static int migrate_legacy_refs(struct oci_store *s); + +/* Plan 3 C3.3b: probe /layers/.schema and migrate v1 stores to v2. + * + * Behaviour matrix at oci_store_open time: + * + * - marker present + schemaVersion == 2: no-op. + * - marker present + other schemaVersion or unparseable JSON: fail with + * errno=EINVAL so a forward-incompatible store does not get silently + * repopulated under the wrong shape. + * - marker absent + ELFUSE_OCI_NO_MIGRATE set: no-op (inspection mode; + * analogous to the C2.3 refs/ -> index.json gate). 
+ * - marker absent + /layers/sha256/ empty: write v2 marker. + * - marker absent + /layers/sha256/ populated: wipe every direct + * child entry under layers/sha256/ (C3.2 cumulative-by-diff_id entries + * are not v2-compatible; reachability is recomputable from manifests + * at unpack time) and then write the v2 marker. + * + * The wipe + write runs under flock(/index.json.lock, LOCK_EX) and + * re-stats the marker under hold so a concurrent opener does not double + * migrate. The wipe is scoped to /layers/sha256/ children only; + * blobs/, images/, tmp/, refs/, index.json, and layers/.staging/ are + * never touched. + */ +static int ensure_layer_schema_marker(const char *root); + +/* Idempotently write /oci-layout. Returns 0 on success or when the + * marker already exists, -1 on any unexpected IO failure. The write uses a + * pid + counter-suffixed tmp file plus link(2) so a concurrent opener never + * observes a partial JSON document. link(2) is preferred over rename(2) for + * the publish step so that two racing openers cannot replace an external + * tool's bumped marker with our own; EEXIST is the happy path. + */ +static unsigned long layout_seq(void) +{ + static unsigned long n = 0; + return __sync_add_and_fetch(&n, 1); +} + +/* Ensure /layers/sha256/, /layers/stacks/sha256/, and + * /layers/.staging/ exist on open. The Plan 3 layer caches depend on + * three subtrees: layers/sha256/ holds committed per-layer raw entries + * (C3.3c), layers/stacks/sha256/ holds committed ChainID-keyed assembled + * stack snapshots (C3.3c), and layers/.staging/ is the shared in-flight + * staging area for clonefile(2) writers in both families. The blob store + * already created itself (oci_blob_store_open mkdirs the root + * tree), so this helper only adds the layers/ subtree. mkdir EEXIST is + * benign so reopens are idempotent. 
+ */ +static int ensure_layer_dirs(const char *root) +{ + static const char *const subdirs[] = { + "layers", "layers/sha256", + "layers/stacks", "layers/stacks/sha256", + "layers/.staging", + }; + for (size_t i = 0; i < sizeof(subdirs) / sizeof(subdirs[0]); i++) { + char path[STORE_PATH_MAX]; + int n = snprintf(path, sizeof(path), "%s/%s", root, subdirs[i]); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + if (mkdir(path, 0755) < 0 && errno != EEXIST) + return -1; + } + return 0; +} + +static int ensure_oci_layout_marker(const char *root) +{ + char path[STORE_PATH_MAX]; + int n = snprintf(path, sizeof(path), "%s/oci-layout", root); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + struct stat st; + if (stat(path, &st) == 0) { + if (!S_ISREG(st.st_mode)) { + errno = ENOTDIR; + return -1; + } + return 0; + } + if (errno != ENOENT) + return -1; + + char tmp[STORE_PATH_MAX]; + n = snprintf(tmp, sizeof(tmp), "%s.tmp-%d-%lu", path, (int) getpid(), + layout_seq()); + if (n < 0 || (size_t) n >= sizeof(tmp)) { + errno = ENAMETOOLONG; + return -1; + } + + int fd = open(tmp, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (fd < 0) + return -1; + size_t body_len = sizeof(OCI_LAYOUT_BODY) - 1; + if (write(fd, OCI_LAYOUT_BODY, body_len) != (ssize_t) body_len) { + int saved = errno; + close(fd); + unlink(tmp); + errno = saved; + return -1; + } + if (fsync(fd) < 0) { + int saved = errno; + close(fd); + unlink(tmp); + errno = saved; + return -1; + } + if (close(fd) < 0) { + int saved = errno; + unlink(tmp); + errno = saved; + return -1; + } + if (link(tmp, path) < 0) { + int saved = errno; + unlink(tmp); + if (saved == EEXIST) + return 0; + errno = saved; + return -1; + } + unlink(tmp); + return 0; +} + +oci_store_t *oci_store_open(const char *root) +{ + if (!root || !*root) { + errno = EINVAL; + return NULL; + } + oci_blob_store_t *blobs = oci_blob_store_open(root); + if (!blobs) + return NULL; + + if 
(ensure_oci_layout_marker(root) < 0) { + int saved = errno; + oci_blob_store_close(blobs); + errno = saved; + return NULL; + } + if (ensure_layer_dirs(root) < 0) { + int saved = errno; + oci_blob_store_close(blobs); + errno = saved; + return NULL; + } + if (ensure_layer_schema_marker(root) < 0) { + int saved = errno; + oci_blob_store_close(blobs); + errno = saved; + return NULL; + } + + oci_store_t *s = calloc(1, sizeof(*s)); + if (!s) { + oci_blob_store_close(blobs); + errno = ENOMEM; + return NULL; + } + s->root = strdup(root); + if (!s->root) { + free(s); + oci_blob_store_close(blobs); + errno = ENOMEM; + return NULL; + } + s->blobs = blobs; + + /* C2.3 auto-migration: detect a pre-index.json store (refs/ tree without + * an index.json) and rebuild index.json under the same flock that + * oci_store_put_ref takes. Suppressed by ELFUSE_OCI_NO_MIGRATE so a + * downgrade test or recovery workflow can inspect the legacy layout + * without it being silently rewritten. + */ + const char *no_migrate = getenv(NO_MIGRATE_ENV); + if (!no_migrate || !*no_migrate) { + if (migrate_legacy_refs(s) < 0) { + int saved = errno; + oci_store_close(s); + errno = saved; + return NULL; + } + } + return s; +} + +void oci_store_close(oci_store_t *s) +{ + if (!s) + return; + oci_blob_store_close(s->blobs); + free(s->root); + free(s); +} + +const char *oci_store_root(const oci_store_t *s) +{ + return s ? s->root : NULL; +} + +oci_blob_store_t *oci_store_blobs(oci_store_t *s) +{ + return s ? 
s->blobs : NULL; +} + +char *oci_store_default_root(void) +{ + const char *xdg = getenv("XDG_DATA_HOME"); + if (xdg && *xdg) { + size_t n = strlen(xdg) + sizeof("/elfuse/store"); + char *r = malloc(n); + if (!r) { + errno = ENOMEM; + return NULL; + } + snprintf(r, n, "%s/elfuse/store", xdg); + return r; + } + const char *home = getenv("HOME"); + if (!home || !*home) { + errno = ENOENT; + return NULL; + } + static const char SUFFIX[] = "/Library/Application Support/elfuse/store"; + size_t n = strlen(home) + sizeof(SUFFIX); + char *r = malloc(n); + if (!r) { + errno = ENOMEM; + return NULL; + } + snprintf(r, n, "%s%s", home, SUFFIX); + return r; +} + +/* Resolve the on-disk path of a manifest blob keyed by ":". The + * digest string has already been validated by oci_digest_parse, so the hex + * length is bounded and snprintf cannot truncate within STORE_PATH_MAX. + */ +static int blob_path_for_digest(const oci_store_t *s, + const char *digest_str, + char *out, + size_t cap) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) { + errno = EINVAL; + return -1; + } + int n = oci_blob_store_path(s->blobs, algo, hex, out, cap); + if (n < 0 || (size_t) n >= cap) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +/* stat the manifest blob and return its size. The caller has already + * validated the digest shape; ENOENT here means the caller forgot to + * persist the blob before pinning it, which is a programmer error in the + * pull / fixture path rather than user input. + */ +static int blob_size(const oci_store_t *s, + const char *digest_str, + int64_t *out_size) +{ + char path[STORE_PATH_MAX]; + if (blob_path_for_digest(s, digest_str, path, sizeof(path)) < 0) + return -1; + struct stat st; + if (stat(path, &st) < 0) + return -1; + if (!S_ISREG(st.st_mode)) { + errno = EINVAL; + return -1; + } + *out_size = (int64_t) st.st_size; + return 0; +} + +/* Best-effort read of the manifest blob's mediaType. 
Returns a heap-allocated + * string on success. When the blob omits the JSON mediaType field (older + * Docker manifests), sniff the shape: a top-level manifests array means an + * image-index, a layers array means an image-manifest. Falls back to the + * OCI image-manifest media type when the JSON is unrecognized so the + * descriptor stays schema-valid. Returns NULL on IO or parse failure with + * errno preserved. + */ +static char *infer_manifest_media_type(const oci_store_t *s, + const char *digest_str) +{ + char path[STORE_PATH_MAX]; + if (blob_path_for_digest(s, digest_str, path, sizeof(path)) < 0) + return NULL; + + int fd = open(path, O_RDONLY); + if (fd < 0) + return NULL; + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + errno = saved; + return NULL; + } + if (st.st_size <= 0 || st.st_size > (off_t) MAX_MANIFEST_BYTES) { + close(fd); + errno = EINVAL; + return NULL; + } + size_t len = (size_t) st.st_size; + char *body = malloc(len + 1); + if (!body) { + close(fd); + errno = ENOMEM; + return NULL; + } + size_t off = 0; + while (off < len) { + ssize_t got = read(fd, body + off, len - off); + if (got < 0) { + int saved = errno; + free(body); + close(fd); + errno = saved; + return NULL; + } + if (got == 0) + break; + off += (size_t) got; + } + close(fd); + body[off] = '\0'; + + cJSON *root = cJSON_Parse(body); + free(body); + if (!root) { + errno = EINVAL; + return NULL; + } + + const char *mt = NULL; + const cJSON *mt_field = cJSON_GetObjectItemCaseSensitive(root, "mediaType"); + if (cJSON_IsString(mt_field) && mt_field->valuestring) + mt = mt_field->valuestring; + + char *dup = NULL; + if (mt) { + dup = strdup(mt); + } else if (cJSON_IsArray( + cJSON_GetObjectItemCaseSensitive(root, "manifests"))) { + dup = strdup(MT_OCI_IMAGE_INDEX); + } else { + dup = strdup(MT_OCI_IMAGE_MANIFEST); + } + cJSON_Delete(root); + if (!dup) { + errno = ENOMEM; + return NULL; + } + return dup; +} + +/* Read /index.json as a parsed cJSON tree. 
Returns NULL with errno=ENOENT
+ * when the file is missing (the empty-store happy path), NULL with another
+ * errno on IO failure, or NULL with errno=EINVAL on a parse error. The caller
+ * owns the returned tree and must cJSON_Delete it.
+ *
+ * root     store root directory; index.json is opened directly beneath it
+ * err_msg  optional; receives a static description for every failure except
+ *          the missing-file case, which leaves it untouched
+ *
+ * Interrupted reads are retried, and a file that yields fewer bytes than
+ * fstat reported fails with EIO rather than being parsed truncated.
+ */
+static cJSON *read_index_json(const char *root, const char **err_msg)
+{
+    char path[STORE_PATH_MAX];
+    int n = snprintf(path, sizeof(path), "%s/index.json", root);
+    if (n < 0 || (size_t) n >= sizeof(path)) {
+        errno = ENAMETOOLONG;
+        if (err_msg)
+            *err_msg = "index.json path exceeds STORE_PATH_MAX";
+        return NULL;
+    }
+    int fd = open(path, O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        /* ENOENT is the empty-store case; callers supply their own text. */
+        if (err_msg && errno != ENOENT)
+            *err_msg = "failed to open index.json";
+        return NULL;
+    }
+    struct stat st;
+    if (fstat(fd, &st) < 0) {
+        int saved = errno;
+        close(fd);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "fstat on index.json failed";
+        return NULL;
+    }
+    /* A zero-length file can never be valid JSON; rejecting it here keeps
+     * errno at EINVAL and makes the message below accurate.
+     */
+    if (st.st_size <= 0 || st.st_size > (off_t) MAX_MANIFEST_BYTES) {
+        close(fd);
+        errno = EINVAL;
+        if (err_msg)
+            *err_msg = "index.json is empty or implausibly large";
+        return NULL;
+    }
+    size_t len = (size_t) st.st_size;
+    char *body = malloc(len + 1);
+    if (!body) {
+        close(fd);
+        errno = ENOMEM;
+        if (err_msg)
+            *err_msg = "out of memory reading index.json";
+        return NULL;
+    }
+    size_t off = 0;
+    while (off < len) {
+        ssize_t got = read(fd, body + off, len - off);
+        if (got < 0) {
+            /* A signal arriving mid-read is not an IO failure; retry. */
+            if (errno == EINTR)
+                continue;
+            int saved = errno;
+            free(body);
+            close(fd);
+            errno = saved;
+            if (err_msg)
+                *err_msg = "read on index.json failed";
+            return NULL;
+        }
+        if (got == 0)
+            break;
+        off += (size_t) got;
+    }
+    close(fd);
+    if (off != len) {
+        free(body);
+        errno = EIO;
+        if (err_msg)
+            *err_msg = "read on index.json failed";
+        return NULL;
+    }
+    body[len] = '\0';
+
+    cJSON *root_json = cJSON_Parse(body);
+    free(body);
+    if (!root_json) {
+        errno = EINVAL;
+        if (err_msg)
+            *err_msg = "index.json is not valid JSON";
+        return NULL;
+    }
+    return root_json;
+}
+
+/* Build an empty OCI image-index skeleton. Returns NULL on alloc failure.
*/
+static cJSON *new_empty_index(void)
+{
+    cJSON *root = cJSON_CreateObject();
+    if (!root)
+        return NULL;
+    if (!cJSON_AddNumberToObject(root, "schemaVersion", 2) ||
+        !cJSON_AddStringToObject(root, "mediaType", MT_OCI_IMAGE_INDEX)) {
+        cJSON_Delete(root);
+        return NULL;
+    }
+    cJSON *manifests = cJSON_CreateArray();
+    if (!manifests) {
+        cJSON_Delete(root);
+        return NULL;
+    }
+    /* Until the add succeeds the array is not owned by root, so it has to
+     * be released separately on failure.
+     */
+    if (!cJSON_AddItemToObject(root, "manifests", manifests)) {
+        cJSON_Delete(manifests);
+        cJSON_Delete(root);
+        return NULL;
+    }
+    return root;
+}
+
+/* Walk the manifests[] array and return the position of the first descriptor
+ * whose annotations[ANNOT_REF_NAME] string equals name, or -1 if manifests
+ * is not an array or no descriptor matches. Non-object entries and entries
+ * without an annotations object still occupy a position but never match.
+ *
+ * The array is walked once with cJSON_ArrayForEach; indexing each element
+ * with cJSON_GetArrayItem would rescan the list from its head every time.
+ */
+static int find_manifest_index(const cJSON *manifests, const char *name)
+{
+    if (!cJSON_IsArray(manifests))
+        return -1;
+    int idx = 0;
+    const cJSON *entry = NULL;
+    cJSON_ArrayForEach(entry, manifests)
+    {
+        const cJSON *annots =
+            cJSON_IsObject(entry)
+                ? cJSON_GetObjectItemCaseSensitive(entry, "annotations")
+                : NULL;
+        const cJSON *got =
+            cJSON_IsObject(annots)
+                ? cJSON_GetObjectItemCaseSensitive(annots, ANNOT_REF_NAME)
+                : NULL;
+        if (cJSON_IsString(got) && got->valuestring &&
+            strcmp(got->valuestring, name) == 0)
+            return idx;
+        idx++;
+    }
+    return -1;
+}
+
+/* Build a manifests[] descriptor object for (name, media_type, digest, size).
+ * Returns a newly-allocated cJSON node owned by the caller. NULL on alloc.
+ */ +static cJSON *build_descriptor(const char *name, + const char *media_type, + const char *digest_str, + int64_t size) +{ + cJSON *desc = cJSON_CreateObject(); + if (!desc) + return NULL; + if (!cJSON_AddStringToObject(desc, "mediaType", media_type) || + !cJSON_AddStringToObject(desc, "digest", digest_str) || + !cJSON_AddNumberToObject(desc, "size", (double) size)) + goto fail; + cJSON *annots = cJSON_CreateObject(); + if (!annots) + goto fail; + if (!cJSON_AddItemToObject(desc, "annotations", annots)) { + cJSON_Delete(annots); + goto fail; + } + if (!cJSON_AddStringToObject(annots, ANNOT_REF_NAME, name)) + goto fail; + return desc; + +fail: + cJSON_Delete(desc); + return NULL; +} + +static unsigned long pin_seq(void) +{ + static unsigned long n = 0; + return __sync_add_and_fetch(&n, 1); +} + +/* fsync the directory containing path so a rename(2) that publishes a new + * entry is durable across a crash: fsync on the file persists its contents + * but not the parent directory entry. Best-effort -- the tmp-file fsync is + * the primary guarantee and some filesystems reject a directory fsync, so a + * failure here must not fail the publish. + */ +static void fsync_parent_dir(const char *path) +{ + const char *slash = strrchr(path, '/'); + char dir[STORE_PATH_MAX]; + if (!slash) { + dir[0] = '.'; + dir[1] = '\0'; + } else if (slash == path) { + dir[0] = '/'; + dir[1] = '\0'; + } else { + size_t n = (size_t) (slash - path); + if (n >= sizeof(dir)) + return; + memcpy(dir, path, n); + dir[n] = '\0'; + } + int dfd = open(dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (dfd < 0) + return; + (void) fsync(dfd); + (void) close(dfd); +} + +/* Serialize root_json to /index.json via tmp + rename. The publish is + * atomic with respect to readers: an open() either sees the previous inode + * or the new one, never a half-written file. 
fsync the tmp file before
+ * rename, and the parent directory after, so a crash leaves the pin update
+ * durable rather than reverting to the prior index.json.
+ *
+ * root       store root directory that holds index.json
+ * root_json  tree to serialize with cJSON_PrintUnformatted
+ * err_msg    optional; receives a static description on failure
+ *
+ * Returns 0 on success, -1 with errno set on failure. Once the tmp file has
+ * been created it is unlinked on every failure path. Writes are looped so a
+ * partial or interrupted write(2) is resumed instead of being reported as a
+ * failure with whatever errno happened to be left over.
+ */
+static int write_index_json(const char *root,
+                            const cJSON *root_json,
+                            const char **err_msg)
+{
+    char path[STORE_PATH_MAX];
+    int n = snprintf(path, sizeof(path), "%s/index.json", root);
+    if (n < 0 || (size_t) n >= sizeof(path)) {
+        errno = ENAMETOOLONG;
+        if (err_msg)
+            *err_msg = "index.json path exceeds STORE_PATH_MAX";
+        return -1;
+    }
+    /* pid + per-process sequence number keeps concurrent writers from
+     * sharing a tmp name.
+     */
+    char tmp[STORE_PATH_MAX];
+    n = snprintf(tmp, sizeof(tmp), "%s.tmp-%d-%lu", path, (int) getpid(),
+                 pin_seq());
+    if (n < 0 || (size_t) n >= sizeof(tmp)) {
+        errno = ENAMETOOLONG;
+        if (err_msg)
+            *err_msg = "index.json tmp path exceeds STORE_PATH_MAX";
+        return -1;
+    }
+
+    char *body = cJSON_PrintUnformatted(root_json);
+    if (!body) {
+        errno = ENOMEM;
+        if (err_msg)
+            *err_msg = "failed to serialize index.json";
+        return -1;
+    }
+    size_t body_len = strlen(body);
+
+    int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
+    if (fd < 0) {
+        int saved = errno;
+        free(body);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "failed to create index.json tmp file";
+        return -1;
+    }
+
+    /* Append a trailing newline so external tools that line-print the file
+     * (jq, cat) render cleanly. cJSON_PrintUnformatted does not include it.
+     */
+    const char nl = '\n';
+    const char *seg[2] = {body, &nl};
+    const size_t seg_len[2] = {body_len, 1};
+    int write_errno = 0;
+    for (int i = 0; i < 2 && write_errno == 0; i++) {
+        size_t done = 0;
+        while (done < seg_len[i]) {
+            ssize_t put = write(fd, seg[i] + done, seg_len[i] - done);
+            if (put < 0) {
+                if (errno == EINTR)
+                    continue;
+                write_errno = errno;
+                break;
+            }
+            if (put == 0) {
+                /* No progress and no error: bail out rather than spin. */
+                write_errno = EIO;
+                break;
+            }
+            done += (size_t) put;
+        }
+    }
+    if (write_errno != 0) {
+        close(fd);
+        unlink(tmp);
+        free(body);
+        errno = write_errno;
+        if (err_msg)
+            *err_msg = "failed to write index.json tmp file";
+        return -1;
+    }
+    free(body);
+    if (fsync(fd) < 0) {
+        int saved = errno;
+        close(fd);
+        unlink(tmp);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "fsync on index.json tmp file failed";
+        return -1;
+    }
+    if (close(fd) < 0) {
+        int saved = errno;
+        unlink(tmp);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "close on index.json tmp file failed";
+        return -1;
+    }
+    if (rename(tmp, path) < 0) {
+        int saved = errno;
+        unlink(tmp);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "rename of index.json tmp file failed";
+        return -1;
+    }
+    /* Persist the directory entry the rename just swapped in so a crash does
+     * not silently roll the tag->digest pins back to the previous index.json.
+     */
+    fsync_parent_dir(path);
+    return 0;
+}
+
+/* Acquire LOCK_EX on <root>/index.json.lock. The lock file is created when
+ * missing; failures to create it (full disk, permission) surface immediately
+ * so a writer never proceeds without coordination. Returns the lock fd on
+ * success; the caller must close() it to release the lock (POSIX advisory
+ * lock semantics tie lifetime to the fd).
+ */
+static int acquire_index_lock(const char *root, const char **err_msg)
+{
+    char path[STORE_PATH_MAX];
+    int n = snprintf(path, sizeof(path), "%s/index.json.lock", root);
+    if (n < 0 || (size_t) n >= sizeof(path)) {
+        errno = ENAMETOOLONG;
+        if (err_msg)
+            *err_msg = "index.json.lock path exceeds STORE_PATH_MAX";
+        return -1;
+    }
+    int fd = open(path, O_RDWR | O_CREAT | O_CLOEXEC, 0644);
+    if (fd < 0) {
+        if (err_msg)
+            *err_msg = "failed to open index.json.lock";
+        return -1;
+    }
+    if (flock(fd, LOCK_EX) < 0) {
+        int saved = errno;
+        close(fd);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "flock on index.json.lock failed";
+        return -1;
+    }
+    return fd;
+}
+
+/* Pin ref's canonical name to digest_str in <root>/index.json.
+ *
+ * The manifest blob must already be in the store: its size and media type
+ * are read from disk to build the descriptor. An existing descriptor with
+ * the same ref name is replaced in place; otherwise one is appended. The
+ * read-modify-write runs under the index.json.lock flock.
+ *
+ * s           open store
+ * ref         reference with registry, repository and tag all set
+ * digest_str  digest accepted by oci_digest_parse
+ * err_msg     optional; receives a static description on failure
+ *
+ * Returns 0 on success, -1 with errno set on failure. errno is captured
+ * before cleanup so close/free cannot overwrite the reported cause.
+ */
+int oci_store_put_ref(oci_store_t *s,
+                      const oci_ref_t *ref,
+                      const char *digest_str,
+                      const char **err_msg)
+{
+    if (!s || !ref || !digest_str || !ref->registry || !ref->repository) {
+        if (err_msg)
+            *err_msg = "invalid arguments";
+        errno = EINVAL;
+        return -1;
+    }
+    if (!ref->tag) {
+        if (err_msg)
+            *err_msg = "ref has no tag; digest-only refs are self-pinning";
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* Validate digest shape so a corrupt caller cannot poison the pin
+     * descriptor with arbitrary bytes that later defeat oci_store_get_ref.
+     */
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    if (!oci_digest_parse(digest_str, &algo, hex)) {
+        if (err_msg)
+            *err_msg = "digest must be lowercase <algo>:<hex>";
+        errno = EINVAL;
+        return -1;
+    }
+
+    int64_t size = 0;
+    if (blob_size(s, digest_str, &size) < 0) {
+        if (err_msg)
+            *err_msg = "manifest blob is not present in the local store";
+        return -1;
+    }
+    char *media_type = infer_manifest_media_type(s, digest_str);
+    if (!media_type) {
+        if (err_msg)
+            *err_msg = "failed to determine manifest mediaType from blob";
+        return -1;
+    }
+
+    char *name = oci_ref_canonical_name(ref);
+    if (!name) {
+        int saved = errno;
+        free(media_type);
+        errno = saved;
+        if (err_msg)
+            *err_msg = "failed to render canonical ref name";
+        return -1;
+    }
+
+    int rc = -1;
+    int saved_errno = 0;
+    int lock_fd = acquire_index_lock(s->root, err_msg);
+    if (lock_fd < 0)
+        goto out_no_lock;
+
+    const char *read_err = NULL;
+    cJSON *root_json = read_index_json(s->root, &read_err);
+    if (!root_json) {
+        if (errno != ENOENT) {
+            if (err_msg)
+                *err_msg = read_err ? read_err : "failed to read index.json";
+            goto out;
+        }
+        /* No index.json yet: start from an empty image-index. */
+        root_json = new_empty_index();
+        if (!root_json) {
+            errno = ENOMEM;
+            if (err_msg)
+                *err_msg = "out of memory building empty index.json";
+            goto out;
+        }
+    }
+
+    cJSON *manifests = cJSON_GetObjectItemCaseSensitive(root_json, "manifests");
+    if (!cJSON_IsArray(manifests)) {
+        /* Corrupt or hand-edited index: rebuild the array so writes still
+         * make progress. The old contents are discarded.
+         */
+        cJSON_DeleteItemFromObject(root_json, "manifests");
+        manifests = cJSON_CreateArray();
+        if (!manifests ||
+            !cJSON_AddItemToObject(root_json, "manifests", manifests)) {
+            cJSON_Delete(manifests);
+            errno = ENOMEM;
+            if (err_msg)
+                *err_msg = "out of memory rebuilding manifests array";
+            goto out;
+        }
+    }
+
+    cJSON *desc = build_descriptor(name, media_type, digest_str, size);
+    if (!desc) {
+        errno = ENOMEM;
+        if (err_msg)
+            *err_msg = "out of memory building pin descriptor";
+        goto out;
+    }
+
+    int existing = find_manifest_index(manifests, name);
+    if (existing >= 0) {
+        /* Replace in place so concurrent re-pulls of the same tag do not
+         * accumulate duplicate descriptors.
+         */
+        if (!cJSON_ReplaceItemInArray(manifests, existing, desc)) {
+            cJSON_Delete(desc);
+            errno = EIO;
+            if (err_msg)
+                *err_msg = "failed to replace existing pin descriptor";
+            goto out;
+        }
+    } else if (!cJSON_AddItemToArray(manifests, desc)) {
+        cJSON_Delete(desc);
+        errno = ENOMEM;
+        if (err_msg)
+            *err_msg = "failed to append pin descriptor";
+        goto out;
+    }
+
+    if (write_index_json(s->root, root_json, err_msg) < 0)
+        goto out;
+
+    rc = 0;
+
+out:
+    saved_errno = errno;
+    cJSON_Delete(root_json);
+    /* close releases the flock per POSIX advisory-lock semantics. */
+    close(lock_fd);
+    errno = saved_errno;
+out_no_lock:
+    saved_errno = errno;
+    free(name);
+    free(media_type);
+    errno = saved_errno;
+    return rc;
+}
+
+/* Look up the digest pinned for ref's canonical name in <root>/index.json.
+ *
+ * s           open store
+ * ref         reference with registry, repository and tag all set
+ * out_digest  receives a heap-allocated digest string the caller frees; set
+ *             to NULL before any lookup is attempted
+ * err_msg     optional; receives a static description on failure
+ *
+ * Returns 0 on success, -1 with errno set on failure. errno is ENOENT when
+ * index.json is missing or holds no descriptor for the ref, and EINVAL when
+ * the stored descriptor has no digest string or one that fails
+ * oci_digest_parse.
+ */
+int oci_store_get_ref(oci_store_t *s,
+                      const oci_ref_t *ref,
+                      char **out_digest,
+                      const char **err_msg)
+{
+    if (!s || !ref || !out_digest || !ref->registry || !ref->repository) {
+        if (err_msg)
+            *err_msg = "invalid arguments";
+        errno = EINVAL;
+        return -1;
+    }
+    *out_digest = NULL;
+    if (!ref->tag) {
+        if (err_msg)
+            *err_msg = "ref has no tag";
+        errno = EINVAL;
+        return -1;
+    }
+
+    char *name = oci_ref_canonical_name(ref);
+    if (!name) {
+        if (err_msg)
+            *err_msg = "failed to render canonical ref name";
+        return -1;
+    }
+
+    const char *read_err = NULL;
+    cJSON *root_json = read_index_json(s->root, &read_err);
+    if (!root_json) {
+        /* Capture errno before free(): older libcs may overwrite it, which
+         * would misclassify the failure and hand the caller a bogus errno.
+         */
+        int saved = errno;
+        free(name);
+        errno = saved;
+        if (saved == ENOENT && err_msg)
+            *err_msg = "ref not pinned in local store";
+        else if (err_msg)
+            *err_msg = read_err ? read_err : "failed to read index.json";
+        return -1;
+    }
+
+    cJSON *manifests = cJSON_GetObjectItemCaseSensitive(root_json, "manifests");
+    int idx = find_manifest_index(manifests, name);
+    free(name);
+    if (idx < 0) {
+        cJSON_Delete(root_json);
+        errno = ENOENT;
+        if (err_msg)
+            *err_msg = "ref not pinned in local store";
+        return -1;
+    }
+
+    const cJSON *entry = cJSON_GetArrayItem(manifests, idx);
+    const cJSON *digest_field =
+        cJSON_GetObjectItemCaseSensitive(entry, "digest");
+    if (!cJSON_IsString(digest_field) || !digest_field->valuestring) {
+        cJSON_Delete(root_json);
+        errno = EINVAL;
+        if (err_msg)
+            *err_msg = "pin descriptor is missing digest field";
+        return -1;
+    }
+
+    /* Re-validate the digest shape so a hand-edited index.json cannot smuggle
+     * a malformed digest back to a caller that trusts the store output.
+     */
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    if (!oci_digest_parse(digest_field->valuestring, &algo, hex)) {
+        cJSON_Delete(root_json);
+        errno = EINVAL;
+        if (err_msg)
+            *err_msg = "pin descriptor digest has invalid shape";
+        return -1;
+    }
+
+    char *copy = strdup(digest_field->valuestring);
+    cJSON_Delete(root_json);
+    if (!copy) {
+        errno = ENOMEM;
+        if (err_msg)
+            *err_msg = "out of memory";
+        return -1;
+    }
+    *out_digest = copy;
+    return 0;
+}
+
+/* Enumerate every (name, digest) pin recorded in <root>/index.json.
+ *
+ * s        open store
+ * out      receives the list; reset to empty before any work is done and
+ *          left empty when index.json is missing, has no manifests array, or
+ *          holds no complete pin. Release with oci_pin_list_free.
+ * err_msg  optional; receives a static description on failure
+ *
+ * Returns 0 on success (including the empty cases), -1 with errno set on
+ * read or allocation failure. Digests are copied as stored; their shape is
+ * not validated here.
+ */
+int oci_store_list_refs(oci_store_t *s,
+                        oci_pin_list_t *out,
+                        const char **err_msg)
+{
+    if (!s || !out) {
+        if (err_msg)
+            *err_msg = "invalid arguments";
+        errno = EINVAL;
+        return -1;
+    }
+    out->items = NULL;
+    out->count = 0;
+
+    const char *read_err = NULL;
+    cJSON *root_json = read_index_json(s->root, &read_err);
+    if (!root_json) {
+        if (errno == ENOENT)
+            return 0;
+        if (err_msg)
+            *err_msg = read_err ? read_err : "failed to read index.json";
+        return -1;
+    }
+
+    cJSON *manifests = cJSON_GetObjectItemCaseSensitive(root_json, "manifests");
+    if (!cJSON_IsArray(manifests)) {
+        cJSON_Delete(root_json);
+        return 0;
+    }
+    int n = cJSON_GetArraySize(manifests);
+    if (n <= 0) {
+        cJSON_Delete(root_json);
+        return 0;
+    }
+
+    /* Sized for the worst case where every array element is a usable pin. */
+    oci_pin_entry_t *items = calloc((size_t) n, sizeof(*items));
+    if (!items) {
+        cJSON_Delete(root_json);
+        errno = ENOMEM;
+        if (err_msg)
+            *err_msg = "out of memory allocating pin list";
+        return -1;
+    }
+    size_t filled = 0;
+    /* Single pass over the list; cJSON_GetArrayItem(i) would rescan from
+     * the head for every element.
+     */
+    const cJSON *entry = NULL;
+    cJSON_ArrayForEach(entry, manifests)
+    {
+        if (!cJSON_IsObject(entry))
+            continue;
+        const cJSON *annots =
+            cJSON_GetObjectItemCaseSensitive(entry, "annotations");
+        const cJSON *name_field =
+            cJSON_IsObject(annots)
+                ? cJSON_GetObjectItemCaseSensitive(annots, ANNOT_REF_NAME)
+                : NULL;
+        const cJSON *digest_field =
+            cJSON_GetObjectItemCaseSensitive(entry, "digest");
+        if (!cJSON_IsString(name_field) || !name_field->valuestring ||
+            !cJSON_IsString(digest_field) || !digest_field->valuestring) {
+            /* Skip schema-incomplete entries: a third-party tool may have
+             * inserted a manifest without the ref-name annotation, in which
+             * case it is not a pin from elfuse's perspective.
+             */
+            continue;
+        }
+        char *name_copy = strdup(name_field->valuestring);
+        char *digest_copy = strdup(digest_field->valuestring);
+        if (!name_copy || !digest_copy) {
+            free(name_copy);
+            free(digest_copy);
+            for (size_t k = 0; k < filled; k++) {
+                free(items[k].name);
+                free(items[k].digest);
+            }
+            free(items);
+            cJSON_Delete(root_json);
+            errno = ENOMEM;
+            if (err_msg)
+                *err_msg = "out of memory copying pin entry";
+            return -1;
+        }
+        items[filled].name = name_copy;
+        items[filled].digest = digest_copy;
+        filled++;
+    }
+    cJSON_Delete(root_json);
+
+    if (filled == 0) {
+        free(items);
+        return 0;
+    }
+
+    out->items = items;
+    out->count = filled;
+    return 0;
+}
+
+/* Release every entry of list and reset it to the empty state. Accepts NULL
+ * and an already-empty list.
+ */
+void oci_pin_list_free(oci_pin_list_t *list)
+{
+    if (!list)
+        return;
+    if (list->items) {
+        for (size_t i = 0; i < list->count; i++) {
+            free(list->items[i].name);
+            free(list->items[i].digest);
+        }
+        free(list->items);
+    }
+    list->items = NULL;
+    list->count = 0;
+}
+
+/* Slurp the manifest-class blob at digest_str into a heap buffer. The
+ * caller frees *out_body. Mirrors the size and bounds checks of
+ * infer_manifest_media_type so a corrupt or hostile blob does not
+ * trigger a multi-GB malloc here. Returns 0 on success or -1 with
+ * errno preserved on failure.
+ */ +static int load_manifest_blob(const oci_store_t *s, + const char *digest_str, + char **out_body, + size_t *out_len) +{ + char path[STORE_PATH_MAX]; + if (blob_path_for_digest(s, digest_str, path, sizeof(path)) < 0) + return -1; + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -1; + struct stat st; + if (fstat(fd, &st) < 0) { + int saved = errno; + close(fd); + errno = saved; + return -1; + } + if (st.st_size <= 0 || st.st_size > (off_t) MAX_MANIFEST_BYTES) { + close(fd); + errno = EINVAL; + return -1; + } + size_t len = (size_t) st.st_size; + char *body = malloc(len + 1); + if (!body) { + close(fd); + errno = ENOMEM; + return -1; + } + size_t off = 0; + while (off < len) { + ssize_t got = read(fd, body + off, len - off); + if (got < 0) { + if (errno == EINTR) + continue; + int saved = errno; + free(body); + close(fd); + errno = saved; + return -1; + } + if (got == 0) + break; + off += (size_t) got; + } + close(fd); + if (off != len) { + free(body); + errno = EIO; + return -1; + } + body[len] = '\0'; + *out_body = body; + *out_len = len; + return 0; +} + +/* True when blobs// for digest_str exists on disk. Errors + * other than ENOENT (permission, ENAMETOOLONG) propagate as "missing" + * because the caller's failure path treats either as fatal for the + * keep-set walk; the distinction is academic. + */ +static bool manifest_blob_exists(const oci_store_t *s, const char *digest_str) +{ + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(digest_str, &algo, hex)) + return false; + return oci_blob_store_has(s->blobs, algo, hex); +} + +/* Recursive expander: ensure digest_str is in out and, if its blob is + * a manifest or image-index, also add every descriptor it references. + * Recursion terminates because oci_digest_set_add is a no-op for any + * digest already in the set, so a cycle (theoretical: an image-index + * pointing at itself) is bounded. 
+ * + * Returns 0 on success, -1 on fatal failure (missing or unparseable + * blob) with errno set and *err populated. + */ +static int expand_manifest_digest(oci_store_t *s, + const char *digest_str, + oci_digest_set_t *out, + const char **err) +{ + if (oci_digest_set_contains(out, digest_str)) + return 0; + if (oci_digest_set_add(out, digest_str) < 0) { + if (err) + *err = "collect_roots: digest_set_add failed"; + return -1; + } + + char *body = NULL; + size_t body_len = 0; + if (load_manifest_blob(s, digest_str, &body, &body_len) < 0) { + if (err) + *err = + "collect_roots: referenced manifest blob is missing or " + "unreadable"; + return -1; + } + + oci_manifest_t manifest = {0}; + const char *perr = NULL; + if (oci_manifest_parse(body, body_len, &manifest, &perr) == 0) { + int rc = 0; + if (oci_digest_set_add(out, manifest.config.digest_str) < 0) { + if (err) + *err = "collect_roots: digest_set_add for config failed"; + rc = -1; + goto manifest_done; + } + for (size_t i = 0; i < manifest.nlayers; i++) { + if (oci_digest_set_add(out, manifest.layers[i].digest_str) < 0) { + if (err) + *err = "collect_roots: digest_set_add for layer failed"; + rc = -1; + goto manifest_done; + } + } + manifest_done: + oci_manifest_free(&manifest); + free(body); + return rc; + } + memset(&manifest, 0, sizeof(manifest)); + + /* Not an image-manifest. Try image-index: a multi-arch index + * references one sub-manifest descriptor per platform. 
+ */ + oci_index_t index = {0}; + const char *ierr = NULL; + if (oci_index_parse(body, body_len, &index, &ierr) < 0) { + free(body); + if (err) + *err = + "collect_roots: blob is neither image-manifest nor " + "image-index"; + errno = EINVAL; + return -1; + } + free(body); + + for (size_t i = 0; i < index.nentries; i++) { + const char *sub = index.entries[i].desc.digest_str; + /* Record the sub-manifest descriptor digest even when the + * blob is not on disk: a multi-arch index legitimately + * references blobs for other platforms that pull never + * fetched, and a sweep must not delete the platforms that + * did materialise. When the blob is present recurse so its + * config + layers join the keep set; when absent the index + * descriptor alone is enough because there is no blob to + * delete. + */ + if (manifest_blob_exists(s, sub)) { + if (expand_manifest_digest(s, sub, out, err) < 0) { + oci_index_free(&index); + return -1; + } + } else if (oci_digest_set_add(out, sub) < 0) { + if (err) + *err = "collect_roots: digest_set_add for sub-manifest failed"; + oci_index_free(&index); + return -1; + } + } + oci_index_free(&index); + return 0; +} + +int oci_store_collect_roots(oci_store_t *s, + oci_digest_set_t *out, + const char *volume_root, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!s || !out) { + *err = "collect_roots: NULL argument"; + errno = EINVAL; + return -1; + } + oci_digest_set_init(out); + + /* Source 1: pins in index.json. list_refs handles the empty case + * (no index.json yet) without surfacing an error, so a fresh + * store contributes zero entries from this source. + */ + oci_pin_list_t pins = {0}; + const char *list_err = NULL; + if (oci_store_list_refs(s, &pins, &list_err) < 0) { + if (err) + *err = list_err ? 
list_err + : "collect_roots: oci_store_list_refs failed"; + return -1; + } + for (size_t i = 0; i < pins.count; i++) { + if (expand_manifest_digest(s, pins.items[i].digest, out, err) < 0) { + oci_pin_list_free(&pins); + oci_digest_set_free(out); + return -1; + } + } + oci_pin_list_free(&pins); + + /* Source 2: unpacked image trees under /images/. + * A NULL volume_root skips this source entirely (callers that + * only need the pin contribution). A missing images/ directory + * is treated as zero contribution by oci_volume_list_unpacked. + */ + if (volume_root) { + oci_volume_list_t trees = {0}; + const char *vlerr = NULL; + if (oci_volume_list_unpacked(volume_root, &trees, &vlerr) < 0) { + if (err) + *err = vlerr ? vlerr + : "collect_roots: volume_list_unpacked failed"; + oci_digest_set_free(out); + return -1; + } + for (size_t i = 0; i < trees.count; i++) { + oci_origin_t origin = {0}; + const char *oerr = NULL; + if (oci_origin_read(trees.items[i], &origin, &oerr) < 0) { + if (err) + *err = oerr ? oerr + : "collect_roots: origin sidecar read failed"; + oci_volume_list_free(&trees); + oci_digest_set_free(out); + return -1; + } + if (expand_manifest_digest(s, origin.manifest_digest, out, err) < + 0) { + oci_origin_free(&origin); + oci_volume_list_free(&trees); + oci_digest_set_free(out); + return -1; + } + oci_origin_free(&origin); + } + oci_volume_list_free(&trees); + } + return 0; +} + +/* Read a legacy pin file at path. The format is a single line of + * ":" optionally followed by \n or \r\n. Trims trailing whitespace + * and validates digest shape. Returns a heap-allocated digest string on + * success, NULL on IO or schema failure with errno preserved. 
+ */
+static char *read_legacy_pin_file(const char *path)
+{
+    int fd = open(path, O_RDONLY | O_CLOEXEC);
+    if (fd < 0)
+        return NULL;
+    /* One read is enough: anything longer than the buffer cannot be a bare
+     * digest line and is rejected by oci_digest_parse below.
+     */
+    char buf[OCI_DIGEST_HEX_MAX + 32];
+    ssize_t got;
+    /* Retry on EINTR. The caller treats NULL as "skip this pin", and the
+     * migration that calls it writes index.json afterwards, so a spurious
+     * signal here would otherwise drop the pin from the migrated index.
+     */
+    do {
+        got = read(fd, buf, sizeof(buf) - 1);
+    } while (got < 0 && errno == EINTR);
+    int saved_errno = errno;
+    close(fd);
+    if (got < 0) {
+        errno = saved_errno;
+        return NULL;
+    }
+    if (got == 0) {
+        errno = EINVAL;
+        return NULL;
+    }
+    buf[got] = '\0';
+    /* Trim trailing newline / carriage return (plus spaces and tabs) so a
+     * Windows-edited pin file does not feed a stray byte into the digest
+     * validator.
+     */
+    while (got > 0 && (buf[got - 1] == '\n' || buf[got - 1] == '\r' ||
+                       buf[got - 1] == ' ' || buf[got - 1] == '\t')) {
+        buf[--got] = '\0';
+    }
+    if (got == 0) {
+        errno = EINVAL;
+        return NULL;
+    }
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    if (!oci_digest_parse(buf, &algo, hex)) {
+        errno = EINVAL;
+        return NULL;
+    }
+    char *copy = strdup(buf);
+    if (!copy) {
+        errno = ENOMEM;
+        return NULL;
+    }
+    return copy;
+}
+
+/* Synthesize a pin descriptor for one legacy refs/ leaf and insert it into
+ * manifests[]. (name, digest_str) describe the pin; the manifest blob must
+ * exist on disk so mediaType + size can be derived. Returns 0 on success,
+ * +1 if the pin should be skipped (missing blob or other recoverable hole),
+ * -1 with errno preserved on an unrecoverable failure.
+ */ +static int migrate_append_descriptor(const oci_store_t *s, + cJSON *manifests, + const char *name, + const char *digest_str) +{ + int64_t size = 0; + if (blob_size(s, digest_str, &size) < 0) { + if (errno == ENOENT) { + fprintf(stderr, + "elfuse oci: migration skipping pin %s: manifest blob " + "%s missing from blobs/\n", + name, digest_str); + return 1; + } + return -1; + } + char *media_type = infer_manifest_media_type(s, digest_str); + if (!media_type) + return -1; + cJSON *desc = build_descriptor(name, media_type, digest_str, size); + free(media_type); + if (!desc) { + errno = ENOMEM; + return -1; + } + /* Pre-existing index entry with the same name should not happen during + * a fresh migration, but defend against a partially-migrated store + * inheriting from an earlier crash: a later refs/ leaf with the same + * canonical name simply replaces the earlier one. */ + int existing = find_manifest_index(manifests, name); + if (existing >= 0) { + if (!cJSON_ReplaceItemInArray(manifests, existing, desc)) { + cJSON_Delete(desc); + errno = EIO; + return -1; + } + } else if (!cJSON_AddItemToArray(manifests, desc)) { + cJSON_Delete(desc); + errno = ENOMEM; + return -1; + } + return 0; +} + +/* Recursive walk of refs/. depth counts directory levels below refs/: + * depth 0 = registry, depth 1.. = repository components, leaf file = tag. + * head and head_len accumulate the relative path so the leaf callback can + * split it into registry/repo/tag. *migrated and *skipped track totals for + * the final log line. Returns 0 on success, -1 on unrecoverable failure. + */ +static int scan_refs_dir(const oci_store_t *s, + cJSON *manifests, + const char *dir_abs, + const char *rel_head, + size_t rel_head_len, + size_t depth, + size_t *migrated, + size_t *skipped) +{ + DIR *dp = opendir(dir_abs); + if (!dp) + return -1; + + int rc = 0; + struct dirent *de; + while ((de = readdir(dp)) != NULL) { + const char *name = de->d_name; + if (name[0] == '.' 
&& + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) + continue; + /* Hidden files (e.g. .DS_Store) are ignored: legacy pins always used + * an unprefixed registry / tag name so a dotfile cannot represent a + * pin and we skip it without surfacing as a migration warning. */ + if (name[0] == '.') + continue; + + size_t name_len = strlen(name); + char child_abs[STORE_PATH_MAX]; + int n = snprintf(child_abs, sizeof(child_abs), "%s/%s", dir_abs, name); + if (n < 0 || (size_t) n >= sizeof(child_abs)) { + errno = ENAMETOOLONG; + rc = -1; + break; + } + char child_rel[STORE_PATH_MAX]; + int rn = rel_head_len == 0 + ? snprintf(child_rel, sizeof(child_rel), "%s", name) + : snprintf(child_rel, sizeof(child_rel), "%s/%s", rel_head, + name); + if (rn < 0 || (size_t) rn >= sizeof(child_rel)) { + errno = ENAMETOOLONG; + rc = -1; + break; + } + + struct stat st; + if (lstat(child_abs, &st) < 0) { + rc = -1; + break; + } + if (S_ISDIR(st.st_mode)) { + if (scan_refs_dir(s, manifests, child_abs, child_rel, (size_t) rn, + depth + 1, migrated, skipped) < 0) { + rc = -1; + break; + } + continue; + } + if (!S_ISREG(st.st_mode)) + continue; + + /* Leaf file. Need at least one repository component plus the tag, so + * total depth must be >= 2 (registry / repo / tag). A leaf that lands + * directly under refs// has no repository component and is + * not a well-formed legacy pin; skip it with a warning so the store + * is not silently lossy. + */ + if (depth < 2) { + fprintf(stderr, + "elfuse oci: migration skipping refs/%s: not deep enough " + "for //\n", + child_rel); + (*skipped)++; + continue; + } + (void) name_len; + + /* Split child_rel = "/(/)*" + * "/". First '/' separates registry; last '/' separates tag. 
+ */ + const char *first_slash = strchr(child_rel, '/'); + const char *last_slash = strrchr(child_rel, '/'); + if (!first_slash || !last_slash || first_slash == last_slash) { + fprintf(stderr, + "elfuse oci: migration skipping refs/%s: malformed path\n", + child_rel); + (*skipped)++; + continue; + } + size_t reg_len = (size_t) (first_slash - child_rel); + size_t repo_len = (size_t) (last_slash - first_slash - 1); + const char *repo_start = first_slash + 1; + const char *tag_start = last_slash + 1; + size_t tag_len = strlen(tag_start); + if (reg_len == 0 || repo_len == 0 || tag_len == 0) { + fprintf(stderr, + "elfuse oci: migration skipping refs/%s: empty path " + "component\n", + child_rel); + (*skipped)++; + continue; + } + + char *digest_str = read_legacy_pin_file(child_abs); + if (!digest_str) { + fprintf(stderr, + "elfuse oci: migration skipping refs/%s: pin file " + "unreadable or malformed\n", + child_rel); + (*skipped)++; + continue; + } + + /* Build canonical "/:" inline (cannot + * borrow oci_ref_canonical_name without round-tripping through the + * parser, which would reject repository components that the legacy + * code path happened to accept). 
*/ + size_t total = reg_len + 1 + repo_len + 1 + tag_len + 1; + char *canon = malloc(total); + if (!canon) { + free(digest_str); + errno = ENOMEM; + rc = -1; + break; + } + char *wp = canon; + memcpy(wp, child_rel, reg_len); + wp += reg_len; + *wp++ = '/'; + memcpy(wp, repo_start, repo_len); + wp += repo_len; + *wp++ = ':'; + memcpy(wp, tag_start, tag_len); + wp += tag_len; + *wp = '\0'; + + int ar = migrate_append_descriptor(s, manifests, canon, digest_str); + free(canon); + free(digest_str); + if (ar < 0) { + rc = -1; + break; + } + if (ar > 0) { + (*skipped)++; + continue; + } + (*migrated)++; + } + closedir(dp); + return rc; +} + +static int migrate_legacy_refs(struct oci_store *s) +{ + char refs_path[STORE_PATH_MAX]; + int n = snprintf(refs_path, sizeof(refs_path), "%s/refs", s->root); + if (n < 0 || (size_t) n >= sizeof(refs_path)) { + errno = ENAMETOOLONG; + return -1; + } + struct stat st; + if (lstat(refs_path, &st) < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + if (!S_ISDIR(st.st_mode)) + return 0; + + char index_path[STORE_PATH_MAX]; + n = snprintf(index_path, sizeof(index_path), "%s/index.json", s->root); + if (n < 0 || (size_t) n >= sizeof(index_path)) { + errno = ENAMETOOLONG; + return -1; + } + if (lstat(index_path, &st) == 0) + return 0; + if (errno != ENOENT) + return -1; + + /* Race window: between the unlocked check and acquire_index_lock a + * concurrent put_ref / open may have written index.json. Re-check under + * the lock and bail out if so, otherwise two migrations would race and + * the later write would partially overwrite a put_ref's descriptor. 
+ */ + const char *lock_err = NULL; + int lock_fd = acquire_index_lock(s->root, &lock_err); + if (lock_fd < 0) + return -1; + + if (lstat(index_path, &st) == 0) { + close(lock_fd); + return 0; + } + if (errno != ENOENT) { + int saved = errno; + close(lock_fd); + errno = saved; + return -1; + } + + cJSON *root_json = new_empty_index(); + if (!root_json) { + close(lock_fd); + errno = ENOMEM; + return -1; + } + cJSON *manifests = cJSON_GetObjectItemCaseSensitive(root_json, "manifests"); + + size_t migrated = 0, skipped = 0; + int rc = + scan_refs_dir(s, manifests, refs_path, "", 0, 0, &migrated, &skipped); + if (rc < 0) { + int saved = errno; + cJSON_Delete(root_json); + close(lock_fd); + errno = saved; + return -1; + } + + /* Write even when migrated == 0 so a future open does not re-probe a + * refs/ tree that turned out to be empty or all-skipped. An empty + * index.json is the documented C2.2 happy-path shape (manifests: []). + */ + const char *write_err = NULL; + if (write_index_json(s->root, root_json, &write_err) < 0) { + int saved = errno; + cJSON_Delete(root_json); + close(lock_fd); + errno = saved; + return -1; + } + cJSON_Delete(root_json); + close(lock_fd); + + if (skipped > 0) { + fprintf(stderr, + "elfuse oci: migrated %zu pin(s) from refs/ to index.json " + "(refs/ kept for downgrade fallback; %zu pin(s) skipped)\n", + migrated, skipped); + } else { + fprintf(stderr, + "elfuse oci: migrated %zu pin(s) from refs/ to index.json " + "(refs/ kept for downgrade fallback)\n", + migrated); + } + return 0; +} + +/* --- Plan 3 C3.3d: layer + stack cache mark walker -------------------- */ + +/* Free a NULL-terminated heap-owned char ** array. */ +static void diff_id_strv_free(char **v) +{ + if (!v) + return; + for (size_t i = 0; v[i]; i++) + free(v[i]); + free((void *) v); +} + +/* Walk a directory tree summing the st_size of every regular file. Symlinks + * and other non-regular entries contribute zero (lstat does not follow). 
A + * missing entry (ENOENT) yields 0 so a concurrent rm cannot make the caller + * undercount what is still on disk. Other directory IO errors are treated as + * zero too because the prune sweep already counted the entry as a candidate + * and a partial size sum here would only shrink the reported reclaim figure; + * the recursive rm in apply_verdicts will surface the real failure. + * + * Duplicate of dedup-metrics.c::sum_tree_size; lift to a shared util when a + * third copy appears (rebuild-cache.c already carries its own rm_recursive + * for the same reason). + */ +static uint64_t dir_tree_size_sum(const char *path) +{ + struct stat st; + if (lstat(path, &st) < 0) + return 0; + if (S_ISREG(st.st_mode)) + return (uint64_t) st.st_size; + if (!S_ISDIR(st.st_mode)) + return 0; + DIR *d = opendir(path); + if (!d) + return 0; + uint64_t total = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[STORE_PATH_MAX]; + int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name); + if (n < 0 || (size_t) n >= sizeof(child)) + continue; + total += dir_tree_size_sum(child); + } + closedir(d); + return total; +} + +/* Walk one image's manifest digest down to the linux/arm64 image-config and + * extract its rootfs.diff_ids as a heap-allocated NULL-terminated char **. 
+ * + * Resolution path: + * - load + try image-manifest parse -> read config descriptor, load + parse + * image-config, return rootfs.diff_ids + * - else try image-index parse -> pick linux/arm64 sub-manifest, recurse + * if its blob is on disk, otherwise return a soft NO_LINUX_ARM64 result + * - else fatal: malformed blob + * + * Return discipline (see diff_id_resolve_t below): SOFT_NONE indicates "no + * linux/arm64 entry in this image-index" or "the picked sub-manifest blob is + * not on disk" and contributes zero to the keep set without surfacing as an + * error; HARD_FAIL indicates a corrupt or missing manifest / config blob and + * propagates as a fatal mark failure so prune cannot later delete reachable + * cache entries. + * + * Duplicate of dedup-metrics.c::resolve_config_digest + load_diff_ids (those + * helpers fold all failures into "skip the image"; the mark walker needs the + * fatal vs soft distinction). + */ +typedef enum { + DIFF_ID_RESOLVE_OK = 0, + DIFF_ID_RESOLVE_SOFT_NONE = 1, + DIFF_ID_RESOLVE_HARD_FAIL = 2, +} diff_id_resolve_t; + +static diff_id_resolve_t resolve_image_diff_ids(oci_store_t *s, + const char *manifest_digest, + char ***out_diff_ids, + const char **err) +{ + *out_diff_ids = NULL; + char *body = NULL; + size_t body_len = 0; + if (load_manifest_blob(s, manifest_digest, &body, &body_len) < 0) { + if (err) + *err = "collect_layer_roots: manifest blob missing or unreadable"; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + + /* Image-manifest path: drill into its image-config. 
*/ + oci_manifest_t mf = {0}; + if (oci_manifest_parse(body, body_len, &mf, NULL) == 0) { + char config_digest[OCI_DIGEST_HEX_MAX + 16]; + int dn = snprintf(config_digest, sizeof(config_digest), "%s", + mf.config.digest_str); + oci_manifest_free(&mf); + free(body); + if (dn < 0 || (size_t) dn >= sizeof(config_digest)) { + if (err) + *err = "collect_layer_roots: config digest overflow"; + errno = ENAMETOOLONG; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + char *cfg_body = NULL; + size_t cfg_len = 0; + if (load_manifest_blob(s, config_digest, &cfg_body, &cfg_len) < 0) { + if (err) + *err = + "collect_layer_roots: image-config blob missing or " + "unreadable"; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + oci_image_config_t cfg = {0}; + if (oci_image_config_parse(cfg_body, cfg_len, &cfg, NULL) < 0) { + free(cfg_body); + if (err) + *err = "collect_layer_roots: image-config blob unparseable"; + errno = EINVAL; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + free(cfg_body); + /* Count and copy the diff_ids. Empty list yields a one-element NULL + * terminator so callers iterate uniformly. */ + size_t n = 0; + while (cfg.rootfs_diff_ids[n]) + n++; + char **copy = (char **) calloc(n + 1, sizeof(*copy)); + if (!copy) { + oci_image_config_free(&cfg); + if (err) + *err = "collect_layer_roots: diff_id strv alloc failed"; + errno = ENOMEM; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + for (size_t i = 0; i < n; i++) { + copy[i] = strdup(cfg.rootfs_diff_ids[i]); + if (!copy[i]) { + diff_id_strv_free(copy); + oci_image_config_free(&cfg); + if (err) + *err = "collect_layer_roots: diff_id strdup failed"; + errno = ENOMEM; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + } + oci_image_config_free(&cfg); + *out_diff_ids = copy; + return DIFF_ID_RESOLVE_OK; + } + memset(&mf, 0, sizeof(mf)); + + /* Image-index path: pick linux/arm64 and recurse. 
*/ + oci_index_t idx = {0}; + if (oci_index_parse(body, body_len, &idx, NULL) < 0) { + free(body); + if (err) + *err = + "collect_layer_roots: blob is neither image-manifest nor " + "image-index"; + errno = EINVAL; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + free(body); + const oci_index_entry_t *picked = oci_index_pick_linux_arm64(&idx); + if (!picked) { + oci_index_free(&idx); + return DIFF_ID_RESOLVE_SOFT_NONE; + } + char *sub_digest = strdup(picked->desc.digest_str); + oci_index_free(&idx); + if (!sub_digest) { + if (err) + *err = "collect_layer_roots: sub-manifest digest strdup failed"; + errno = ENOMEM; + return DIFF_ID_RESOLVE_HARD_FAIL; + } + if (!manifest_blob_exists(s, sub_digest)) { + /* Multi-arch pin where pull never fetched linux/arm64. Contribute + * nothing; the sub-manifest's layers are not on disk so there is + * nothing to keep. Matches expand_manifest_digest's soft policy for + * the same shape under blob mark. + */ + free(sub_digest); + return DIFF_ID_RESOLVE_SOFT_NONE; + } + diff_id_resolve_t rc = + resolve_image_diff_ids(s, sub_digest, out_diff_ids, err); + free(sub_digest); + return rc; +} + +/* Add every diff_id in the NULL-terminated list to *diff_set and every + * ChainID prefix (ChainID(L0..Lk) for k = 0..n-1) to *chain_set. The walker + * threads the running chain through a single buffer; oci_chainid_compute + * already handles the L0 passthrough case via a NULL prev argument. + * + * Returns 0 on success or -1 with errno set on allocation failure inside the + * digest set or chainid composition. err is populated on failure. 
+ */
+static int add_diff_ids_and_chains(char *const *diff_ids,
+                                   oci_digest_set_t *diff_set,
+                                   oci_digest_set_t *chain_set,
+                                   const char **err)
+{
+    /* prev carries the previous iteration's chain id; it is only passed to
+     * oci_chainid_compute from the second layer onwards.
+     */
+    char prev[OCI_DIGEST_HEX_MAX + 16] = "";
+    for (size_t i = 0; diff_ids[i]; i++) {
+        if (oci_digest_set_add(diff_set, diff_ids[i]) < 0) {
+            if (err)
+                *err = "collect_layer_roots: diff_id set add failed";
+            return -1;
+        }
+        char chain[OCI_DIGEST_HEX_MAX + 16];
+        const char *prev_arg = (i == 0) ? NULL : prev;
+        if (oci_chainid_compute(prev_arg, diff_ids[i], chain, sizeof(chain)) <
+            0) {
+            if (err)
+                *err = "collect_layer_roots: chainid compute failed";
+            return -1;
+        }
+        /* Copy including the NUL; prev and chain have the same capacity. */
+        memcpy(prev, chain, strlen(chain) + 1);
+        if (oci_digest_set_add(chain_set, chain) < 0) {
+            if (err)
+                *err = "collect_layer_roots: chain set add failed";
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Build the diff_id and ChainID keep sets consumed by the layer and stack
+ * cache sweeps.
+ *
+ * Initialises both output sets, then unions in (via add_diff_ids_and_chains)
+ * the diff_ids of every pin returned by oci_store_list_refs and, when
+ * volume_root is non-NULL, of every tree returned by
+ * oci_volume_list_unpacked whose origin sidecar carries a layer_diffids
+ * list. Pins that resolve to DIFF_ID_RESOLVE_SOFT_NONE are skipped; a
+ * DIFF_ID_RESOLVE_HARD_FAIL aborts the whole collection.
+ *
+ * err may be NULL. Returns 0 on success and -1 on failure with *err set.
+ * Once the output sets have been initialised, every failure path frees both
+ * of them before returning; the NULL-argument check (errno = EINVAL) runs
+ * before initialisation and leaves them untouched.
+ */
+int oci_store_collect_layer_roots(oci_store_t *s,
+                                  oci_digest_set_t *out_diff_ids,
+                                  oci_digest_set_t *out_chain_ids,
+                                  const char *volume_root,
+                                  const char **err)
+{
+    /* Redirect a NULL err to a throwaway slot so the body can assign to
+     * *err unconditionally.
+     */
+    static const char *dummy_err;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (!s || !out_diff_ids || !out_chain_ids) {
+        *err = "collect_layer_roots: NULL argument";
+        errno = EINVAL;
+        return -1;
+    }
+    oci_digest_set_init(out_diff_ids);
+    oci_digest_set_init(out_chain_ids);
+
+    /* Source 1: pins in index.json. */
+    oci_pin_list_t pins = {0};
+    const char *list_err = NULL;
+    if (oci_store_list_refs(s, &pins, &list_err) < 0) {
+        *err = list_err ? list_err
+                        : "collect_layer_roots: oci_store_list_refs failed";
+        oci_digest_set_free(out_diff_ids);
+        oci_digest_set_free(out_chain_ids);
+        return -1;
+    }
+    for (size_t i = 0; i < pins.count; i++) {
+        char **diff_ids = NULL;
+        diff_id_resolve_t rr =
+            resolve_image_diff_ids(s, pins.items[i].digest, &diff_ids, err);
+        if (rr == DIFF_ID_RESOLVE_HARD_FAIL) {
+            oci_pin_list_free(&pins);
+            oci_digest_set_free(out_diff_ids);
+            oci_digest_set_free(out_chain_ids);
+            return -1;
+        }
+        /* Soft miss: this pin contributes nothing to either set. */
+        if (rr == DIFF_ID_RESOLVE_SOFT_NONE)
+            continue;
+        int ac =
+            add_diff_ids_and_chains(diff_ids, out_diff_ids, out_chain_ids, err);
+        diff_id_strv_free(diff_ids);
+        if (ac < 0) {
+            oci_pin_list_free(&pins);
+            oci_digest_set_free(out_diff_ids);
+            oci_digest_set_free(out_chain_ids);
+            return -1;
+        }
+    }
+    oci_pin_list_free(&pins);
+
+    /* Source 2: unpacked image trees under the volume root, as enumerated
+     * by oci_volume_list_unpacked. The origin sidecar already carries the
+     * resolved diff_id list so no blob read is required here.
+     */
+    if (volume_root) {
+        oci_volume_list_t trees = {0};
+        const char *vlerr = NULL;
+        if (oci_volume_list_unpacked(volume_root, &trees, &vlerr) < 0) {
+            *err = vlerr ? vlerr
+                         : "collect_layer_roots: volume_list_unpacked failed";
+            oci_digest_set_free(out_diff_ids);
+            oci_digest_set_free(out_chain_ids);
+            return -1;
+        }
+        for (size_t i = 0; i < trees.count; i++) {
+            oci_origin_t origin = {0};
+            const char *oerr = NULL;
+            if (oci_origin_read(trees.items[i], &origin, &oerr) < 0) {
+                *err = oerr ? oerr
+                            : "collect_layer_roots: origin sidecar read failed";
+                oci_volume_list_free(&trees);
+                oci_digest_set_free(out_diff_ids);
+                oci_digest_set_free(out_chain_ids);
+                return -1;
+            }
+            /* A sidecar without a diff_id list adds nothing. */
+            if (origin.layer_diffids) {
+                if (add_diff_ids_and_chains(origin.layer_diffids, out_diff_ids,
+                                            out_chain_ids, err) < 0) {
+                    oci_origin_free(&origin);
+                    oci_volume_list_free(&trees);
+                    oci_digest_set_free(out_diff_ids);
+                    oci_digest_set_free(out_chain_ids);
+                    return -1;
+                }
+            }
+            oci_origin_free(&origin);
+        }
+        oci_volume_list_free(&trees);
+    }
+    return 0;
+}
+
+/* Algorithm set this build expects to find under blobs/. Other algorithm
+ * subdirectories (a future operator hand-created sha384/, for instance)
+ * are left untouched: sweep only inspects directories it recognises.
+ */
+static const oci_digest_algo_t PRUNE_ALGOS[] = {
+    OCI_DIGEST_SHA256,
+    OCI_DIGEST_SHA512,
+};
+
+/* One dangling-blob entry produced by the classify phase and consumed
+ * by the apply phase. path is heap-owned. verdict starts at PRUNE and
+ * may be flipped to SKIP by the older-than veto or the keep-bytes
+ * budget. size is the on-disk byte count (st_size at classify time);
+ * mtime is st_mtime, used as the sort key for the LRU budget and the
+ * comparison source for the older-than cutoff.
+ */
+typedef enum {
+    PRUNE_VERDICT_PRUNE = 0,
+    PRUNE_VERDICT_SKIP = 1,
+} prune_verdict_t;
+
+typedef struct {
+    char *path;
+    uint64_t size;
+    time_t mtime;
+    prune_verdict_t verdict;
+} prune_candidate_t;
+
+/* Growable array of candidates: count entries in use out of cap allocated. */
+typedef struct {
+    prune_candidate_t *items;
+    size_t count;
+    size_t cap;
+} prune_candidate_list_t;
+
+/* Append one dangling-blob entry. Doubles cap from 32 so the realloc
+ * cost amortizes across a typical store's tens-to-hundreds of blobs.
+ * On alloc failure returns -1 with errno=ENOMEM; the caller is
+ * responsible for cleaning up entries staged so far via
+ * prune_candidate_list_free.
+ */ +static int prune_candidate_list_append(prune_candidate_list_t *list, + char *path, + uint64_t size, + time_t mtime) +{ + if (list->count == list->cap) { + size_t new_cap = list->cap ? list->cap * 2 : 32; + prune_candidate_t *grown = + realloc(list->items, new_cap * sizeof(*grown)); + if (!grown) { + errno = ENOMEM; + return -1; + } + list->items = grown; + list->cap = new_cap; + } + list->items[list->count].path = path; + list->items[list->count].size = size; + list->items[list->count].mtime = mtime; + list->items[list->count].verdict = PRUNE_VERDICT_PRUNE; + list->count++; + return 0; +} + +static void prune_candidate_list_free(prune_candidate_list_t *list) +{ + if (!list) + return; + for (size_t i = 0; i < list->count; i++) + free(list->items[i].path); + free(list->items); + list->items = NULL; + list->count = 0; + list->cap = 0; +} + +/* qsort comparator over an indirection array of candidate pointers + * (prune_candidate_t **). Sort key is ascending mtime with the path + * as tie-breaker so order stays deterministic on stores that + * materialize blobs in quick succession (the test suite needs + * stable LRU picks against a fixture). + */ +static int prune_candidate_ptr_cmp_mtime_asc(const void *a, const void *b) +{ + const prune_candidate_t *pa = *(const prune_candidate_t *const *) a; + const prune_candidate_t *pb = *(const prune_candidate_t *const *) b; + if (pa->mtime < pb->mtime) + return -1; + if (pa->mtime > pb->mtime) + return 1; + return strcmp(pa->path, pb->path); +} + +/* The three cache families share a single sweep pipeline. Family selects + * how apply_verdicts removes a PRUNE-verdict entry (unlink vs recursive rm) + * and informs diagnostics; the classify and filter passes operate uniformly + * on prune_candidate_list_t. + */ +typedef enum { + PRUNE_FAMILY_BLOB = 0, /* /blobs// regular files */ + PRUNE_FAMILY_TREE = 1, /* /layers/.../// directories */ +} prune_family_t; + +/* Classify one blobs// directory. 
For every regular file whose + * name is a valid lowercase hex digest of the right length, build the + * canonical ":" digest, look it up in the keep set, and + * either bump *out_kept (reachable) or append a candidate (dangling). + * lstat ENOENT mid-walk is treated as a concurrent prune and skipped + * silently. Subdirectories, dotfiles, and otherwise-shaped entries + * pass through untouched so the OCI image-layout spec's regular-blob + * convention is preserved without trampling foreign state. + * + * Returns 0 on success and -1 on unrecoverable IO failure with errno + * preserved. + */ +static int classify_algo_dir(oci_store_t *s, + oci_digest_algo_t algo, + const oci_digest_set_t *keep, + size_t *out_kept, + prune_candidate_list_t *list, + const char **err) +{ + const char *algo_name = oci_digest_algo_name(algo); + if (!algo_name) { + if (err) + *err = "prune: unknown digest algorithm"; + errno = EINVAL; + return -1; + } + + char dir_path[STORE_PATH_MAX]; + int n = + snprintf(dir_path, sizeof(dir_path), "%s/blobs/%s", s->root, algo_name); + if (n < 0 || (size_t) n >= sizeof(dir_path)) { + if (err) + *err = "prune: blobs/ path exceeds STORE_PATH_MAX"; + errno = ENAMETOOLONG; + return -1; + } + + DIR *dp = opendir(dir_path); + if (!dp) { + if (errno == ENOENT) + return 0; + if (err) + *err = "prune: opendir on blobs/ failed"; + return -1; + } + + int rc = 0; + struct dirent *de; + size_t hex_len = oci_digest_hex_len(algo); + while ((de = readdir(dp)) != NULL) { + const char *name = de->d_name; + if (name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) + continue; + if (name[0] == '.') + continue; + /* Reject anything that is not the expected hex shape before + * paying for an lstat. This both filters subdirectories + * (whose names rarely happen to be 64 hex chars) and shields + * the digest_set lookup from non-blob filenames. 
+ */ + if (strlen(name) != hex_len) + continue; + if (!oci_digest_hex_valid(algo, name)) + continue; + + char blob_path[STORE_PATH_MAX]; + int bn = + snprintf(blob_path, sizeof(blob_path), "%s/%s", dir_path, name); + if (bn < 0 || (size_t) bn >= sizeof(blob_path)) { + if (err) + *err = "prune: blob path exceeds STORE_PATH_MAX"; + errno = ENAMETOOLONG; + rc = -1; + break; + } + + struct stat st; + if (lstat(blob_path, &st) < 0) { + if (errno == ENOENT) + continue; + if (err) + *err = "prune: lstat on blob failed"; + rc = -1; + break; + } + if (!S_ISREG(st.st_mode)) + continue; + + char digest[OCI_DIGEST_HEX_MAX + 16]; + int dn = snprintf(digest, sizeof(digest), "%s:%s", algo_name, name); + if (dn < 0 || (size_t) dn >= sizeof(digest)) { + if (err) + *err = "prune: digest string buffer too small"; + errno = ENAMETOOLONG; + rc = -1; + break; + } + + if (oci_digest_set_contains(keep, digest)) { + (*out_kept)++; + continue; + } + + char *path_copy = strdup(blob_path); + if (!path_copy) { + if (err) + *err = "prune: strdup blob path failed"; + errno = ENOMEM; + rc = -1; + break; + } + if (prune_candidate_list_append(list, path_copy, (uint64_t) st.st_size, + st.st_mtime) < 0) { + free(path_copy); + if (err) + *err = "prune: candidate list grow failed"; + rc = -1; + break; + } + } + closedir(dp); + return rc; +} + +/* Classify one tree-shaped cache directory (layers// or + * layers/stacks//). The base_subpath argument is the relative path + * beneath the store root, e.g. "layers/sha256" or "layers/stacks/sha256"; + * it lets one helper drive both the raw layer cache and the ChainID-keyed + * stack cache without duplicating the dir-walk plumbing. + * + * For every immediate child whose name is a valid lowercase hex digest of + * the right length AND whose lstat reports a directory, compose the + * canonical ":" digest and look it up in the keep set. 
Misses + * (dangling cache entries) get appended to the candidate list together with + * the recursive size of the entry tree and the directory's own st_mtime + * (set by rename(2) at commit time, so newer entries sort newer). Hits + * bump *out_kept. + * + * Non-directory entries, dotfiles, sibling .schema / .staging markers, and + * malformed names are all skipped silently so the caller never deletes + * foreign state. Missing base directory (fresh store before any unpack) + * yields 0 with no entries. Other IO failures are fatal. + */ +static int classify_tree_cache_dir(oci_store_t *s, + const char *base_subpath, + oci_digest_algo_t algo, + const oci_digest_set_t *keep, + size_t *out_kept, + prune_candidate_list_t *list, + const char **err) +{ + const char *algo_name = oci_digest_algo_name(algo); + if (!algo_name) { + if (err) + *err = "prune: unknown digest algorithm"; + errno = EINVAL; + return -1; + } + + char dir_path[STORE_PATH_MAX]; + int n = + snprintf(dir_path, sizeof(dir_path), "%s/%s", s->root, base_subpath); + if (n < 0 || (size_t) n >= sizeof(dir_path)) { + if (err) + *err = "prune: tree-cache path exceeds STORE_PATH_MAX"; + errno = ENAMETOOLONG; + return -1; + } + + DIR *dp = opendir(dir_path); + if (!dp) { + if (errno == ENOENT) + return 0; + if (err) + *err = "prune: opendir on tree-cache dir failed"; + return -1; + } + + int rc = 0; + struct dirent *de; + size_t hex_len = oci_digest_hex_len(algo); + while ((de = readdir(dp)) != NULL) { + const char *name = de->d_name; + if (name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' 
&& name[2] == '\0'))) + continue; + if (name[0] == '.') + continue; + if (strlen(name) != hex_len) + continue; + if (!oci_digest_hex_valid(algo, name)) + continue; + + char entry_path[STORE_PATH_MAX]; + int en = + snprintf(entry_path, sizeof(entry_path), "%s/%s", dir_path, name); + if (en < 0 || (size_t) en >= sizeof(entry_path)) { + if (err) + *err = "prune: tree-cache entry path exceeds STORE_PATH_MAX"; + errno = ENAMETOOLONG; + rc = -1; + break; + } + + struct stat st; + if (lstat(entry_path, &st) < 0) { + if (errno == ENOENT) + continue; + if (err) + *err = "prune: lstat on tree-cache entry failed"; + rc = -1; + break; + } + if (!S_ISDIR(st.st_mode)) + continue; + + char digest[OCI_DIGEST_HEX_MAX + 16]; + int dn = snprintf(digest, sizeof(digest), "%s:%s", algo_name, name); + if (dn < 0 || (size_t) dn >= sizeof(digest)) { + if (err) + *err = "prune: digest string buffer too small"; + errno = ENAMETOOLONG; + rc = -1; + break; + } + + if (oci_digest_set_contains(keep, digest)) { + (*out_kept)++; + continue; + } + + uint64_t tree_bytes = dir_tree_size_sum(entry_path); + char *path_copy = strdup(entry_path); + if (!path_copy) { + if (err) + *err = "prune: strdup tree-cache path failed"; + errno = ENOMEM; + rc = -1; + break; + } + if (prune_candidate_list_append(list, path_copy, tree_bytes, + st.st_mtime) < 0) { + free(path_copy); + if (err) + *err = "prune: candidate list grow failed"; + rc = -1; + break; + } + } + closedir(dp); + return rc; +} + +/* Apply the C1.4 filter passes to the candidate list. Both passes + * mutate verdict only; nothing is unlinked here. The caller invokes + * apply_verdicts afterwards to count + (when commit) unlink. + * + * older-than veto (B1) inspects each candidate independently: when + * older_than_sec is non-zero and (now - mtime) is less than the + * cutoff, verdict flips to SKIP. 
now is provided by the caller so a + * single time(NULL) snapshot drives the whole filter pass (avoids + * the boundary case where a candidate flips between PRUNE and SKIP + * across two sequential time(NULL) reads). + * + * keep-bytes budget (B2) operates over the candidates still in PRUNE + * state after B1. Their pointers are gathered, sorted by mtime + * ascending, and walked newest-first. The newest candidates whose + * cumulative size fits keep_bytes flip to SKIP; the first candidate + * that does not fit terminates the walk so any older candidate stays + * in PRUNE even if its own size would have fit alone. This matches + * LRU semantics: oldest evicted first, regardless of size. + */ +static int apply_filters(oci_store_prune_options_t *opts, + prune_candidate_list_t *list, + time_t now, + const char **err) +{ + if (opts->older_than_sec > 0) { + time_t cutoff = (time_t) opts->older_than_sec; + for (size_t i = 0; i < list->count; i++) { + if (list->items[i].verdict != PRUNE_VERDICT_PRUNE) + continue; + time_t age = now - list->items[i].mtime; + if (age < cutoff) + list->items[i].verdict = PRUNE_VERDICT_SKIP; + } + } + + if (opts->keep_bytes > 0 && list->count > 0) { + prune_candidate_t **active = + (prune_candidate_t **) malloc(list->count * sizeof(*active)); + if (!active) { + if (err) + *err = "prune: out of memory ranking candidates"; + errno = ENOMEM; + return -1; + } + size_t na = 0; + for (size_t i = 0; i < list->count; i++) { + if (list->items[i].verdict == PRUNE_VERDICT_PRUNE) + active[na++] = &list->items[i]; + } + if (na > 0) { + /* Sort the active subset through an indirection array so + * the candidate-list iteration order in apply_verdicts + * stays in insertion order (the test suite is easier to + * reason about when path verdicts read in the same order + * the classify phase produced them). 
+ */ + qsort((void *) active, na, sizeof(*active), + prune_candidate_ptr_cmp_mtime_asc); + uint64_t running = 0; + for (ssize_t i = (ssize_t) na - 1; i >= 0; i--) { + /* Use unsigned arithmetic with an overflow guard so a + * pathological size never wraps the accumulator. + */ + uint64_t next = running + active[i]->size; + if (next < running) { + /* Overflow: cannot fit any more blobs under the + * budget, so stop reclassifying. + */ + break; + } + if (next <= opts->keep_bytes) { + active[i]->verdict = PRUNE_VERDICT_SKIP; + running = next; + } else { + break; + } + } + } + free((void *) active); + } + return 0; +} + +/* Forward declaration for the recursive rm helper used by the TREE family + * removal path below; the implementation lives with the layer cache helpers + * later in this file. + */ +static int layer_stage_rm(const char *path); + +/* Materialise filter verdicts onto disk and stats. Every candidate + * contributes to exactly one output bucket: SKIP -> skipped_*, PRUNE + * -> pruned_* (and removal when commit). The removal failure policy + * mirrors C1.3: ENOENT is treated as a concurrent prune and counted + * silently; any other errno is fatal so the caller's stats never + * report bytes we did not actually reclaim. family selects the + * removal primitive: BLOB uses unlink(2) on a regular file; TREE uses + * layer_stage_rm on a directory subtree so a populated cache entry is + * taken down in one call. The four output pointers let the caller + * route the counters into the per-family stats fields (kept lives in + * classify; this function only writes pruned + skipped). 
+ */
+static int apply_verdicts(prune_candidate_list_t *list,
+                          bool commit,
+                          prune_family_t family,
+                          size_t *out_pruned_count,
+                          uint64_t *out_pruned_bytes,
+                          size_t *out_skipped_count,
+                          uint64_t *out_skipped_bytes,
+                          const char **err)
+{
+    for (size_t i = 0; i < list->count; i++) {
+        if (list->items[i].verdict == PRUNE_VERDICT_SKIP) {
+            (*out_skipped_count)++;
+            *out_skipped_bytes += list->items[i].size;
+            continue;
+        }
+        /* The pruned counters are bumped before any removal is attempted,
+         * so a dry run (commit == false) reports the same totals as a
+         * committed run would.
+         */
+        (*out_pruned_count)++;
+        *out_pruned_bytes += list->items[i].size;
+        if (!commit)
+            continue;
+        if (family == PRUNE_FAMILY_BLOB) {
+            if (unlink(list->items[i].path) < 0) {
+                if (errno == ENOENT)
+                    continue;
+                if (err)
+                    *err = "prune: unlink on dangling blob failed";
+                return -1;
+            }
+        } else {
+            /* TREE: recursive rm tolerates ENOENT internally via lstat
+             * but still returns -1 on any other failure mid-walk. The
+             * stats already count the entry as pruned so a partial
+             * teardown that succeeds for some children leaves the stats
+             * consistent with what was actually freed.
+             */
+            if (layer_stage_rm(list->items[i].path) < 0) {
+                if (errno == ENOENT)
+                    continue;
+                if (err)
+                    *err =
+                        "prune: recursive rm on dangling cache entry "
+                        "failed";
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+/* Mark-and-sweep garbage collection over the store's blob, layer, and stack
+ * caches.
+ *
+ * Resets every counter field in *opts, takes the index lock, builds the
+ * three keep sets (oci_store_collect_roots for blobs,
+ * oci_store_collect_layer_roots for diff_ids and chain ids), then runs
+ * classify -> apply_filters -> apply_verdicts once per family. Entries are
+ * only removed from disk when opts->commit is set; the counters are filled
+ * in either way.
+ *
+ * err may be NULL. Returns 0 on success and -1 on failure; the errno value
+ * present at the done: label is restored after the keep sets are freed and
+ * the lock descriptor is closed.
+ */
+int oci_store_prune(oci_store_t *s,
+                    oci_store_prune_options_t *opts,
+                    const char **err)
+{
+    /* Redirect a NULL err to a throwaway slot so the body can assign to
+     * *err unconditionally.
+     */
+    static const char *dummy_err;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (!s || !opts) {
+        *err = "prune: NULL argument";
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* Zero the output counters up front; the sweep below only adds. */
+    opts->kept_blobs = 0;
+    opts->pruned_blobs = 0;
+    opts->pruned_bytes = 0;
+    opts->skipped_blobs = 0;
+    opts->skipped_bytes = 0;
+    opts->kept_layers = 0;
+    opts->pruned_layers = 0;
+    opts->pruned_layer_bytes = 0;
+    opts->skipped_layers = 0;
+    opts->skipped_layer_bytes = 0;
+    opts->kept_stacks = 0;
+    opts->pruned_stacks = 0;
+    opts->pruned_stack_bytes = 0;
+    opts->skipped_stacks = 0;
+    opts->skipped_stack_bytes = 0;
+
+    /* Serialize against oci_store_put_ref so a pull cannot publish a
+     * new pin between the mark snapshot and the sweep. Mark and sweep
+     * for all three cache families share this single lock window so
+     * the blob keep set, the diff_id keep set, and the chain_id keep
+     * set are derived from one consistent view of pins + unpacked
+     * sysroots.
+     */
+    int lock_fd = acquire_index_lock(s->root, err);
+    if (lock_fd < 0)
+        return -1;
+
+    /* Mark phase: build three keep sets in one window. oci_digest_set_free
+     * is safe on zero-initialised structs so a partial mark still cleans
+     * up correctly via the single done: label below.
+     */
+    oci_digest_set_t keep_blobs = {0};
+    oci_digest_set_t keep_diff_ids = {0};
+    oci_digest_set_t keep_chain_ids = {0};
+    int rc = 0;
+    if (oci_store_collect_roots(s, &keep_blobs, opts->volume_root, err) < 0) {
+        rc = -1;
+        goto done;
+    }
+    if (oci_store_collect_layer_roots(s, &keep_diff_ids, &keep_chain_ids,
+                                      opts->volume_root, err) < 0) {
+        rc = -1;
+        goto done;
+    }
+
+    /* Sweep phase: each family classifies, filters, and applies independently
+     * against its own keep set and candidate list. The filter passes use the
+     * same opts->older_than_sec / opts->keep_bytes inputs but each family
+     * runs its own keep-bytes budget so a fat blob cannot crowd a layer
+     * eviction (or vice versa) off a shared global budget.
+     */
+    time_t now = time(NULL);
+
+    /* Family 1: blobs */
+    prune_candidate_list_t blob_candidates = {0};
+    for (size_t i = 0; i < sizeof(PRUNE_ALGOS) / sizeof(PRUNE_ALGOS[0]); i++) {
+        if (classify_algo_dir(s, PRUNE_ALGOS[i], &keep_blobs, &opts->kept_blobs,
+                              &blob_candidates, err) < 0) {
+            prune_candidate_list_free(&blob_candidates);
+            rc = -1;
+            goto done;
+        }
+    }
+    if (apply_filters(opts, &blob_candidates, now, err) < 0) {
+        prune_candidate_list_free(&blob_candidates);
+        rc = -1;
+        goto done;
+    }
+    if (apply_verdicts(&blob_candidates, opts->commit, PRUNE_FAMILY_BLOB,
+                       &opts->pruned_blobs, &opts->pruned_bytes,
+                       &opts->skipped_blobs, &opts->skipped_bytes, err) < 0) {
+        prune_candidate_list_free(&blob_candidates);
+        rc = -1;
+        goto done;
+    }
+    prune_candidate_list_free(&blob_candidates);
+
+    /* Family 2: layers/<algo>/<hex>/ raw cache directories */
+    prune_candidate_list_t layer_candidates = {0};
+    for (size_t i = 0; i < sizeof(PRUNE_ALGOS) / sizeof(PRUNE_ALGOS[0]); i++) {
+        const char *algo_name = oci_digest_algo_name(PRUNE_ALGOS[i]);
+        char base[STORE_PATH_MAX];
+        int bn = snprintf(base, sizeof(base), "layers/%s", algo_name);
+        if (bn < 0 || (size_t) bn >= sizeof(base)) {
+            *err = "prune: layers/ subpath overflow";
+            errno = ENAMETOOLONG;
+            prune_candidate_list_free(&layer_candidates);
+            rc = -1;
+            goto done;
+        }
+        if (classify_tree_cache_dir(s, base, PRUNE_ALGOS[i], &keep_diff_ids,
+                                    &opts->kept_layers, &layer_candidates,
+                                    err) < 0) {
+            prune_candidate_list_free(&layer_candidates);
+            rc = -1;
+            goto done;
+        }
+    }
+    if (apply_filters(opts, &layer_candidates, now, err) < 0) {
+        prune_candidate_list_free(&layer_candidates);
+        rc = -1;
+        goto done;
+    }
+    if (apply_verdicts(&layer_candidates, opts->commit, PRUNE_FAMILY_TREE,
+                       &opts->pruned_layers, &opts->pruned_layer_bytes,
+                       &opts->skipped_layers, &opts->skipped_layer_bytes,
+                       err) < 0) {
+        prune_candidate_list_free(&layer_candidates);
+        rc = -1;
+        goto done;
+    }
+    prune_candidate_list_free(&layer_candidates);
+
+    /* Family 3: layers/stacks/<algo>/<hex>/ ChainID-keyed snapshots */
+    prune_candidate_list_t stack_candidates = {0};
+    for (size_t i = 0; i < sizeof(PRUNE_ALGOS) / sizeof(PRUNE_ALGOS[0]); i++) {
+        const char *algo_name = oci_digest_algo_name(PRUNE_ALGOS[i]);
+        char base[STORE_PATH_MAX];
+        int bn = snprintf(base, sizeof(base), "layers/stacks/%s", algo_name);
+        if (bn < 0 || (size_t) bn >= sizeof(base)) {
+            *err = "prune: layers/stacks/ subpath overflow";
+            errno = ENAMETOOLONG;
+            prune_candidate_list_free(&stack_candidates);
+            rc = -1;
+            goto done;
+        }
+        if (classify_tree_cache_dir(s, base, PRUNE_ALGOS[i], &keep_chain_ids,
+                                    &opts->kept_stacks, &stack_candidates,
+                                    err) < 0) {
+            prune_candidate_list_free(&stack_candidates);
+            rc = -1;
+            goto done;
+        }
+    }
+    if (apply_filters(opts, &stack_candidates, now, err) < 0) {
+        prune_candidate_list_free(&stack_candidates);
+        rc = -1;
+        goto done;
+    }
+    if (apply_verdicts(&stack_candidates, opts->commit, PRUNE_FAMILY_TREE,
+                       &opts->pruned_stacks, &opts->pruned_stack_bytes,
+                       &opts->skipped_stacks, &opts->skipped_stack_bytes,
+                       err) < 0) {
+        prune_candidate_list_free(&stack_candidates);
+        rc = -1;
+        goto done;
+    }
+    prune_candidate_list_free(&stack_candidates);
+
+    /* Common exit: capture errno first so the frees and close() below
+     * cannot overwrite the failure code the caller will inspect.
+     */
+done:;
+    int saved = errno;
+    oci_digest_set_free(&keep_chain_ids);
+    oci_digest_set_free(&keep_diff_ids);
+    oci_digest_set_free(&keep_blobs);
+    close(lock_fd);
+    errno = saved;
+    return rc;
+}
+
+/* --- Plan 3 C3.2: layer cache helpers ---------------------------------- */
+
+/* Parse a "<algo>:<hex>" digest into its components and the lowercase
+ * algorithm name used as the cache subdir. Shared by the per-layer raw
+ * cache (keyed by diff_id) and the ChainID-keyed stack cache (C3.3c),
+ * both of which materialise as <root>/layers/.../<algo>/<hex>/ on disk.
+ * Validation matches the digest library; oci_digest_parse already rejects
+ * unknown algos and bad hex.
+ *
+ * out_hex must have room for OCI_DIGEST_HEX_MAX + 1 bytes (the size of the
+ * local buffer it is copied from). Returns 0 on success, -1 with
+ * errno = EINVAL on a NULL / empty / unparseable digest.
+ */
+static int parse_digest_for_cache_dir(const char *digest_str,
+                                      oci_digest_algo_t *out_algo,
+                                      char *out_hex,
+                                      const char **out_algo_name)
+{
+    if (!digest_str || !*digest_str) {
+        errno = EINVAL;
+        return -1;
+    }
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    if (!oci_digest_parse(digest_str, &algo, hex)) {
+        errno = EINVAL;
+        return -1;
+    }
+    const char *name = oci_digest_algo_name(algo);
+    if (!name) {
+        errno = EINVAL;
+        return -1;
+    }
+    /* Outputs are only written once every check has passed. */
+    *out_algo = algo;
+    memcpy(out_hex, hex, strlen(hex) + 1);
+    *out_algo_name = name;
+    return 0;
+}
+
+/* Report whether the raw layer cache holds an entry for diff_id.
+ *
+ * Returns 1 when <root>/layers/<algo>/<hex> exists and is a directory, 0
+ * when stat reports ENOENT, and -1 with errno set otherwise: EINVAL for a
+ * NULL store or unparseable digest, ENAMETOOLONG when the path does not fit
+ * STORE_PATH_MAX, ENOTDIR when the entry is not a directory, or whatever
+ * stat reported. stat (not lstat) is used, so symlinks are followed.
+ */
+int oci_store_layer_has(oci_store_t *s, const char *diff_id)
+{
+    if (!s) {
+        errno = EINVAL;
+        return -1;
+    }
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    const char *algo_name = NULL;
+    if (parse_digest_for_cache_dir(diff_id, &algo, hex, &algo_name) < 0)
+        return -1;
+
+    char path[STORE_PATH_MAX];
+    int n = snprintf(path, sizeof(path), "%s/layers/%s/%s", s->root, algo_name,
+                     hex);
+    if (n < 0 || (size_t) n >= sizeof(path)) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    struct stat st;
+    if (stat(path, &st) < 0) {
+        if (errno == ENOENT)
+            return 0;
+        return -1;
+    }
+    if (!S_ISDIR(st.st_mode)) {
+        errno = ENOTDIR;
+        return -1;
+    }
+    return 1;
+}
+
+/* Format the cache directory path for diff_id,
+ * "<root>/layers/<algo>/<hex>/" (note the trailing slash), into out, which
+ * holds cap bytes. This is string formatting only; the path is not checked
+ * for existence. Returns 0 on success, -1 with errno = EINVAL (NULL store /
+ * buffer, zero cap, bad digest) or ENAMETOOLONG (result does not fit).
+ */
+int oci_store_layer_resolve(oci_store_t *s,
+                            const char *diff_id,
+                            char *out,
+                            size_t cap)
+{
+    if (!s || !out || cap == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    const char *algo_name = NULL;
+    if (parse_digest_for_cache_dir(diff_id, &algo, hex, &algo_name) < 0)
+        return -1;
+    int n = snprintf(out, cap, "%s/layers/%s/%s/", s->root, algo_name, hex);
+    if (n < 0 || (size_t) n >= cap) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    return 0;
+}
+
+/* Produce 12 lowercase hex chars from 6 random bytes. Mirrors the local
+ * rand_hex helpers in src/oci/unpack.c and src/oci/clone-rootfs.c; kept
+ * static here so store.c stays self-contained.
+ *
+ * out receives the 12 characters plus a terminating NUL. Returns 0, or -1
+ * when getentropy fails (errno as left by getentropy).
+ */
+static int layer_stage_rand_suffix(char out[13])
+{
+    uint8_t raw[6];
+    if (getentropy(raw, sizeof(raw)) < 0)
+        return -1;
+    static const char hex[] = "0123456789abcdef";
+    for (size_t i = 0; i < sizeof(raw); i++) {
+        /* High nibble first, then low nibble. */
+        out[i * 2] = hex[raw[i] >> 4];
+        out[i * 2 + 1] = hex[raw[i] & 0xf];
+    }
+    out[12] = '\0';
+    return 0;
+}
+
+/* Format a fresh staging path for diff_id into out (cap bytes):
+ * "<root>/layers/.staging/<algo>-<hex>-<suffix>", where <suffix> is the 12
+ * random hex chars from layer_stage_rand_suffix. Nothing is created on disk
+ * by this function. Returns 0 on success, -1 with errno set: EINVAL (NULL
+ * store / buffer, zero cap, bad digest), ENAMETOOLONG, or the getentropy
+ * failure.
+ */
+int oci_store_layer_stage_path(oci_store_t *s,
+                               const char *diff_id,
+                               char *out,
+                               size_t cap)
+{
+    if (!s || !out || cap == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    oci_digest_algo_t algo;
+    char hex[OCI_DIGEST_HEX_MAX + 1];
+    const char *algo_name = NULL;
+    if (parse_digest_for_cache_dir(diff_id, &algo, hex, &algo_name) < 0)
+        return -1;
+    char rand_suffix[13];
+    if (layer_stage_rand_suffix(rand_suffix) < 0)
+        return -1;
+    int n = snprintf(out, cap, "%s/layers/.staging/%s-%s-%s", s->root,
+                     algo_name, hex, rand_suffix);
+    if (n < 0 || (size_t) n >= cap) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    return 0;
+}
+
+/* Recursively rm a path (file, symlink, or directory). Mirrors the discipline
+ * in src/oci/clone-rootfs.c so the layer stage abort path does not shell out.
+ * Returns 0 on success or when path was already absent; -1 with errno set on
+ * any unexpected IO error. Designed for staging cleanup, not as a general-
+ * purpose rm.
+ */ +static int layer_stage_rm(const char *path) +{ + struct stat st; + if (lstat(path, &st) < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + if (!S_ISDIR(st.st_mode)) + return unlink(path); + DIR *d = opendir(path); + if (!d) + return -1; + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[STORE_PATH_MAX]; + int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name); + if (n < 0 || (size_t) n >= sizeof(child)) { + errno = ENAMETOOLONG; + rc = -1; + break; + } + if (layer_stage_rm(child) < 0) { + rc = -1; + break; + } + } + closedir(d); + if (rc == 0 && rmdir(path) < 0) + rc = -1; + return rc; +} + +int oci_store_layer_commit(oci_store_t *s, + const char *stage_path, + const char *diff_id, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!s || !stage_path || !*stage_path) { + *err = "layer_commit: NULL argument"; + errno = EINVAL; + return -1; + } + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + const char *algo_name = NULL; + if (parse_digest_for_cache_dir(diff_id, &algo, hex, &algo_name) < 0) { + *err = "layer_commit: invalid diff_id"; + return -1; + } + char dest[STORE_PATH_MAX]; + int n = snprintf(dest, sizeof(dest), "%s/layers/%s/%s", s->root, algo_name, + hex); + if (n < 0 || (size_t) n >= sizeof(dest)) { + *err = "layer_commit: dest path exceeds STORE_PATH_MAX"; + errno = ENAMETOOLONG; + return -1; + } + if (rename(stage_path, dest) == 0) + return 0; + int saved = errno; + if (saved == EEXIST || saved == ENOTEMPTY) { + /* Concurrent writer landed the same entry first; drop the loser's + * staging tree and treat this as a benign success. The cache content + * is content-addressed so the winning entry is byte-equivalent. 
+ */ + (void) layer_stage_rm(stage_path); + errno = 0; + return 0; + } + *err = "layer_commit: rename to layers/// failed"; + errno = saved; + return -1; +} + +/* --- Plan 3 C3.3c: ChainID stack cache helpers ------------------------- */ + +int oci_store_stack_has(oci_store_t *s, const char *chain_id) +{ + if (!s) { + errno = EINVAL; + return -1; + } + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + const char *algo_name = NULL; + if (parse_digest_for_cache_dir(chain_id, &algo, hex, &algo_name) < 0) + return -1; + + char path[STORE_PATH_MAX]; + int n = snprintf(path, sizeof(path), "%s/layers/stacks/%s/%s", s->root, + algo_name, hex); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + struct stat st; + if (stat(path, &st) < 0) { + if (errno == ENOENT) + return 0; + return -1; + } + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return -1; + } + return 1; +} + +int oci_store_stack_resolve(oci_store_t *s, + const char *chain_id, + char *out, + size_t cap) +{ + if (!s || !out || cap == 0) { + errno = EINVAL; + return -1; + } + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + const char *algo_name = NULL; + if (parse_digest_for_cache_dir(chain_id, &algo, hex, &algo_name) < 0) + return -1; + int n = + snprintf(out, cap, "%s/layers/stacks/%s/%s/", s->root, algo_name, hex); + if (n < 0 || (size_t) n >= cap) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +int oci_store_stack_stage_path(oci_store_t *s, + const char *chain_id, + char *out, + size_t cap) +{ + if (!s || !out || cap == 0) { + errno = EINVAL; + return -1; + } + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + const char *algo_name = NULL; + if (parse_digest_for_cache_dir(chain_id, &algo, hex, &algo_name) < 0) + return -1; + char rand_suffix[13]; + if (layer_stage_rand_suffix(rand_suffix) < 0) + return -1; + /* The "stack-" prefix keeps stack stage paths visually distinct from + * per-layer raw cache stage paths inside the 
shared .staging/ dir. + * Commit publishes to a different destination tree so the prefix is + * purely a debug aid; the rename is what actually disambiguates the + * two artifact families. + */ + int n = snprintf(out, cap, "%s/layers/.staging/stack-%s-%s-%s", s->root, + algo_name, hex, rand_suffix); + if (n < 0 || (size_t) n >= cap) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +int oci_store_stack_commit(oci_store_t *s, + const char *stage_path, + const char *chain_id, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!s || !stage_path || !*stage_path) { + *err = "stack_commit: NULL argument"; + errno = EINVAL; + return -1; + } + oci_digest_algo_t algo; + char hex[OCI_DIGEST_HEX_MAX + 1]; + const char *algo_name = NULL; + if (parse_digest_for_cache_dir(chain_id, &algo, hex, &algo_name) < 0) { + *err = "stack_commit: invalid chain_id"; + return -1; + } + char dest[STORE_PATH_MAX]; + int n = snprintf(dest, sizeof(dest), "%s/layers/stacks/%s/%s", s->root, + algo_name, hex); + if (n < 0 || (size_t) n >= sizeof(dest)) { + *err = "stack_commit: dest path exceeds STORE_PATH_MAX"; + errno = ENAMETOOLONG; + return -1; + } + if (rename(stage_path, dest) == 0) + return 0; + int saved = errno; + if (saved == EEXIST || saved == ENOTEMPTY) { + /* Concurrent writer landed the same entry first; drop the loser's + * staging tree and treat this as a benign success. Stack snapshots + * are content-addressed via ChainID so the winning entry is byte- + * equivalent. + */ + (void) layer_stage_rm(stage_path); + errno = 0; + return 0; + } + *err = "stack_commit: rename to layers/stacks/// failed"; + errno = saved; + return -1; +} + +/* --- Plan 3 C3.3b: layer cache schema marker --------------------------- */ + +/* Relative path of the schema marker beneath the store root. */ +static const char LAYER_SCHEMA_REL_PATH[] = "layers/.schema"; + +/* Schema version this build writes and accepts. 
v1 was the implicit
+ * C3.2 cumulative-by-diff_id layout (no marker). v2 will be the C3.3
+ * raw per-layer payload plus ChainID stack cache. C3.3b lands the
+ * marker + migration; C3.3c will rewrite the unpack assembly to
+ * populate and consume the v2 cache.
+ */
+#define LAYER_SCHEMA_VERSION_CURRENT 2
+
+/* Exact bytes written to the marker file on first migration to v2: one
+ * single-line JSON object followed by a newline. The description field is
+ * informational; readers only key on schemaVersion. The trailing newline
+ * matches the oci-layout marker convention. The schemaVersion literal in
+ * the string must be kept in step with LAYER_SCHEMA_VERSION_CURRENT by
+ * hand; nothing derives one from the other. */
+static const char LAYER_SCHEMA_V2_BODY[] =
+    "{\"schemaVersion\":2,"
+    "\"description\":\"raw per-layer payload + ChainID stack cache\"}\n";
+
+/* Upper bound, in bytes, on the marker file size. The expected body is
+ * ~80 bytes; anything larger is treated as a malformed marker rather than
+ * parsed. The reader also sizes its stack buffer from this value.
+ */
+#define LAYER_SCHEMA_MAX_BYTES 4096
+
+/* Counter used for the marker's tmp file suffix; kept distinct from the
+ * oci-layout counter so the two helpers do not contend on the same
+ * monotonic source. Each call atomically increments a function-local
+ * static and returns the incremented value, so the first call yields 1.
+ */
+static unsigned long layer_schema_seq(void)
+{
+    static unsigned long n = 0;
+    return __sync_add_and_fetch(&n, 1);
+}
+
+/* Read /layers/.schema and extract the schemaVersion field. On
+ * success returns 0 and writes the parsed integer to *out_version. On
+ * failure returns -1 with errno set (ENOENT when the marker is absent;
+ * EINVAL for unparseable JSON, missing schemaVersion field, wrong type,
+ * non-regular file, or size out of range; other errno values propagated
+ * from open / read / fstat). When non-NULL, *out_reason is populated on
+ * the -1 paths with a static description for the caller to surface via
+ * stderr.
+ */
+static int read_layer_schema_version(const char *root,
+                                     int *out_version,
+                                     const char **out_reason)
+{
+    if (out_reason)
+        *out_reason = NULL;
+    char path[STORE_PATH_MAX];
+    int n = snprintf(path, sizeof(path), "%s/%s", root, LAYER_SCHEMA_REL_PATH);
+    if (n < 0 || (size_t) n >= sizeof(path)) {
+        if (out_reason)
+            *out_reason = "layers/.schema path exceeds STORE_PATH_MAX";
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    int fd = open(path, O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        /* errno from open is left untouched so the caller can tell the
+         * absent-marker case (ENOENT) from a real IO failure. */
+        if (errno == ENOENT) {
+            if (out_reason)
+                *out_reason = "layers/.schema absent";
+            return -1;
+        }
+        if (out_reason)
+            *out_reason = "open layers/.schema failed";
+        return -1;
+    }
+    struct stat st;
+    if (fstat(fd, &st) < 0) {
+        int saved = errno;
+        close(fd);
+        if (out_reason)
+            *out_reason = "fstat layers/.schema failed";
+        errno = saved;
+        return -1;
+    }
+    if (!S_ISREG(st.st_mode)) {
+        close(fd);
+        if (out_reason)
+            *out_reason = "layers/.schema is not a regular file";
+        errno = EINVAL;
+        return -1;
+    }
+    if (st.st_size <= 0 || st.st_size > LAYER_SCHEMA_MAX_BYTES) {
+        close(fd);
+        if (out_reason)
+            *out_reason = "layers/.schema size out of range";
+        errno = EINVAL;
+        return -1;
+    }
+    /* The size check above bounds want by LAYER_SCHEMA_MAX_BYTES, so the
+     * buffer always has room for the body plus the terminating NUL. */
+    char buf[LAYER_SCHEMA_MAX_BYTES + 1];
+    size_t want = (size_t) st.st_size;
+    size_t have = 0;
+    int read_errno = 0;
+    while (have < want) {
+        ssize_t got = read(fd, buf + have, want - have);
+        if (got < 0) {
+            if (errno == EINTR)
+                continue; /* interrupted before any data: retry */
+            read_errno = errno;
+            break;
+        }
+        if (got == 0)
+            break; /* EOF before st_size bytes: file shrank under us */
+        have += (size_t) got;
+    }
+    close(fd);
+    if (have != want) {
+        if (out_reason)
+            *out_reason = "read layers/.schema failed";
+        /* Propagate the real read errno; EIO only stands in for a short
+         * file, where read itself reported no error. */
+        errno = read_errno ? read_errno : EIO;
+        return -1;
+    }
+    buf[have] = '\0';
+    cJSON *json = cJSON_Parse(buf);
+    if (!json) {
+        if (out_reason)
+            *out_reason = "layers/.schema JSON parse failed";
+        errno = EINVAL;
+        return -1;
+    }
+    cJSON *v = cJSON_GetObjectItemCaseSensitive(json, "schemaVersion");
+    if (!cJSON_IsNumber(v)) {
+        cJSON_Delete(json);
+        if (out_reason)
+            *out_reason =
+                "layers/.schema schemaVersion missing or not a number";
+        errno = EINVAL;
+        return -1;
+    }
+    /* valueint is a truncated (and saturated) copy of valuedouble. A value
+     * such as 2.9, or one outside the int range, must not be mistaken for
+     * a supported version, so only exact integers are accepted. */
+    if (v->valuedouble != (double) v->valueint) {
+        cJSON_Delete(json);
+        if (out_reason)
+            *out_reason = "layers/.schema schemaVersion is not an integer";
+        errno = EINVAL;
+        return -1;
+    }
+    int version = v->valueint;
+    cJSON_Delete(json);
+    *out_version = version;
+    return 0;
+}
+
+/* Recursively remove every direct child of /layers/sha256/,
leaving
+ * the layers/sha256/ directory itself in place. Each child is dispatched
+ * through layer_stage_rm so symlinks, regular files, and nested
+ * directories are all handled identically. On the first IO failure the
+ * call returns -1 with errno preserved; *out_removed reflects the entry
+ * count successfully removed before the error. layer_stage_rm is
+ * ENOENT-tolerant so a partial wipe resumes cleanly on a later open.
+ * A missing layers/sha256/ directory is not an error: the call returns 0
+ * with *out_removed == 0. A readdir failure is reported like any other IO
+ * failure rather than being mistaken for end-of-directory.
+ */
+static int wipe_layers_sha256(const char *root, size_t *out_removed)
+{
+    *out_removed = 0;
+    char dir_path[STORE_PATH_MAX];
+    int n = snprintf(dir_path, sizeof(dir_path), "%s/layers/sha256", root);
+    if (n < 0 || (size_t) n >= sizeof(dir_path)) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    DIR *d = opendir(dir_path);
+    if (!d) {
+        if (errno == ENOENT)
+            return 0;
+        return -1;
+    }
+    int rc = 0;
+    int saved = 0;
+    for (;;) {
+        /* readdir returns NULL both at end-of-directory and on error; only
+         * a cleared errno lets the two cases be told apart. */
+        errno = 0;
+        struct dirent *de = readdir(d);
+        if (!de) {
+            if (errno != 0) {
+                saved = errno;
+                rc = -1;
+            }
+            break;
+        }
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        char child[STORE_PATH_MAX];
+        int cn = snprintf(child, sizeof(child), "%s/%s", dir_path, de->d_name);
+        if (cn < 0 || (size_t) cn >= sizeof(child)) {
+            saved = ENAMETOOLONG;
+            rc = -1;
+            break;
+        }
+        if (layer_stage_rm(child) < 0) {
+            saved = errno;
+            rc = -1;
+            break;
+        }
+        (*out_removed)++;
+    }
+    /* closedir may overwrite errno; restore the captured failure cause so
+     * the "errno preserved" contract above actually holds. */
+    closedir(d);
+    if (rc < 0)
+        errno = saved;
+    return rc;
+}
+
+/* Write /layers/.schema atomically. The body is materialized into a
+ * pid + counter-suffixed tmp file, fsynced, and renamed into place. The
+ * caller must hold flock(/index.json.lock, LOCK_EX) so two openers
+ * cannot race the rename. Returns 0 on success, -1 with errno preserved
+ * on any IO failure (the tmp file is removed before returning).
+ */ +static int write_layer_schema_v2(const char *root) +{ + char path[STORE_PATH_MAX]; + int n = snprintf(path, sizeof(path), "%s/%s", root, LAYER_SCHEMA_REL_PATH); + if (n < 0 || (size_t) n >= sizeof(path)) { + errno = ENAMETOOLONG; + return -1; + } + char tmp[STORE_PATH_MAX]; + n = snprintf(tmp, sizeof(tmp), "%s.tmp-%d-%lu", path, (int) getpid(), + layer_schema_seq()); + if (n < 0 || (size_t) n >= sizeof(tmp)) { + errno = ENAMETOOLONG; + return -1; + } + int fd = open(tmp, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (fd < 0) + return -1; + size_t body_len = sizeof(LAYER_SCHEMA_V2_BODY) - 1; + if (write(fd, LAYER_SCHEMA_V2_BODY, body_len) != (ssize_t) body_len) { + int saved = errno; + close(fd); + unlink(tmp); + errno = saved; + return -1; + } + if (fsync(fd) < 0) { + int saved = errno; + close(fd); + unlink(tmp); + errno = saved; + return -1; + } + if (close(fd) < 0) { + int saved = errno; + unlink(tmp); + errno = saved; + return -1; + } + if (rename(tmp, path) < 0) { + int saved = errno; + unlink(tmp); + errno = saved; + return -1; + } + return 0; +} + +static int ensure_layer_schema_marker(const char *root) +{ + /* First probe is lock-free so the marker-present fast path does not + * pay the flock cost on every open. */ + int version = 0; + const char *reason = NULL; + int rc = read_layer_schema_version(root, &version, &reason); + if (rc == 0) { + if (version == LAYER_SCHEMA_VERSION_CURRENT) + return 0; + fprintf(stderr, + "elfuse oci: unsupported layers schema version %d at %s; " + "this build understands up to %d\n", + version, root, LAYER_SCHEMA_VERSION_CURRENT); + errno = EINVAL; + return -1; + } + if (errno != ENOENT) { + int saved = errno; + fprintf(stderr, "elfuse oci: layers/.schema unreadable at %s: %s\n", + root, reason ? reason : "unknown error"); + errno = saved; + return -1; + } + + /* Marker absent. 
ELFUSE_OCI_NO_MIGRATE leaves the store untouched so + * a downgrade test or recovery workflow can inspect any pre-existing + * v1 entries without the daemon helpfully rewriting state. */ + const char *no_migrate = getenv(NO_MIGRATE_ENV); + if (no_migrate && *no_migrate) + return 0; + + const char *lock_err = NULL; + int lock_fd = acquire_index_lock(root, &lock_err); + if (lock_fd < 0) { + int saved = errno; + fprintf(stderr, "elfuse oci: %s\n", + lock_err ? lock_err : "failed to acquire index.json.lock"); + errno = saved; + return -1; + } + + /* Re-stat under hold: a racing opener may already have migrated. */ + rc = read_layer_schema_version(root, &version, &reason); + if (rc == 0) { + close(lock_fd); + if (version == LAYER_SCHEMA_VERSION_CURRENT) + return 0; + fprintf(stderr, + "elfuse oci: unsupported layers schema version %d at %s; " + "this build understands up to %d\n", + version, root, LAYER_SCHEMA_VERSION_CURRENT); + errno = EINVAL; + return -1; + } + if (errno != ENOENT) { + int saved = errno; + close(lock_fd); + fprintf(stderr, "elfuse oci: layers/.schema unreadable at %s: %s\n", + root, reason ? reason : "unknown error"); + errno = saved; + return -1; + } + + /* Marker still absent under hold: wipe pre-existing v1 entries and + * publish the v2 marker. */ + size_t removed = 0; + if (wipe_layers_sha256(root, &removed) < 0) { + int saved = errno; + close(lock_fd); + fprintf(stderr, + "elfuse oci: failed to migrate layer cache schema at %s: " + "wipe partial (%zu entr%s removed before error)\n", + root, removed, removed == 1 ? "y" : "ies"); + errno = saved; + return -1; + } + if (removed > 0) { + fprintf(stderr, + "elfuse oci: layer cache schema v1 detected; cleared %zu " + "entr%s from %s/layers/sha256 to migrate to v2 " + "(raw + ChainID stack)\n", + removed, removed == 1 ? 
"y" : "ies", root); + } + + if (write_layer_schema_v2(root) < 0) { + int saved = errno; + close(lock_fd); + fprintf(stderr, "elfuse oci: failed to write layers/.schema at %s\n", + root); + errno = saved; + return -1; + } + close(lock_fd); + return 0; +} diff --git a/src/oci/store.h b/src/oci/store.h new file mode 100644 index 0000000..805b628 --- /dev/null +++ b/src/oci/store.h @@ -0,0 +1,532 @@ +/* Local OCI image store: blobs + tag-to-digest pinning + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Wraps the slice-2 content-addressable blob store with a tag-to-digest pin + * table so that elfuse oci pull / inspect can reproduce a pull by name. The + * on-disk layout under follows the OCI image-layout spec (v1.0.0) so + * external tools (skopeo, umoci, crane) can consume the store directly: + * + * oci-layout OCI image-layout 1.0.0 marker + * index.json OCI image-index of all pins + * index.json.lock flock target for serializing + * writers blobs// finalized blob (immutable) + * tmp/blob---XXXXXX in-flight blob staging + * + * Each pin is one descriptor in index.json's manifests[] array. The pin name + * (canonical "/:") is stored in the descriptor's + * org.opencontainers.image.ref.name annotation. The descriptor's mediaType, + * digest, and size mirror the manifest blob in blobs//. Writers + * serialize through flock(/index.json.lock, LOCK_EX) and publish via + * tmp + rename so a concurrent reader always observes a complete document. + * Readers parse the snapshot lock-free: rename is atomic and cJSON consumes + * the file in one open + read. + * + * Pre-C2.2 stores used a refs/// flat-file layout + * instead of index.json. oci_store_open auto-migrates such stores: refs/ is + * scanned recursively, every well-formed pin file becomes a manifests[] + * descriptor in a freshly written index.json, and refs/ is kept on disk for + * one release so a downgrade still finds the legacy data. 
Pins whose + * manifest blob is missing from blobs/ are skipped with a warning rather + * than aborting the open. Migration is suppressed when the environment + * variable ELFUSE_OCI_NO_MIGRATE is set to any non-empty value, which lets + * a downgrade test or recovery workflow inspect the legacy layout. + * + * Phase 1 keeps as a plain directory. The sparse case-sensitive APFS + * volume bootstrap (oci-roadmap Q1) is a Phase 2 concern; the volume mount + * point will sit at the same default path so this API does not change. + */ + +#pragma once + +#include +#include +#include + +#include "blob-store.h" +#include "digest-set.h" +#include "ref.h" + +typedef struct oci_store oci_store_t; + +/* One pin entry produced by oci_store_list_refs. name is the canonical + * "/:" string captured from the descriptor's + * org.opencontainers.image.ref.name annotation; digest is the manifest + * digest in ":" form. Both fields are heap-allocated and owned + * by the enclosing oci_pin_list_t. + */ +typedef struct { + char *name; + char *digest; +} oci_pin_entry_t; + +typedef struct { + oci_pin_entry_t *items; + size_t count; +} oci_pin_list_t; + +/* Open or create the store rooted at `root`. Ensures blobs//, tmp/, + * and the OCI image-layout 1.0.0 marker exist. Marker writes are idempotent: + * a pre-existing oci-layout file is never rewritten so a third party that + * bumped the imageLayoutVersion is preserved. The index.json file is not + * materialized until the first oci_store_put_ref so an empty store stays + * literally empty on disk. + * + * If the root contains a pre-C2.2 refs/ tree but no index.json, this call + * also rebuilds index.json from refs/ contents under flock(index.json.lock, + * LOCK_EX) so a concurrent put_ref cannot race the migration. refs/ is + * preserved on disk for a downgrade fallback. Set ELFUSE_OCI_NO_MIGRATE to + * any non-empty value to skip the migration on this open. 
Migration failure + * is propagated through this call (NULL return with errno preserved); per-pin + * issues (missing blob, malformed pin file) are logged to stderr and skipped + * without failing the open. + * + * The Plan 3 C3.3b layer cache schema marker at /layers/.schema is + * also written or validated here. A v1 store (no marker but an existing + * layers/sha256/ subtree from C3.2) has its layers/sha256/ children wiped + * before the v2 marker is published; the wipe is scoped to layers/sha256/ + * only so blobs/, images/, refs/, index.json, and layers/.staging/ are + * never touched. A marker whose schemaVersion is unknown to this build + * (forward incompatibility, corruption, or an experimental schema) is + * fatal and returns NULL with errno=EINVAL. ELFUSE_OCI_NO_MIGRATE gates + * this migration too: when set and the marker is absent, the wipe and + * the marker write are both skipped so a downgrade test can inspect any + * v1 entries on disk. + * + * Returns NULL on failure with errno preserved. + */ +oci_store_t *oci_store_open(const char *root); + +/* Close the store handle. Does not delete on-disk state. Safe on NULL. */ +void oci_store_close(oci_store_t *s); + +/* Return the store root path. The returned pointer is owned by the store and + * is valid until oci_store_close. + */ +const char *oci_store_root(const oci_store_t *s); + +/* Return the underlying blob store handle. The returned pointer is owned by + * the store; do not close it directly. + */ +oci_blob_store_t *oci_store_blobs(oci_store_t *s); + +/* Return the default store root for the current user. macOS XDG-ish: + * $XDG_DATA_HOME/elfuse/store when XDG_DATA_HOME is set + * $HOME/Library/Application Support/elfuse/store otherwise + * Returns a heap-allocated string the caller must free, or NULL on env miss + * (errno=ENOENT) or oom (errno=ENOMEM). + */ +char *oci_store_default_root(void); + +/* Upsert a tag-to-digest pin for ref. 
ref->tag must be set; digest-only refs + * are self-pinning by their digest field and putting a pin for them is an + * EINVAL. digest_str is the canonical ":" form of the manifest + * digest captured at pull time; the manifest blob must already be present + * under /blobs// so the descriptor's size and mediaType + * can be derived from the on-disk blob (mediaType is read from the JSON + * body and inferred from structure when absent). + * + * Concurrency: the write is serialized by flock(/index.json.lock, + * LOCK_EX) and published via tmp + rename so a concurrent reader never + * observes a partial index.json. Re-pinning the same canonical name + * replaces the existing descriptor in place rather than appending a + * duplicate entry. + * + * Returns 0 on success, -1 with errno preserved and *err_msg (when non-NULL) + * pointing at a static description on failure. + */ +int oci_store_put_ref(oci_store_t *s, + const oci_ref_t *ref, + const char *digest_str, + const char **err_msg); + +/* Read the pinned manifest digest for ref. ref->tag must be set; digest-only + * refs are self-pinning and trigger EINVAL. On hit returns 0 and writes a + * heap-allocated ":" string into *out_digest (caller frees). On + * miss returns -1 with errno=ENOENT and *out_digest=NULL. Other IO errors + * return -1 with errno preserved. *err_msg (when non-NULL) is populated on + * any non-success path. + * + * The read is lock-free: tmp + rename in oci_store_put_ref makes the + * index.json switch atomic so a single open + read snapshots a complete + * document. + */ +int oci_store_get_ref(oci_store_t *s, + const oci_ref_t *ref, + char **out_digest, + const char **err_msg); + +/* Enumerate every pin currently recorded in index.json. On success returns + * 0 and populates *out with a heap-allocated array of (name, digest) + * entries; an empty store yields count == 0 and items == NULL. The caller + * releases the result via oci_pin_list_free. 
Missing index.json is treated + * as an empty store, not as an error. Other IO or schema errors return -1 + * with errno preserved and *err_msg (when non-NULL) populated. + * + * The order of returned entries matches the order in index.json; callers + * that need a stable sort must impose it themselves. + */ +int oci_store_list_refs(oci_store_t *s, + oci_pin_list_t *out, + const char **err_msg); + +/* Release every name / digest string in list and zero the struct. Safe on + * a zero-initialised list and on NULL. + */ +void oci_pin_list_free(oci_pin_list_t *list); + +/* Mark phase of the Plan 1 garbage collector: enumerate every blob + * digest still reachable from on-disk state and accumulate them in + * *out. Two sources are walked: + * + * 1. Pins in index.json. For each pin's manifest digest, the + * manifest blob is read and parsed; the config descriptor and + * every layer descriptor are added to the set. If the pinned + * blob is an OCI image-index instead of an image manifest, every + * sub-manifest descriptor digest is added, and for each + * sub-manifest whose blob is on disk the walk recurses into its + * config + layers. Sub-manifests not on disk (the multi-arch + * case where only one platform was fetched) are still added to + * the keep set so a sweep does not delete a sub-manifest blob + * that did materialise locally. + * + * 2. Unpacked image trees under /images/sha256-/. + * Each tree's .elfuse-origin.json is parsed; its manifest_digest + * drives the same manifest/config/layer expansion as a pin. + * + * Failure policy is fail-fast on anything that would let prune later + * delete a reachable blob: a missing manifest blob, an unparseable + * manifest, a missing or malformed .elfuse-origin.json, or a missing + * image-config blob all return -1 with *err populated so the operator + * can repair the store before retrying. A missing + * /images/ tree is treated as the fresh-store case + * (count == 0 contribution from that source) and not an error. 
+ * + * volume_root may be NULL, in which case the unpacked-tree walk is + * skipped entirely. The pin walk runs unconditionally. + * + * Returns 0 on success; on failure returns -1 with errno preserved + * and *err (when non-NULL) pointing at a static description. On + * failure *out is left in a freed-empty state. + */ +int oci_store_collect_roots(oci_store_t *s, + oci_digest_set_t *out, + const char *volume_root, + const char **err); + +/* Plan 3 C3.3d mark walker for the layer + stack caches. Computes the + * reachable set of layer raw-cache and stack-cache entries from the + * same two sources oci_store_collect_roots reads: pins in index.json + * (resolved through one image-index level into a linux/arm64 + * sub-manifest as needed) and unpacked sysroots under + * /images/. For every image the walker can resolve: + * + * - Every layer's diff_id (from rootfs.diff_ids in the image-config + * blob for pinned images, or directly from .elfuse-origin.json's + * layer_diffids for unpacked sysroots) is added to *out_diff_ids. + * These name the entries under /layers///. + * + * - Every prefix ChainID through the layer list is added to + * *out_chain_ids. ChainID(L0) == DiffID(L0); ChainID(Li) == + * sha256(" "). oci_unpack writes one stack + * snapshot per prefix during the apply loop (src/oci/unpack.c + * line 1063), so a prune sweep must keep every prefix that maps + * to a reachable image, not only the terminating chain. These + * name the entries under /layers/stacks///. + * + * Both sets are populated by the same walker pass so they stay + * consistent across the two sources. + * + * Failure policy mirrors oci_store_collect_roots: a missing or + * unparseable image-config blob for a pinned image-manifest, a + * malformed origin sidecar, or a chainid_compute failure aborts the + * mark phase with -1 / errno set so prune cannot proceed to delete + * reachable cache entries. 
Soft cases (image-index pins whose + * linux/arm64 sub-manifest blob is not on disk, image-index pins + * with no linux/arm64 entry at all, a missing /images/ + * directory) contribute nothing without surfacing as errors so a + * multi-arch operator stays unblocked. + * + * volume_root may be NULL, in which case only the pin source is + * walked. On entry *out_diff_ids and *out_chain_ids are + * initialised so the caller may pass uninitialised structs. On + * failure both sets are freed back to empty. + */ +int oci_store_collect_layer_roots(oci_store_t *s, + oci_digest_set_t *out_diff_ids, + oci_digest_set_t *out_chain_ids, + const char *volume_root, + const char **err); + +/* Options + stats for oci_store_prune. Output fields are filled + * regardless of dry-run vs commit so callers can render a uniform + * report from the same struct. + * + * older_than_sec and keep_bytes shape which dangling entries survive + * the sweep. Both default to 0 with the documented meaning of "no + * filter" so the C1.3 behaviour (every dangling blob is pruned) is + * preserved when the caller does not opt in. + * + * older_than_sec > 0 vetoes per-entry: a dangling entry whose mtime + * is younger than (now - older_than_sec) is reported in the + * skipped_* family and left on disk. This is the grace window for a + * half-completed pull whose blob has been committed but whose + * put_ref has not landed yet, or for a layer/stack cache entry an + * unpack is still publishing. + * + * keep_bytes > 0 enforces a per-family LRU budget over the + * candidates that survive the older-than veto: candidates are + * sorted by mtime ascending and walked newest-first, the newest + * entries whose cumulative size fits under keep_bytes are + * reclassified as skipped, the rest stay pruned. A single + * candidate that does not fit the budget terminates the keep walk + * so older candidates are always evicted first even when an older + * entry would fit alone. 
The budget is applied independently to + * each cache family (blobs, layers, stacks) so a fat blob cannot + * crowd out a layer-cache eviction. + * + * The two filters compose by running older-than first and keep-bytes + * second per family: a transient just-pulled entry never enters the + * LRU budget computation so the grace window holds. + * + * The Plan 3 C3.3d extension introduces per-layer and per-stack + * counters that behave just like the blob counters: kept entries + * survive the mark phase, pruned entries are unreachable and (when + * commit is true) recursively removed, skipped entries were + * unreachable but the filters spared them. layer entries live under + * /layers/// and stack entries under + * /layers/stacks///; both are directory trees so + * the *_bytes counters are the recursive st_size sum of every + * regular file beneath the entry directory. + */ +typedef struct { + /* Inputs */ + bool commit; /* false (default) = dry-run; true = unlink */ + const char *volume_root; /* NULL = pin-only walk (see collect_roots) */ + uint64_t older_than_sec; /* 0 = no mtime filter */ + uint64_t keep_bytes; /* 0 = no size budget (no filter) */ + + /* Outputs - blobs */ + size_t kept_blobs; + size_t pruned_blobs; + uint64_t pruned_bytes; + size_t + skipped_blobs; /* dangling but spared by older_than_sec or keep_bytes */ + uint64_t skipped_bytes; /* sum of st_size for skipped_blobs */ + + /* Outputs - per-layer raw cache entries (C3.3d) */ + size_t kept_layers; + size_t pruned_layers; + uint64_t pruned_layer_bytes; + size_t skipped_layers; + uint64_t skipped_layer_bytes; + + /* Outputs - ChainID-keyed stack cache entries (C3.3d) */ + size_t kept_stacks; + size_t pruned_stacks; + uint64_t pruned_stack_bytes; + size_t skipped_stacks; + uint64_t skipped_stack_bytes; +} oci_store_prune_options_t; + +/* Garbage-collect dangling entries from three cache families under + * the store root. 
The mark phase pairs oci_store_collect_roots (blob
+ * keep set) with oci_store_collect_layer_roots (diff_id and ChainID
+ * keep sets) so all three sweeps share a single consistent snapshot
+ * of pins + unpacked sysroots:
+ *
+ *   - <root>/blobs/<algo>/<hex>           regular files (Plan 1)
+ *   - <root>/layers/<algo>/<hex>/         raw layer dirs (C3.3d)
+ *   - <root>/layers/stacks/<algo>/<hex>/  stack snapshot dirs (C3.3d)
+ *
+ * The sweep phase walks each family in turn, comparing every entry's
+ * <algo>:<hex> against its family's keep set; entries not reachable
+ * are counted into the family's pruned_* counters and (when
+ * opts->commit is true) removed (unlink for blobs, recursive rm for
+ * layer / stack directory trees). pruned_layer_bytes and
+ * pruned_stack_bytes hold the recursive sum of regular-file st_size
+ * beneath each removed directory.
+ *
+ * The whole operation runs under flock(<root>/index.json.lock,
+ * LOCK_EX), which is the same write lock oci_store_put_ref holds.
+ * That bounds the race where a concurrent pull writes a new pin
+ * after the mark phase already snapshotted index.json. Layer / stack
+ * cache writers (oci_unpack / oci_rebuild_cache) do NOT take this
+ * lock so they may publish new cache entries concurrent with prune;
+ * those entries are reachable from their image's pin or unpacked
+ * sysroot, both of which the mark phase already captured, so their
+ * diff_id / ChainID is in the keep set even if the directory did
+ * not yet exist when sweep enumerated. The remaining window is the
+ * mid-pull case (a layer extracted before put_ref lands) which
+ * matches the C1.3 blob-mid-pull semantic: the operator retries.
+ *
+ * On entry every counter in opts is reset to zero so the caller does
+ * not have to memset between invocations. Entries whose name is not
+ * a valid lowercase hex digest for the enclosing algorithm subdir,
+ * non-directory entries inside layers/, and dotfiles are all skipped
+ * without surfacing as errors so any foreign state under the store
+ * root stays untouched.
+ *
+ * When opts->older_than_sec or opts->keep_bytes is set, each family
+ * gathers dangling candidates first and then applies the filters
+ * (older-than veto, then keep-bytes LRU budget) before any removal.
+ * The keep-bytes budget is independent per family: a 100 MiB budget
+ * means "keep up to 100 MiB of newest dangling blobs AND up to
+ * 100 MiB of newest dangling layer trees AND up to 100 MiB of
+ * newest dangling stack trees", so a fat blob cannot crowd a layer
+ * eviction off the budget. Candidates spared by either filter
+ * contribute to skipped_* rather than pruned_* so the caller can
+ * render a three-way kept/pruned/skipped split per family.
+ *
+ * Returns 0 on success and -1 on failure with errno preserved and
+ * *err (when non-NULL) populated. Mark-phase failure is fatal and
+ * aborts before any removal so a corrupt or torn manifest / config
+ * blob cannot cause prune to delete reachable entries.
+ */
+int oci_store_prune(oci_store_t *s,
+                    oci_store_prune_options_t *opts,
+                    const char **err);
+
+/* Plan 3 C3.2: per-layer unpack snapshot cache.
+ *
+ * Layer caches live under <root>/layers/<algo>/<hex>/ in the same content-
+ * addressed shape as <root>/blobs/<algo>/<hex>. Each cache directory holds a
+ * snapshot of the unpack stage_dir state immediately after applying that
+ * layer's tar payload. clonefile(2) populates and consumes the snapshots so
+ * the cache and the live unpack stage must live on the same APFS volume; an
+ * EXDEV during snapshot is propagated as a hard error rather than silently
+ * falling back to a copy.
+ *
+ * Cache semantics are CUMULATIVE: the directory at layers/sha256/<hex>/
+ * holds the stage_dir state assembled by the unpacker WHEN this layer was
+ * applied, which means it includes every prior layer's contribution along
+ * with the current layer. A second unpack of the same image short-circuits
+ * the extract loop entirely.
Cross-image dedup (two images sharing a base
+ * layer prefix but diverging upstream) is NOT yet correct under this scheme;
+ * Plan 3 C3.3 introduces raw-tar staging + clonefile-stack assembly to fix
+ * the cross-image case. C3.2 lands the directory layout, the path helpers,
+ * and the per-image fast path the C3.3 rewrite will build on.
+ *
+ * C3.2 deliberately does NOT extend oci_store_collect_roots / oci_store_prune
+ * to walk layers/. The cache grows monotonically until C3.5's
+ * `oci image rebuild-cache` (or C3.3's prune work) consumes it. Skipping the
+ * keep-set walk now keeps this commit focused on the layout invariants.
+ * (The C3.3d extension described on oci_store_prune has since added that
+ * walk for both layers/ and layers/stacks/.)
+ *
+ * No refcount sidecar is written. Reachability is recomputed at GC time from
+ * each manifest's image-config rootfs.diff_ids list, mirroring how blobs/
+ * reachability is recomputed by oci_store_collect_roots.
+ *
+ * Concurrency: oci_store_layer_has is a single stat(2) and is inherently
+ * racy with concurrent writers, but the worst outcome is a redundant extract.
+ * oci_store_layer_commit publishes via rename(2) (atomic) and treats
+ * EEXIST / ENOTEMPTY at the destination as a benign loss to a racing
+ * winner: the loser's staging directory is removed and 0 is returned so the
+ * caller can proceed as though the entry was already on disk. No store-wide
+ * lock is required.
+ */
+
+/* Probe whether <root>/layers/<algo>/<hex>/ exists. diff_id is in canonical
+ * "<algo>:<hex>" form. Returns 1 (present, is a directory), 0 (absent), or
+ * -1 with errno preserved on any unexpected IO error. A malformed diff_id
+ * returns -1 with errno=EINVAL.
+ */
+int oci_store_layer_has(oci_store_t *s, const char *diff_id);
+
+/* Compose <root>/layers/<algo>/<hex>/ for diff_id into out. Trailing slash
+ * included so a downstream strcat(child) composes cleanly. Pure path
+ * computation; does not stat or mkdir. Returns 0 on success, -1 with errno
+ * EINVAL on malformed diff_id, ENAMETOOLONG on buffer overflow.
+ */
+int oci_store_layer_resolve(oci_store_t *s,
+                            const char *diff_id,
+                            char *out,
+                            size_t cap);
+
+/* Compose <root>/layers/.staging/<algo>-<hex>-<rand> for diff_id into out.
+ * The path is unique per call; the directory is NOT created (clonefile(2)
+ * creates it as a side effect). Returns 0 on success, -1 with errno EINVAL
+ * on malformed diff_id, ENAMETOOLONG on overflow, or other errno values
+ * propagated from getentropy(2).
+ */
+int oci_store_layer_stage_path(oci_store_t *s,
+                               const char *diff_id,
+                               char *out,
+                               size_t cap);
+
+/* Atomically publish a populated staging directory as the layer cache entry
+ * for diff_id via rename(stage_path, <root>/layers/<algo>/<hex>/). If the
+ * destination already exists (EEXIST / ENOTEMPTY: a concurrent writer landed
+ * the same entry first) the staging directory is removed and 0 is returned.
+ * Any other failure returns -1 with errno preserved and *err (when non-NULL)
+ * populated; the staging directory is left in place so the caller can retry
+ * or inspect it.
+ */
+int oci_store_layer_commit(oci_store_t *s,
+                           const char *stage_path,
+                           const char *diff_id,
+                           const char **err);
+
+/* Plan 3 C3.3c: ChainID-keyed assembled-stack cache.
+ *
+ * The stack cache lives under <root>/layers/stacks/<algo>/<hex>/ in the same
+ * content-addressed shape as the per-layer raw cache. Each entry holds an
+ * assembled cumulative stage_dir state through some prefix of an image's
+ * layer list, keyed by the OCI ChainID for the terminating layer (see
+ * src/oci/digest.h::oci_chainid_compute). Cross-image dedup works because
+ * any two images that share the same ordered layer prefix produce the same
+ * ChainID for that prefix; the longest-prefix match short-circuits the
+ * per-layer assembly during oci_unpack.
+ *
+ * Staging shares the <root>/layers/.staging/ directory with the per-layer
+ * raw cache. Stack stage paths are prefixed with "stack-" so a debug walk
+ * of .staging/ can tell the two artifact families apart at a glance.
The
+ * commit destination is what disambiguates the two on disk.
+ *
+ * Concurrency mirrors the layer-cache APIs: stack_has is a single stat(2)
+ * (racy with concurrent writers; worst case is one redundant assembly),
+ * stack_commit publishes via rename(2) and treats EEXIST / ENOTEMPTY as a
+ * benign loss to the racing winner with the staging tree torn down. No
+ * store-wide lock is required.
+ */
+
+/* Probe whether <root>/layers/stacks/<algo>/<hex>/ exists. chain_id is in
+ * canonical "<algo>:<hex>" form. Returns 1 (present, is a directory), 0
+ * (absent), or -1 with errno preserved on any unexpected IO error. A
+ * malformed chain_id returns -1 with errno=EINVAL.
+ */
+int oci_store_stack_has(oci_store_t *s, const char *chain_id);
+
+/* Compose <root>/layers/stacks/<algo>/<hex>/ for chain_id into out.
+ * Trailing slash included so a downstream strcat(child) composes cleanly.
+ * Pure path computation; does not stat or mkdir. Returns 0 on success, -1
+ * with errno EINVAL on malformed chain_id, ENAMETOOLONG on buffer overflow.
+ */
+int oci_store_stack_resolve(oci_store_t *s,
+                            const char *chain_id,
+                            char *out,
+                            size_t cap);
+
+/* Compose <root>/layers/.staging/stack-<algo>-<hex>-<rand> for chain_id
+ * into out. The path is unique per call; the directory is NOT created
+ * (clonefile(2) creates it as a side effect). Returns 0 on success, -1 with
+ * errno EINVAL on malformed chain_id, ENAMETOOLONG on overflow, or other
+ * errno values propagated from getentropy(2).
+ */
+int oci_store_stack_stage_path(oci_store_t *s,
+                               const char *chain_id,
+                               char *out,
+                               size_t cap);
+
+/* Atomically publish a populated staging directory as the stack cache entry
+ * for chain_id via rename(stage_path, <root>/layers/stacks/<algo>/<hex>/).
+ * If the destination already exists (EEXIST / ENOTEMPTY: a concurrent writer
+ * landed the same entry first) the staging directory is removed and 0 is
+ * returned.
Any other failure returns -1 with errno preserved and *err
+ * (when non-NULL) populated; the staging directory is left in place so the
+ * caller can retry or inspect it.
+ */
+int oci_store_stack_commit(oci_store_t *s,
+                           const char *stage_path,
+                           const char *chain_id,
+                           const char **err);
diff --git a/src/oci/tar.c b/src/oci/tar.c
new file mode 100644
index 0000000..6dbf22b
--- /dev/null
+++ b/src/oci/tar.c
@@ -0,0 +1,759 @@
+/* OCI tar reader implementation
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "oci/tar.h"
+
+#define TAR_BLOCK_SIZE 512
+#define TAR_PATH_MAX 4096
+#define TAR_LINKNAME_MAX 4096
+#define TAR_MAX_LONG_RECORD (TAR_PATH_MAX * 4)
+
+/* POSIX 1003.1-1990 ustar header offsets and lengths. The reader does
+ * not synthesize the struct; it indexes the raw block directly to
+ * avoid any padding/alignment ambiguity.
+ */
+#define OFF_NAME 0
+#define OFF_MODE 100
+#define OFF_UID 108
+#define OFF_GID 116
+#define OFF_SIZE 124
+#define OFF_MTIME 136
+#define OFF_CHKSUM 148
+#define OFF_TYPEFLAG 156
+#define OFF_LINKNAME 157
+#define OFF_MAGIC 257
+#define OFF_VERSION 263
+#define OFF_PREFIX 345
+
+#define LEN_NAME 100
+#define LEN_MODE 8
+#define LEN_UID 8
+#define LEN_GID 8
+#define LEN_SIZE 12
+#define LEN_MTIME 12
+#define LEN_CHKSUM 8
+#define LEN_LINKNAME 100
+#define LEN_MAGIC 6
+#define LEN_VERSION 2
+#define LEN_PREFIX 155
+
+struct oci_tar_reader {
+    /* Byte source supplied to oci_tar_reader_new, plus its opaque cookie. */
+    oci_tar_read_fn read_fn;
+    void *ctx;
+
+    /* Current entry buffers. path and linkname are reused across
+     * entries and grow on demand: header-derived names fit within
+     * TAR_PATH_MAX, while a pending long name or link target may be as
+     * large as the record that carried it (at most TAR_MAX_LONG_RECORD).
+     */
+    char *path;
+    size_t path_cap;
+    char *linkname;
+    size_t linkname_cap;
+
+    /* GNU long-name pending payload. When the previous block was an 'L'
+     * or 'K' typeflag, the next non-extension entry consumes this
+     * buffer instead of the in-header name/linkname.
+     */
+    char *pending_long_name;
+    char *pending_long_link;
+
+    /* Payload tracking for the current entry. bytes_remaining counts
+     * unread payload bytes; padding_remaining is the trailing zero
+     * padding the reader must consume before the next header.
+     */
+    uint64_t bytes_remaining;
+    uint32_t padding_remaining;
+
+    /* Two consecutive zero blocks terminate the archive. */
+    bool saw_first_zero_block;
+};
+
+/* Fill buf with exactly want bytes, retrying short reads from the
+ * byte-source callback.
+ *
+ * Returns 1 when buf is completely filled, 0 when the source reports
+ * EOF before delivering any byte, and -1 when the callback fails or
+ * EOF arrives after a partial read. Distinguishing the last two keeps
+ * a stream cut in the middle of a block from being mistaken for a
+ * clean end of archive.
+ */
+static int read_full(oci_tar_reader_t *r, void *buf, size_t want)
+{
+    uint8_t *p = buf;
+    size_t got = 0;
+    while (got < want) {
+        ssize_t n = r->read_fn(r->ctx, p + got, want - got);
+        if (n < 0)
+            return -1;
+        if (n == 0)
+            return got == 0 ? 0 : -1;
+        got += (size_t) n;
+    }
+    return 1;
+}
+
+/* Read one 512-byte header block; same return contract as read_full. */
+static int fill_block(oci_tar_reader_t *r, uint8_t *block)
+{
+    return read_full(r, block, TAR_BLOCK_SIZE);
+}
+
+/* Read and throw away bytes from the source. Returns 0 once all of
+ * them were consumed, -1 on read error or premature EOF.
+ */
+static int discard_bytes(oci_tar_reader_t *r, uint64_t bytes)
+{
+    uint8_t scratch[TAR_BLOCK_SIZE];
+    while (bytes > 0) {
+        size_t take = bytes > TAR_BLOCK_SIZE ? TAR_BLOCK_SIZE : (size_t) bytes;
+        int rc = read_full(r, scratch, take);
+        if (rc <= 0)
+            return -1;
+        bytes -= take;
+    }
+    return 0;
+}
+
+/* True when every byte of the 512-byte block is zero. */
+static bool is_zero_block(const uint8_t *block)
+{
+    for (size_t i = 0; i < TAR_BLOCK_SIZE; i++)
+        if (block[i] != 0)
+            return false;
+    return true;
+}
+
+/* Parse a NUL- or space-terminated octal field. Returns -1 on
+ * unparseable input. Empty (all-zero) fields read as 0, which is the
+ * tar convention for fields the writer left unset.
+ */
+static int parse_octal(const uint8_t *field, size_t len, uint64_t *out)
+{
+    if (len == 0) {
+        *out = 0;
+        return 0;
+    }
+
+    /* GNU base-256 extension: high bit set in the first byte means the
+     * field is a big-endian two's-complement binary integer. tar uses
+     * this for sizes that overflow the 11-octal-digit ceiling (~8 GiB).
+     * Bit 0x40 of the first byte is the sign bit; its low six bits are
+     * the most significant value bits.
+     */
+    if (field[0] & 0x80) {
+        /* Negative values cannot be represented in the unsigned result. */
+        if (field[0] & 0x40)
+            return -1;
+        uint64_t v = field[0] & 0x3f;
+        for (size_t i = 1; i < len; i++) {
+            /* Checking before each shift rejects anything wider than
+             * 64 bits without ever shifting by more than the type
+             * width, whatever the field length.
+             */
+            if (v > (UINT64_MAX >> 8))
+                return -1;
+            v = (v << 8) | field[i];
+        }
+        *out = v;
+        return 0;
+    }
+
+    uint64_t v = 0;
+    size_t i = 0;
+    while (i < len && (field[i] == ' ' || field[i] == '\0'))
+        i++;
+    if (i == len) {
+        *out = 0;
+        return 0;
+    }
+    for (; i < len; i++) {
+        uint8_t c = field[i];
+        if (c == ' ' || c == '\0')
+            break;
+        if (c < '0' || c > '7')
+            return -1;
+        if (v > (UINT64_MAX >> 3))
+            return -1;
+        v = (v << 3) | (uint64_t) (c - '0');
+    }
+    *out = v;
+    return 0;
+}
+
+/* Validate the header checksum stored at OFF_CHKSUM against the sum of
+ * all 512 header bytes.
+ */
+static bool verify_chksum(const uint8_t *block)
+{
+    /* The chksum field itself is included in the sum as 8 spaces. */
+    uint64_t parsed = 0;
+    if (parse_octal(block + OFF_CHKSUM, LEN_CHKSUM, &parsed) < 0)
+        return false;
+    uint32_t expected = (uint32_t) parsed;
+
+    uint32_t sum_unsigned = 0;
+    int32_t sum_signed = 0;
+    for (size_t i = 0; i < TAR_BLOCK_SIZE; i++) {
+        uint8_t b = block[i];
+        if (i >= OFF_CHKSUM && i < OFF_CHKSUM + LEN_CHKSUM)
+            b = ' ';
+        sum_unsigned += b;
+        sum_signed += (int8_t) b;
+    }
+    /* Historical tar implementations used signed bytes when computing
+     * the checksum; accept either to interop with both.
+     */
+    return sum_unsigned == expected || (uint32_t) sum_signed == expected;
+}
+
+static bool is_ustar_magic(const uint8_t *block)
+{
+    /* POSIX "ustar\0" version "00", or GNU "ustar  \0" (space-space-NUL
+     * starting at OFF_MAGIC + 5). Both shapes are common in real images
+     * and share the five-byte "ustar" stem, which is all that is checked.
+     */
+    return memcmp(block + OFF_MAGIC, "ustar", 5) == 0;
+}
+
+static int copy_field_string(const uint8_t *field,
+                             size_t field_len,
+                             char *out,
+                             size_t out_cap)
+{
+    /* Tar fields are NUL-terminated only when shorter than the slot.
+     * Truncation is normal: copy up to the first NUL (or field_len),
+     * then NUL-terminate the destination.
+     */
+    size_t n = 0;
+    while (n < field_len && field[n] != '\0')
+        n++;
+    if (n + 1 > out_cap)
+        return -1;
+    memcpy(out, field, n);
+    out[n] = '\0';
+    return 0;
+}
+
+/* Assemble the entry path from the header's name field and, for POSIX
+ * ustar headers only, the prefix field. Returns 0 on success, -1 when
+ * the result does not fit in out_cap bytes.
+ */
+static int build_path_from_header(const uint8_t *block,
+                                  char *out,
+                                  size_t out_cap)
+{
+    char name[LEN_NAME + 1];
+    char prefix[LEN_PREFIX + 1];
+    if (copy_field_string(block + OFF_NAME, LEN_NAME, name, sizeof(name)) < 0)
+        return -1;
+
+    /* Only POSIX ustar (magic "ustar\0") defines the bytes at
+     * OFF_PREFIX as a path prefix. Old-GNU headers (magic "ustar  \0")
+     * reuse that region for atime / ctime / sparse bookkeeping, so
+     * reading it as a prefix would splice numeric garbage in front of
+     * the name. GNU writers express long paths through 'L' records.
+     */
+    prefix[0] = '\0';
+    if (memcmp(block + OFF_MAGIC, "ustar\0", LEN_MAGIC) == 0 &&
+        copy_field_string(block + OFF_PREFIX, LEN_PREFIX, prefix,
+                          sizeof(prefix)) < 0)
+        return -1;
+
+    size_t pn = strlen(prefix);
+    size_t nn = strlen(name);
+    if (pn == 0) {
+        if (nn + 1 > out_cap)
+            return -1;
+        memcpy(out, name, nn + 1);
+        return 0;
+    }
+    /* ustar joins prefix + '/' + name. */
+    if (pn + 1 + nn + 1 > out_cap)
+        return -1;
+    memcpy(out, prefix, pn);
+    out[pn] = '/';
+    memcpy(out + pn + 1, name, nn);
+    out[pn + 1 + nn] = '\0';
+    return 0;
+}
+
+/* Grow *buf (geometrically, starting at 256 bytes) until it holds at
+ * least want bytes. Returns 0 on success, -1 if realloc fails, in
+ * which case *buf and *cap are left untouched.
+ */
+static int ensure_capacity(char **buf, size_t *cap, size_t want)
+{
+    if (*cap >= want)
+        return 0;
+    size_t new_cap = *cap == 0 ? 256 : *cap;
+    while (new_cap < want)
+        new_cap *= 2;
+    char *grown = realloc(*buf, new_cap);
+    if (!grown)
+        return -1;
+    *buf = grown;
+    *cap = new_cap;
+    return 0;
+}
+
+/* Trim trailing '/' characters in place; a lone "/" is preserved. */
+static const char *strip_trailing_slash(char *s)
+{
+    size_t n = strlen(s);
+    while (n > 1 && s[n - 1] == '/')
+        s[--n] = '\0';
+    return s;
+}
+
+/* Set the whiteout flags from the basename of e->path:
+ * ".wh..wh..opq" marks an opaque whiteout, any other ".wh." prefix a
+ * plain whiteout.
+ */
+static void apply_whiteout_flags(oci_tar_entry_t *e)
+{
+    e->is_whiteout = false;
+    e->is_opaque_whiteout = false;
+    if (!e->path)
+        return;
+    const char *slash = strrchr(e->path, '/');
+    const char *base = slash ? slash + 1 : e->path;
+    if (strcmp(base, ".wh..wh..opq") == 0)
+        e->is_opaque_whiteout = true;
+    else if (strncmp(base, ".wh.", 4) == 0)
+        e->is_whiteout = true;
+}
+
+/* Allocate a zero-initialised reader bound to read_fn / ctx. Returns
+ * NULL when read_fn is NULL or the allocation fails.
+ */
+oci_tar_reader_t *oci_tar_reader_new(oci_tar_read_fn read_fn, void *ctx)
+{
+    if (!read_fn)
+        return NULL;
+    oci_tar_reader_t *r = calloc(1, sizeof(*r));
+    if (!r)
+        return NULL;
+    r->read_fn = read_fn;
+    r->ctx = ctx;
+    return r;
+}
+
+/* Release the reader and every buffer it owns. NULL is a no-op. */
+void oci_tar_reader_free(oci_tar_reader_t *r)
+{
+    if (!r)
+        return;
+    free(r->path);
+    free(r->linkname);
+    free(r->pending_long_name);
+    free(r->pending_long_link);
+    free(r);
+}
+
+/* Consume a GNU 'L' or 'K' typeflag payload into a freshly allocated
+ * buffer that will be claimed by the next non-extension entry.
+ */
+static int consume_long_record(oci_tar_reader_t *r,
+                               uint64_t size,
+                               char **out,
+                               const char **err)
+{
+    if (size == 0 || size > TAR_MAX_LONG_RECORD) {
+        *err = "tar GNU long-name record out of bounds";
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    char *buf = malloc((size_t) size + 1);
+    if (!buf) {
+        *err = "tar long-name buffer allocation failed";
+        errno = ENOMEM;
+        return -1;
+    }
+    int rc = read_full(r, buf, (size_t) size);
+    if (rc <= 0) {
+        free(buf);
+        *err = "tar GNU long-name record truncated";
+        errno = EIO;
+        return -1;
+    }
+    /* Padding alignment is computed against the on-wire record length,
+     * NOT the C-string length after stripping trailing NULs. Doing the
+     * trim before discard_bytes would drift the source position by the
+     * trim count and misalign the next header read.
+     */
+    uint64_t pad = (TAR_BLOCK_SIZE - (size % TAR_BLOCK_SIZE)) % TAR_BLOCK_SIZE;
+    if (pad > 0 && discard_bytes(r, pad) < 0) {
+        free(buf);
+        *err = "tar GNU long-name padding truncated";
+        errno = EIO;
+        return -1;
+    }
+    /* GNU records are NUL-terminated on the wire, but defensively
+     * ensure the caller-visible string has a terminator regardless of
+     * the in-record byte layout.
+     */
+    buf[size] = '\0';
+    free(*out);
+    *out = buf;
+    return 0;
+}
+
+/* Consume an 'x' (per-file) or 'g' (global) PAX extended-header
+ * payload. Per POSIX.1-2001 the payload is a stream of records of
+ * the form "<length> <keyword>=<value>\n" where <length> is the total
+ * byte count of the record including its own length digits and trailing
+ * newline. The unpack pipeline cares only about long names: `path`
+ * overrides the next entry's name and `linkpath` overrides its
+ * linkname. Both are promoted into the same pending buffers the GNU
+ * 'L'/'K' path uses so downstream code stays unaware of the format.
+ *
+ * Global ('g') records establish defaults for all subsequent
+ * entries; container builders almost never set path / linkpath
+ * defaults globally (the use case is local mtime / atime / uid
+ * defaults), so the implementation discards the payload bytes-
+ * correctly without parsing. Other PAX keys (size, mtime, atime,
+ * uid, gid, xattrs) are not tracked by the unpack pipeline and are
+ * silently ignored from per-file records as well.
+ *
+ * Returns 0 on success, -1 with errno and *err set on an oversized,
+ * truncated, or mis-framed payload, or on allocation failure.
+ */
+static int consume_pax_record(oci_tar_reader_t *r,
+                              uint64_t size,
+                              int is_global,
+                              const char **err)
+{
+    if (size > TAR_MAX_LONG_RECORD) {
+        *err = "tar PAX record out of bounds";
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    char *buf = NULL;
+    if (size > 0) {
+        buf = malloc((size_t) size + 1);
+        if (!buf) {
+            *err = "tar PAX buffer allocation failed";
+            errno = ENOMEM;
+            return -1;
+        }
+        int rc = read_full(r, buf, (size_t) size);
+        if (rc <= 0) {
+            free(buf);
+            *err = "tar PAX record truncated";
+            errno = EIO;
+            return -1;
+        }
+        buf[size] = '\0';
+    }
+    /* Padding alignment is computed against the on-wire record length,
+     * matching consume_long_record so the next header read stays
+     * aligned even when size is not a multiple of TAR_BLOCK_SIZE.
+     */
+    uint64_t pad = (TAR_BLOCK_SIZE - (size % TAR_BLOCK_SIZE)) % TAR_BLOCK_SIZE;
+    if (pad > 0 && discard_bytes(r, pad) < 0) {
+        free(buf);
+        *err = "tar PAX padding truncated";
+        errno = EIO;
+        return -1;
+    }
+
+    if (is_global || size == 0) {
+        free(buf);
+        return 0;
+    }
+
+    char *p = buf;
+    char *end = buf + size;
+    while (p < end) {
+        size_t avail = (size_t) (end - p);
+        char *space = memchr(p, ' ', avail);
+        if (!space) {
+            free(buf);
+            *err = "tar PAX record missing length terminator";
+            errno = EINVAL;
+            return -1;
+        }
+        /* A PAX length is a bare run of decimal digits. strtol would
+         * also accept leading whitespace and a sign, which lets a
+         * crafted record shift the parsed digits away from the start
+         * of the record, so insist the first byte is a digit.
+         */
+        char *endp = NULL;
+        long len = 0;
+        if (*p >= '0' && *p <= '9')
+            len = strtol(p, &endp, 10);
+        if (endp != space || len <= 0) {
+            free(buf);
+            *err = "tar PAX record length unparseable";
+            errno = EINVAL;
+            return -1;
+        }
+        /* The record must fit in the remaining payload and be longer
+         * than its own "<digits> " prefix so that the trailing newline
+         * lies after the space. Both are checked on integers before
+         * any pointer is formed, so p + len never leaves the buffer.
+         */
+        size_t prefix_len = (size_t) (space - p) + 1;
+        if ((unsigned long) len > avail || (size_t) len <= prefix_len ||
+            p[len - 1] != '\n') {
+            free(buf);
+            *err = "tar PAX record framing invalid";
+            errno = EINVAL;
+            return -1;
+        }
+        char *record_end = p + len;
+        char *kvp = space + 1;
+        char *eq = memchr(kvp, '=', (size_t) (record_end - 1 - kvp));
+        if (eq) {
+            size_t key_len = (size_t) (eq - kvp);
+            const char *val = eq + 1;
+            size_t val_len = (size_t) (record_end - 1 - val);
+            char **slot = NULL;
+            if (key_len == 4 && memcmp(kvp, "path", 4) == 0)
+                slot = &r->pending_long_name;
+            else if (key_len == 8 && memcmp(kvp, "linkpath", 8) == 0)
+                slot = &r->pending_long_link;
+            if (slot) {
+                char *copy = malloc(val_len + 1);
+                if (!copy) {
+                    free(buf);
+                    *err = "tar PAX value allocation failed";
+                    errno = ENOMEM;
+                    return -1;
+                }
+                memcpy(copy, val, val_len);
+                copy[val_len] = '\0';
+                free(*slot);
+                *slot = copy;
+            }
+        }
+        p = record_end;
+    }
+    free(buf);
+    return 0;
+}
+
+/* Map a header typeflag onto the reader's entry taxonomy.
+ * Returns 0 with *out set for a real entry, 1 for a GNU long-name
+ * extension, 2 for a PAX extended header, and -1 for an unknown flag.
+ */
+static int classify_typeflag(uint8_t flag, oci_tar_type_t *out)
+{
+    switch (flag) {
+    case '\0':
+    case '0':
+    case '7': /* contiguous file: treat as regular */
+        *out = OCI_TAR_REG;
+        return 0;
+    case '1':
+        *out = OCI_TAR_HARDLINK;
+        return 0;
+    case '2':
+        *out = OCI_TAR_SYMLINK;
+        return 0;
+    case '3':
+    case '4':
+    case '6':
+        *out = OCI_TAR_UNSUPPORTED; /* char/block/fifo */
+        return 0;
+    case '5':
+        *out = OCI_TAR_DIR;
+        return 0;
+    case 'L':
+    case 'K':
+        return 1; /* GNU long-name extension; caller handles payload */
+    case 'x':
+    case 'g':
+        return 2; /* PAX extended; caller consumes via consume_pax_record */
+    default:
+        return -1; /* unknown */
+    }
+}
+
+int oci_tar_next(oci_tar_reader_t *r, oci_tar_entry_t *out, const char **err)
+{
+    /* Local sink so a NULL err never needs checking again and no
+     * state is shared between concurrent readers.
+     */
+    const char *dummy_err = NULL;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+
+    if (!r || !out) {
+        *err = "tar next called with NULL argument";
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* Any unconsumed payload from a prior entry must be drained before
+     * the next header is read; the contract is that callers either
+     * read fully or skip explicitly, but defensively flush here too.
+     */
+    if (r->bytes_remaining > 0 || r->padding_remaining > 0) {
+        if (oci_tar_skip_payload(r, err) < 0)
+            return -1;
+    }
+
+    for (;;) {
+        uint8_t block[TAR_BLOCK_SIZE];
+        int rc = fill_block(r, block);
+        if (rc < 0) {
+            *err = "tar header read failed";
+            errno = EIO;
+            return -1;
+        }
+        if (rc == 0) {
+            /* Stream ended cleanly mid-archive. Treat as EOF; the
+             * archive may have omitted the final zero blocks, which
+             * happens with hand-rolled tarballs.
+             */
+            return 0;
+        }
+
+        if (is_zero_block(block)) {
+            if (r->saw_first_zero_block)
+                return 0;
+            r->saw_first_zero_block = true;
+            continue;
+        }
+        r->saw_first_zero_block = false;
+
+        if (!verify_chksum(block)) {
+            *err = "tar header checksum mismatch";
+            errno = EINVAL;
+            return -1;
+        }
+        if (!is_ustar_magic(block)) {
+            *err = "tar header missing ustar magic";
+            errno = EINVAL;
+            return -1;
+        }
+
+        uint8_t typeflag = block[OFF_TYPEFLAG];
+        oci_tar_type_t type;
+        int klass = classify_typeflag(typeflag, &type);
+        if (klass < 0) {
+            *err = "tar header carries unknown typeflag";
+            errno = EINVAL;
+            return -1;
+        }
+
+        uint64_t size = 0;
+        if (parse_octal(block + OFF_SIZE, LEN_SIZE, &size) < 0) {
+            *err = "tar header size field unparseable";
+            errno = EINVAL;
+            return -1;
+        }
+
+        if (klass == 1) {
+            /* GNU long-name / long-link extension entry. The payload
+             * carries the path or linkname for the NEXT real entry.
+             */
+            char **slot =
+                typeflag == 'L' ? &r->pending_long_name : &r->pending_long_link;
+            if (consume_long_record(r, size, slot, err) < 0)
+                return -1;
+            continue;
+        }
+        if (klass == 2) {
+            if (consume_pax_record(r, size, typeflag == 'g', err) < 0)
+                return -1;
+            continue;
+        }
+
+        /* Real entry. Materialize path and linkname into reader-owned
+         * buffers so the caller can read entry.path stably until the
+         * next oci_tar_next call.
+         */
+        if (r->pending_long_name) {
+            size_t want = strlen(r->pending_long_name) + 1;
+            if (ensure_capacity(&r->path, &r->path_cap, want) < 0) {
+                *err = "tar path buffer allocation failed";
+                errno = ENOMEM;
+                return -1;
+            }
+            memcpy(r->path, r->pending_long_name, want);
+            free(r->pending_long_name);
+            r->pending_long_name = NULL;
+        } else {
+            if (ensure_capacity(&r->path, &r->path_cap, TAR_PATH_MAX) < 0) {
+                *err = "tar path buffer allocation failed";
+                errno = ENOMEM;
+                return -1;
+            }
+            if (build_path_from_header(block, r->path, r->path_cap) < 0) {
+                *err = "tar header path overflow";
+                errno = ENAMETOOLONG;
+                return -1;
+            }
+        }
+        if (r->pending_long_link) {
+            size_t want = strlen(r->pending_long_link) + 1;
+            if (ensure_capacity(&r->linkname, &r->linkname_cap, want) < 0) {
+                *err = "tar linkname buffer allocation failed";
+                errno = ENOMEM;
+                return -1;
+            }
+            memcpy(r->linkname, r->pending_long_link, want);
+            free(r->pending_long_link);
+            r->pending_long_link = NULL;
+        } else {
+            if (ensure_capacity(&r->linkname, &r->linkname_cap,
+                                LEN_LINKNAME + 1) < 0) {
+                *err = "tar linkname buffer allocation failed";
+                errno = ENOMEM;
+                return -1;
+            }
+            if (copy_field_string(block + OFF_LINKNAME, LEN_LINKNAME,
+                                  r->linkname, r->linkname_cap) < 0) {
+                *err = "tar linkname overflow";
+                errno = ENAMETOOLONG;
+                return -1;
+            }
+        }
+
+        /* Dirs may carry a trailing slash in name; normalize it away so
+         * downstream matchers do not have to special-case both forms.
+         */
+        if (type == OCI_TAR_DIR)
+            strip_trailing_slash(r->path);
+
+        /* DIR entries should have size 0 in practice, but tolerate
+         * archives that record an unused size. The payload contract
+         * is "advertise size bytes; the reader consumes them".
+         */
+        out->path = r->path;
+        out->linkname = (type == OCI_TAR_SYMLINK || type == OCI_TAR_HARDLINK)
+                            ? r->linkname
+                            : NULL;
+        out->type = type;
+
+        uint64_t mode = 0;
+        uint64_t uid = 0;
+        uint64_t gid = 0;
+        uint64_t mtime = 0;
+        if (parse_octal(block + OFF_MODE, LEN_MODE, &mode) < 0 ||
+            parse_octal(block + OFF_UID, LEN_UID, &uid) < 0 ||
+            parse_octal(block + OFF_GID, LEN_GID, &gid) < 0 ||
+            parse_octal(block + OFF_MTIME, LEN_MTIME, &mtime) < 0) {
+            *err = "tar header numeric field unparseable";
+            errno = EINVAL;
+            return -1;
+        }
+        out->mode = (uint32_t) (mode & 07777);
+        out->uid = uid;
+        out->gid = gid;
+        out->mtime = mtime;
+        out->size = type == OCI_TAR_REG ? size : 0;
+
+        apply_whiteout_flags(out);
+
+        /* Stage payload tracking. Even non-regular entries may have a
+         * recorded size that the writer expects the reader to skip
+         * (rare, but real). The reader honors whatever size field the
+         * header advertises so the stream stays aligned.
+         */
+        r->bytes_remaining = size;
+        r->padding_remaining =
+            size > 0 ? (uint32_t) ((TAR_BLOCK_SIZE - (size % TAR_BLOCK_SIZE)) %
+                                   TAR_BLOCK_SIZE)
+                     : 0;
+
+        return 1;
+    }
+}
+
+int oci_tar_read_payload(oci_tar_reader_t *r,
+                         void *buf,
+                         size_t cap,
+                         size_t *got,
+                         const char **err)
+{
+    /* Local sink for a NULL err; see oci_tar_next. */
+    const char *dummy_err = NULL;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (got)
+        *got = 0;
+    if (!r || !buf || !got) {
+        *err = "tar read called with NULL argument";
+        errno = EINVAL;
+        return -1;
+    }
+
+    if (r->bytes_remaining == 0) {
+        /* Drain any tail padding so the next oci_tar_next aligns. */
+        if (r->padding_remaining > 0) {
+            if (discard_bytes(r, r->padding_remaining) < 0) {
+                *err = "tar padding read failed";
+                errno = EIO;
+                return -1;
+            }
+            r->padding_remaining = 0;
+        }
+        return 0;
+    }
+
+    size_t want = cap > r->bytes_remaining ?
(size_t) r->bytes_remaining : cap;
+    int rc = read_full(r, buf, want);
+    if (rc <= 0) {
+        *err = "tar payload truncated";
+        errno = EIO;
+        return -1;
+    }
+    r->bytes_remaining -= want;
+    *got = want;
+    return 0;
+}
+
+int oci_tar_skip_payload(oci_tar_reader_t *r, const char **err)
+{
+    /* Local sink so a NULL err never needs checking again and no
+     * state is shared between concurrent readers.
+     */
+    const char *dummy_err = NULL;
+    if (!err)
+        err = &dummy_err;
+    *err = NULL;
+    if (!r) {
+        *err = "tar skip called with NULL reader";
+        errno = EINVAL;
+        return -1;
+    }
+    /* The counters are cleared before the discard, so a failed skip
+     * leaves the reader with no payload outstanding.
+     */
+    uint64_t drop = r->bytes_remaining + r->padding_remaining;
+    r->bytes_remaining = 0;
+    r->padding_remaining = 0;
+    if (drop == 0)
+        return 0;
+    if (discard_bytes(r, drop) < 0) {
+        *err = "tar skip read failed";
+        errno = EIO;
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/oci/tar.h b/src/oci/tar.h
new file mode 100644
index 0000000..54e9615
--- /dev/null
+++ b/src/oci/tar.h
@@ -0,0 +1,88 @@
+/* OCI tar reader (POSIX ustar + GNU long-name + PAX path/linkpath subset)
+ *
+ * Streams tar entries out of a generic byte source so the parser stays
+ * compression-agnostic; oci/decompress.c feeds either zlib, libzstd, or
+ * a passthrough stream into the read callback. The applier in
+ * oci/layer-apply.c then walks entries in order, dispatching by
+ * oci_tar_type_t.
+ *
+ * Char, block, and fifo entries collapse to OCI_TAR_UNSUPPORTED
+ * so the applier can emit a precise refusal without re-decoding the
+ * typeflag; any other unrecognised typeflag is a hard error. PAX
+ * extended headers are accepted as a targeted subset: per-file ('x')
+ * `path` and `linkpath` records override the next entry's name and
+ * linkname, while every other key and every global ('g') record is
+ * consumed and ignored. This follows the oci-roadmap.md Q3 asymmetric
+ * subset; if a real image is ever found to require further PAX
+ * records (mtime, size, ...), expand the accept list with targeted
+ * parsing rather than enabling generic PAX.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+typedef enum {
+    OCI_TAR_REG,
+    OCI_TAR_DIR,
+    OCI_TAR_SYMLINK,
+    OCI_TAR_HARDLINK,
+    OCI_TAR_UNSUPPORTED,
+} oci_tar_type_t;
+
+typedef struct {
+    /* path and linkname are owned by the reader and remain valid until
+     * the next oci_tar_next call. They are const to the caller: the backing
+     * storage belongs to the reader, so callers must not mutate it, and any
+     * value kept past the next iteration must be duplicated.
+     */
+    const char *path;
+    const char *linkname;
+    uint64_t size;
+    uint32_t mode;
+    uint64_t uid;
+    uint64_t gid;
+    uint64_t mtime;
+    oci_tar_type_t type;
+    bool is_whiteout;
+    bool is_opaque_whiteout;
+} oci_tar_entry_t;
+
+/* Byte source callback. Returns >=0 bytes read into buf (0 means EOF)
+ * or -1 on I/O error with errno set. Short reads are tolerated; the
+ * reader retries until a full 512-byte block is available.
+ */
+typedef ssize_t (*oci_tar_read_fn)(void *ctx, void *buf, size_t cap);
+
+typedef struct oci_tar_reader oci_tar_reader_t;
+
+oci_tar_reader_t *oci_tar_reader_new(oci_tar_read_fn read_fn, void *ctx);
+void oci_tar_reader_free(oci_tar_reader_t *r);
+
+/* Pull the next header.
+ *   returns 1: entry populated; payload available via read or skip
+ *   returns 0: clean EOF (two zero blocks or stream end)
+ *   returns -1: protocol or I/O error; *err points to a static string
+ *
+ * Caller must call exactly one of oci_tar_read_payload or
+ * oci_tar_skip_payload (or read until *got == 0) before the next
+ * oci_tar_next so the reader can realign to the next 512-byte block.
+ */
+int oci_tar_next(oci_tar_reader_t *r, oci_tar_entry_t *out, const char **err);
+
+/* Copy up to cap bytes of the current entry's payload into buf.
+ * Returns 0 on success; *got carries the byte count (0 once the
+ * payload is exhausted). Returns -1 on read error with *err set.
+ */ +int oci_tar_read_payload(oci_tar_reader_t *r, + void *buf, + size_t cap, + size_t *got, + const char **err); + +/* Discard the rest of the current entry's payload plus its 512-byte + * block padding so the next oci_tar_next sees a fresh header. + */ +int oci_tar_skip_payload(oci_tar_reader_t *r, const char **err); diff --git a/src/oci/unpack.c b/src/oci/unpack.c new file mode 100644 index 0000000..fd99917 --- /dev/null +++ b/src/oci/unpack.c @@ -0,0 +1,1155 @@ +/* OCI layer unpack orchestrator implementation + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "oci/blob-store.h" +#include "oci/decompress.h" +#include "oci/digest.h" +#include "oci/layer-apply.h" +#include "oci/layer-meta.h" +#include "oci/manifest.h" +#include "oci/media-type.h" +#include "oci/origin-meta.h" +#include "oci/ref.h" +#include "oci/store.h" +#include "oci/tar.h" +#include "oci/unpack.h" +#include "oci/volume.h" + +#define UN_PATH_MAX 4096 +#define UN_BLOB_BUF 65536 + +typedef struct { + oci_stream_t *s; +} unpack_stream_ctx_t; + +static ssize_t unpack_stream_read_cb(void *ctx, void *buf, size_t cap) +{ + unpack_stream_ctx_t *c = ctx; + return oci_stream_read(c->s, buf, cap); +} + +static int set_err(const char **err, const char *msg, int err_no) +{ + if (err) + *err = msg; + errno = err_no; + return -1; +} + +static int mkdir_p(const char *path) +{ + char buf[UN_PATH_MAX]; + size_t n = strlen(path); + if (n >= sizeof(buf)) + return -1; + memcpy(buf, path, n + 1); + for (char *p = buf + 1; *p; p++) { + if (*p != '/') + continue; + *p = '\0'; + if (mkdir(buf, 0755) < 0 && errno != EEXIST) + return -1; + *p = '/'; + } + if (mkdir(buf, 0755) < 0 && errno != EEXIST) + return -1; + return 0; +} + +static int rand_hex(char *out, size_t n_hex) +{ + size_t need = n_hex / 2; + uint8_t buf[16]; + if (need > 
sizeof(buf)) + return -1; + if (getentropy(buf, need) < 0) + return -1; + static const char hex[] = "0123456789abcdef"; + for (size_t i = 0; i < need; i++) { + out[i * 2] = hex[buf[i] >> 4]; + out[i * 2 + 1] = hex[buf[i] & 0xf]; + } + out[n_hex] = '\0'; + return 0; +} + +static int read_blob(oci_blob_store_t *bs, + oci_digest_algo_t algo, + const char *hex, + uint8_t **out_buf, + size_t *out_len, + const char **err) +{ + char path[UN_PATH_MAX]; + if (oci_blob_store_path(bs, algo, hex, path, sizeof(path)) < 0) + return set_err(err, "unpack: blob path resolve failed", errno); + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return set_err(err, "unpack: blob open failed", errno); + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + return set_err(err, "unpack: blob fstat failed", errno); + } + if (st.st_size < 0 || st.st_size > (off_t) (256 * 1024 * 1024)) { + close(fd); + return set_err(err, "unpack: blob size out of bounds", EINVAL); + } + uint8_t *buf = malloc((size_t) st.st_size + 1); + if (!buf) { + close(fd); + return set_err(err, "unpack: blob buffer alloc failed", ENOMEM); + } + ssize_t got = read(fd, buf, (size_t) st.st_size); + close(fd); + if (got != st.st_size) { + free(buf); + return set_err(err, "unpack: blob short read", EIO); + } + buf[got] = '\0'; + *out_buf = buf; + *out_len = (size_t) got; + return 0; +} + +/* Open the on-disk blob path and confirm its sha256 hash matches the + * descriptor's expected digest. Phase 1 already verified at write time, + * but unpack re-verifies in case a host-side tool modified the blob. 
+ */ +static int reverify_layer_digest(oci_blob_store_t *bs, + const oci_descriptor_t *desc, + const char **err) +{ + char path[UN_PATH_MAX]; + if (oci_blob_store_path(bs, desc->algo, desc->hex, path, sizeof(path)) < 0) + return set_err(err, "unpack: layer path resolve failed", errno); + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return set_err(err, "unpack: layer open failed", errno); + + oci_digester_t *d = oci_digester_new(desc->algo); + if (!d) { + close(fd); + return set_err(err, "unpack: digester alloc failed", ENOMEM); + } + uint8_t buf[UN_BLOB_BUF]; + for (;;) { + ssize_t n = read(fd, buf, sizeof(buf)); + if (n < 0) { + if (errno == EINTR) + continue; + oci_digester_free(d); + close(fd); + return set_err(err, "unpack: layer read failed", errno); + } + if (n == 0) + break; + oci_digester_update(d, buf, (size_t) n); + } + close(fd); + char got_hex[OCI_DIGEST_HEX_MAX + 1]; + oci_digester_finish_hex(d, got_hex); + oci_digester_free(d); + if (strcmp(got_hex, desc->hex) != 0) + return set_err(err, "unpack: layer blob digest mismatch", EINVAL); + return 0; +} + +/* Recursively rm a path so clonefile(2) can recreate it. Matches the + * discipline used in src/oci/clone-rootfs.c: lstat + recurse, no shell-out. + * Returns 0 on success or when path was already absent; -1 with errno set + * on any unexpected IO error. The caller is responsible for ensuring path + * is safe to remove (e.g. a freshly mkdir'd stage_dir, not a user dir). + */ +static int rm_recursive(const char *path) +{ + struct stat st; + if (lstat(path, &st) < 0) + return errno == ENOENT ? 
0 : -1; + if (!S_ISDIR(st.st_mode)) + return unlink(path); + DIR *d = opendir(path); + if (!d) + return -1; + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[UN_PATH_MAX]; + int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name); + if (n < 0 || (size_t) n >= sizeof(child)) { + errno = ENAMETOOLONG; + rc = -1; + break; + } + if (rm_recursive(child) < 0) { + rc = -1; + break; + } + } + closedir(d); + if (rc == 0 && rmdir(path) < 0) + rc = -1; + return rc; +} + +/* Shared reverify + decompress + apply pipeline for the two + * single-layer entry points. UNPACK_MODE_OVERLAY drives + * oci_layer_apply (whiteout / opaque interpreted against root_dir); + * UNPACK_MODE_RAW drives oci_layer_apply_raw_tar (whiteout markers + * preserved as zero-byte regular files for the C3.3c raw per-layer + * cache populate path). + */ +typedef enum { + UNPACK_MODE_OVERLAY, + UNPACK_MODE_RAW, +} unpack_mode_t; + +static int unpack_layer_impl(oci_blob_store_t *bs, + const oci_descriptor_t *desc, + const char *root_dir, + unpack_mode_t mode, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char *log_label, + const char **err) +{ + if (!bs || !desc || !root_dir) + return set_err(err, "unpack_layer: NULL argument", EINVAL); + + if (oci_media_type_is_foreign(desc->media_type)) + return set_err(err, "unpack: layer is foreign / nondistributable", + ENOTSUP); + + if (reverify_layer_digest(bs, desc, err) < 0) + return -1; + char path[UN_PATH_MAX]; + if (oci_blob_store_path(bs, desc->algo, desc->hex, path, sizeof(path)) < 0) + return set_err(err, "unpack: layer path resolve failed", errno); + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return set_err(err, "unpack: layer open failed", errno); + + oci_compression_t alg = oci_media_type_compression(desc->media_type); + oci_stream_t *stream = oci_decompress_open(fd, alg, err); + if (!stream) { + 
close(fd); + return -1; + } + + unpack_stream_ctx_t tctx = {.s = stream}; + oci_tar_reader_t *r = oci_tar_reader_new(unpack_stream_read_cb, &tctx); + if (!r) { + oci_stream_close(stream); + close(fd); + return set_err(err, "unpack: tar reader alloc failed", ENOMEM); + } + + if (log_label) + fprintf(stderr, " %s: %s\n", log_label, desc->digest_str); + + oci_layer_apply_stats_t local_stats = {0}; + int rc; + if (mode == UNPACK_MODE_OVERLAY) + rc = oci_layer_apply(r, root_dir, &local_stats, meta, err); + else + rc = oci_layer_apply_raw_tar(r, root_dir, &local_stats, meta, err); + + oci_tar_reader_free(r); + oci_stream_close(stream); + close(fd); + + if (rc < 0) + return -1; + if (stats) { + stats->files += local_stats.files; + stats->dirs += local_stats.dirs; + stats->symlinks += local_stats.symlinks; + stats->hardlinks += local_stats.hardlinks; + stats->whiteouts += local_stats.whiteouts; + stats->opaques += local_stats.opaques; + } + if (log_label) + fprintf(stderr, + " +files=%zu dirs=%zu symlinks=%zu hardlinks=%zu " + "whiteouts=%zu opaques=%zu\n", + local_stats.files, local_stats.dirs, local_stats.symlinks, + local_stats.hardlinks, local_stats.whiteouts, + local_stats.opaques); + return 0; +} + +int oci_unpack_layer(oci_blob_store_t *bs, + const oci_descriptor_t *desc, + const char *stage_dir, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char *log_label, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + return unpack_layer_impl(bs, desc, stage_dir, UNPACK_MODE_OVERLAY, stats, + meta, log_label, err); +} + +int oci_unpack_layer_raw(oci_blob_store_t *bs, + const oci_descriptor_t *desc, + const char *raw_dir, + oci_layer_apply_stats_t *stats, + oci_meta_table_t *meta, + const char *log_label, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + return unpack_layer_impl(bs, desc, raw_dir, UNPACK_MODE_RAW, stats, meta, + log_label, err); +} + 
+/* --- C3.3c-ii two-pass overlay assembler ------------------------------- */ + +#define UN_RAW_META_SIDECAR ".elfuse-meta.layer.json" + +static bool is_whiteout_name(const char *name) +{ + return strncmp(name, ".wh.", 4) == 0; +} + +/* Remove every direct child of path, leaving path itself in place. Used + * to honour the OCI ".wh..wh..opq" opaque marker: the parent directory + * stays so this layer's siblings can land on top. + */ +static int clear_dir_contents(const char *path, const char **err) +{ + DIR *d = opendir(path); + if (!d) { + if (errno == ENOENT) + return 0; + return set_err(err, "assemble: clear opendir failed", errno); + } + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + char child[UN_PATH_MAX]; + int n = snprintf(child, sizeof(child), "%s/%s", path, de->d_name); + if (n < 0 || (size_t) n >= sizeof(child)) { + rc = set_err(err, "assemble: clear path overflow", ENAMETOOLONG); + break; + } + if (rm_recursive(child) < 0) { + rc = set_err(err, "assemble: clear rm child failed", errno); + break; + } + } + closedir(d); + return rc; +} + +static int assembly_walk_whiteouts(const char *raw_dir, + const char *stage_dir, + const char **err) +{ + DIR *d = opendir(raw_dir); + if (!d) + return set_err(err, "assemble: whiteout opendir failed", errno); + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + if (strcmp(de->d_name, UN_RAW_META_SIDECAR) == 0) + continue; + char raw_child[UN_PATH_MAX]; + char stage_child[UN_PATH_MAX]; + int n1 = snprintf(raw_child, sizeof(raw_child), "%s/%s", raw_dir, + de->d_name); + int n2 = snprintf(stage_child, sizeof(stage_child), "%s/%s", stage_dir, + de->d_name); + if (n1 < 0 || (size_t) n1 >= sizeof(raw_child) || n2 < 0 || + (size_t) n2 >= sizeof(stage_child)) { + rc = set_err(err, "assemble: whiteout path overflow", ENAMETOOLONG); + 
break; + } + struct stat st; + if (lstat(raw_child, &st) < 0) { + rc = set_err(err, "assemble: whiteout lstat failed", errno); + break; + } + if (S_ISDIR(st.st_mode)) { + /* Recurse: subdirectories may carry their own markers. The + * stage_child counterpart may not exist yet (pass 2 creates + * the missing directories), which is fine: descending into + * the raw side still finds the markers, and the rm-r / + * clear-contents calls below tolerate a missing target. + */ + if (assembly_walk_whiteouts(raw_child, stage_child, err) < 0) { + rc = -1; + break; + } + continue; + } + if (!S_ISREG(st.st_mode)) + continue; + if (strcmp(de->d_name, ".wh..wh..opq") == 0) { + if (clear_dir_contents(stage_dir, err) < 0) { + rc = -1; + break; + } + continue; + } + if (is_whiteout_name(de->d_name)) { + char target[UN_PATH_MAX]; + int nt = snprintf(target, sizeof(target), "%s/%s", stage_dir, + de->d_name + 4); + if (nt < 0 || (size_t) nt >= sizeof(target)) { + rc = set_err(err, "assemble: whiteout target overflow", + ENAMETOOLONG); + break; + } + if (rm_recursive(target) < 0) { + rc = set_err(err, "assemble: whiteout rm failed", errno); + break; + } + } + } + closedir(d); + return rc; +} + +static int assembly_walk_content(const char *raw_dir, + const char *stage_dir, + const char **err) +{ + DIR *d = opendir(raw_dir); + if (!d) + return set_err(err, "assemble: content opendir failed", errno); + struct dirent *de; + int rc = 0; + while ((de = readdir(d))) { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + if (strcmp(de->d_name, UN_RAW_META_SIDECAR) == 0) + continue; + if (is_whiteout_name(de->d_name)) + continue; + char raw_child[UN_PATH_MAX]; + char stage_child[UN_PATH_MAX]; + int n1 = snprintf(raw_child, sizeof(raw_child), "%s/%s", raw_dir, + de->d_name); + int n2 = snprintf(stage_child, sizeof(stage_child), "%s/%s", stage_dir, + de->d_name); + if (n1 < 0 || (size_t) n1 >= sizeof(raw_child) || n2 < 0 || + (size_t) n2 >= sizeof(stage_child)) { + rc 
= set_err(err, "assemble: content path overflow", ENAMETOOLONG); + break; + } + struct stat raw_st; + if (lstat(raw_child, &raw_st) < 0) { + rc = set_err(err, "assemble: content lstat failed", errno); + break; + } + if (S_ISDIR(raw_st.st_mode)) { + struct stat dst; + if (lstat(stage_child, &dst) < 0) { + if (errno != ENOENT) { + rc = set_err(err, "assemble: stage dir stat failed", errno); + break; + } + if (mkdir(stage_child, 0755) < 0) { + rc = set_err(err, "assemble: stage mkdir failed", errno); + break; + } + } else if (!S_ISDIR(dst.st_mode)) { + /* Lower-layer non-dir collides with this layer's dir. + * Overlay semantics: this layer's dir wins. unlink the + * lower entry and create the dir. + */ + if (unlink(stage_child) < 0) { + rc = set_err(err, "assemble: stage unlink-for-dir failed", + errno); + break; + } + if (mkdir(stage_child, 0755) < 0) { + rc = set_err(err, "assemble: stage mkdir-replace failed", + errno); + break; + } + } + if (assembly_walk_content(raw_child, stage_child, err) < 0) { + rc = -1; + break; + } + continue; + } + /* Regular file or symlink (or any other non-directory): unlink + * any existing destination then copyfile with COPYFILE_CLONE + * so APFS COW keeps the byte cost flat when raw cache and + * stage share a volume, and falls back to a byte copy when + * they do not (default elfuse layout puts the store on the + * root volume and the stage on a sparsebundle, so the EXDEV + * fallback is the steady-state path for fresh unpacks until + * the layouts are unified). Per D8, hardlink relationships + * from the tar are not reconstructed (each copyfile produces + * an independent inode). 
+ */ + struct stat dst; + if (lstat(stage_child, &dst) == 0) { + if (rm_recursive(stage_child) < 0) { + rc = set_err(err, "assemble: unlink dst failed", errno); + break; + } + } else if (errno != ENOENT) { + rc = set_err(err, "assemble: dst lstat failed", errno); + break; + } + if (copyfile(raw_child, stage_child, NULL, + COPYFILE_CLONE | COPYFILE_ALL) < 0) { + rc = set_err(err, "assemble: copyfile failed", errno); + break; + } + } + closedir(d); + return rc; +} + +int oci_unpack_assemble_layer(const char *raw_dir, + const char *stage_dir, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!raw_dir || !stage_dir) + return set_err(err, "assemble: NULL argument", EINVAL); + struct stat st; + if (lstat(raw_dir, &st) < 0 || !S_ISDIR(st.st_mode)) + return set_err(err, "assemble: raw_dir is not a directory", ENOTDIR); + if (lstat(stage_dir, &st) < 0 || !S_ISDIR(st.st_mode)) + return set_err(err, "assemble: stage_dir is not a directory", ENOTDIR); + if (assembly_walk_whiteouts(raw_dir, stage_dir, err) < 0) + return -1; + if (assembly_walk_content(raw_dir, stage_dir, err) < 0) + return -1; + return 0; +} + +/* Resolve the manifest digest for ref: prefer ref->digest_str when + * present, else read the pin file via oci_store_get_ref. + */ +static int resolve_manifest_digest(oci_store_t *store, + const oci_ref_t *ref, + char *out_str, + size_t out_cap, + const char **err) +{ + if (ref->digest && ref->digest[0]) { + if (strlen(ref->digest) + 1 > out_cap) + return set_err(err, "unpack: digest string overflow", ENAMETOOLONG); + memcpy(out_str, ref->digest, strlen(ref->digest) + 1); + return 0; + } + char *pin = NULL; + const char *perr = NULL; + if (oci_store_get_ref(store, ref, &pin, &perr) < 0) { + if (errno == ENOENT) + return set_err( + err, "unpack: tag pin missing; run 'elfuse oci pull' first", + ENOENT); + return set_err(err, perr ? perr : "unpack: tag pin read failed", + errno ? 
errno : EIO); + } + if (strlen(pin) + 1 > out_cap) { + free(pin); + return set_err(err, "unpack: pin string overflow", ENAMETOOLONG); + } + memcpy(out_str, pin, strlen(pin) + 1); + free(pin); + return 0; +} + +int oci_unpack(oci_store_t *store, + const oci_ref_t *ref, + const oci_unpack_options_t *opts, + char **out_image_dir, + const char **err) +{ + static const char *dummy_err; + if (!err) + err = &dummy_err; + *err = NULL; + if (!store || !ref || !out_image_dir) + return set_err(err, "unpack: NULL argument", EINVAL); + *out_image_dir = NULL; + + bool quiet = opts && opts->quiet; + bool force = opts && opts->force_relayer; + + /* Resolve / provision the sysroot volume. */ + char *volume_root = NULL; + if (oci_volume_ensure(opts ? opts->volume_root : NULL, &volume_root, err) < + 0) + return -1; + + /* Ensure images/ and images/.staging/ exist. */ + char *images_dir = NULL; + char *staging_dir = NULL; + if (oci_volume_subdir(volume_root, "images", &images_dir, err) < 0) + goto fail_volume; + if (oci_volume_subdir(volume_root, "images/.staging", &staging_dir, err) < + 0) + goto fail_images; + + /* Resolve the manifest digest. */ + char manifest_digest[OCI_DIGEST_HEX_MAX + 16]; + if (resolve_manifest_digest(store, ref, manifest_digest, + sizeof(manifest_digest), err) < 0) + goto fail_staging; + + oci_digest_algo_t algo; + char manifest_hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(manifest_digest, &algo, manifest_hex)) + return set_err(err, "unpack: manifest digest parse failed", EINVAL); + + /* Read the manifest blob. If it is an image-index, pick linux/arm64 + * and re-read the sub-manifest. 
+ */ + oci_blob_store_t *bs = oci_store_blobs(store); + if (!bs) { + set_err(err, "unpack: blob store unavailable", EIO); + goto fail_staging; + } + uint8_t *body = NULL; + size_t body_len = 0; + if (read_blob(bs, algo, manifest_hex, &body, &body_len, err) < 0) + goto fail_staging; + + oci_manifest_t manifest = {0}; + oci_index_t index = {0}; + const char *perr = NULL; + char *image_hex = NULL; + /* Try manifest first; if it fails, try index. */ + if (oci_manifest_parse((const char *) body, body_len, &manifest, &perr) < + 0) { + memset(&manifest, 0, sizeof(manifest)); + if (oci_index_parse((const char *) body, body_len, &index, &perr) < 0) { + set_err(err, perr ? perr : "unpack: manifest parse failed", EINVAL); + free(body); + goto fail_staging; + } + free(body); + const oci_index_entry_t *pick = oci_index_pick_linux_arm64(&index); + if (!pick) { + oci_index_free(&index); + set_err(err, "unpack: no linux/arm64 entry in image index", ENOENT); + goto fail_staging; + } + char sub_digest[OCI_DIGEST_HEX_MAX + 16]; + if (strlen(pick->desc.digest_str) >= sizeof(sub_digest)) { + oci_index_free(&index); + set_err(err, "unpack: sub-manifest digest overflow", ENAMETOOLONG); + goto fail_staging; + } + memcpy(sub_digest, pick->desc.digest_str, + strlen(pick->desc.digest_str) + 1); + oci_digest_algo_t sub_algo; + char sub_hex[OCI_DIGEST_HEX_MAX + 1]; + if (!oci_digest_parse(sub_digest, &sub_algo, sub_hex)) { + oci_index_free(&index); + set_err(err, "unpack: sub-manifest digest parse failed", EINVAL); + goto fail_staging; + } + oci_index_free(&index); + if (read_blob(bs, sub_algo, sub_hex, &body, &body_len, err) < 0) + goto fail_staging; + if (oci_manifest_parse((const char *) body, body_len, &manifest, + &perr) < 0) { + set_err(err, perr ? 
perr : "unpack: sub-manifest parse failed", + EINVAL); + free(body); + goto fail_staging; + } + image_hex = strdup(sub_hex); + } else { + image_hex = strdup(manifest_hex); + } + free(body); + + if (!image_hex) { + oci_manifest_free(&manifest); + set_err(err, "unpack: image hex strdup failed", ENOMEM); + goto fail_staging; + } + + /* Final target: /images/sha256-/. The directory has + * '-' instead of ':' to keep the path filesystem-friendly. + */ + char final_dir[UN_PATH_MAX]; + if ((size_t) snprintf(final_dir, sizeof(final_dir), "%s/sha256-%s", + images_dir, image_hex) >= sizeof(final_dir)) { + free(image_hex); + oci_manifest_free(&manifest); + set_err(err, "unpack: final dir overflow", ENAMETOOLONG); + goto fail_staging; + } + + struct stat st; + if (lstat(final_dir, &st) == 0 && !force) { + /* Idempotent rerun: image sysroot already exists. */ + free(image_hex); + oci_manifest_free(&manifest); + size_t want = strlen(final_dir) + 2; + char *dup = malloc(want); + if (!dup) { + set_err(err, "unpack: strdup final path failed", ENOMEM); + goto fail_staging; + } + snprintf(dup, want, "%s/", final_dir); + *out_image_dir = dup; + free(staging_dir); + free(images_dir); + free(volume_root); + return 0; + } + if (force) { + /* Remove any prior commit so the staging rename does not race. 
*/ + char rm[UN_PATH_MAX]; + snprintf(rm, sizeof(rm), "rm -rf '%s'", final_dir); + (void) system(rm); + } + + /* Stage under /images/.staging// */ + char stage_id[13]; + if (rand_hex(stage_id, 12) < 0) { + free(image_hex); + oci_manifest_free(&manifest); + set_err(err, "unpack: getentropy failed", errno); + goto fail_staging; + } + char stage_dir[UN_PATH_MAX]; + if ((size_t) snprintf(stage_dir, sizeof(stage_dir), "%s/%s", staging_dir, + stage_id) >= sizeof(stage_dir)) { + free(image_hex); + oci_manifest_free(&manifest); + set_err(err, "unpack: stage dir overflow", ENAMETOOLONG); + goto fail_staging; + } + if (mkdir_p(stage_dir) < 0) { + free(image_hex); + oci_manifest_free(&manifest); + set_err(err, "unpack: mkdir stage failed", errno); + goto fail_staging; + } + + if (!quiet) + fprintf(stderr, "elfuse oci unpack: applying %zu layer(s)\n", + manifest.nlayers); + + /* Read + parse the image-config blob up-front so per-layer diff_ids + * are available to the cache hook in oci_unpack_layer. The Plan 1 + * origin sidecar still consumes the same struct later in this + * function, so the read happens exactly once. + */ + oci_image_config_t cfg = {0}; + { + uint8_t *cfg_body = NULL; + size_t cfg_len = 0; + if (read_blob(bs, manifest.config.algo, manifest.config.hex, &cfg_body, + &cfg_len, err) < 0) { + free(image_hex); + oci_manifest_free(&manifest); + goto fail_stage_dir; + } + const char *cparse_err = NULL; + if (oci_image_config_parse((const char *) cfg_body, cfg_len, &cfg, + &cparse_err) < 0) { + set_err( + err, + cparse_err ? cparse_err : "unpack: image config parse failed", + EINVAL); + free(cfg_body); + free(image_hex); + oci_manifest_free(&manifest); + goto fail_stage_dir; + } + free(cfg_body); + } + + /* Validate that diff_ids[] length matches manifest.layers[] length. A + * mismatch is a malformed image (the OCI image-spec mandates one + * diff_id per layer in order); fail-fast so the cache never associates + * a diff_id with the wrong layer payload. 
+ */ + size_t diff_ids_count = 0; + if (cfg.rootfs_diff_ids) + while (cfg.rootfs_diff_ids[diff_ids_count]) + diff_ids_count++; + if (diff_ids_count != manifest.nlayers) { + set_err(err, "unpack: image config rootfs.diff_ids count mismatch", + EINVAL); + oci_image_config_free(&cfg); + free(image_hex); + oci_manifest_free(&manifest); + goto fail_stage_dir; + } + + /* C3.3c-ii orchestrator state. cum_meta accumulates the running + * cumulative meta table (uid/gid/mode per guest path); layer_meta + * is reset to a fresh table at the start of every loop iteration; + * chains holds the precomputed OCI ChainID strings for every + * layer so the stack-cache prefix search is one stat(2) per layer. + */ + oci_meta_table_t *cum_meta = NULL; + oci_meta_table_t *layer_meta = NULL; + char (*chains)[OCI_DIGEST_HEX_MAX + 16] = NULL; + + cum_meta = oci_meta_table_new(); + if (!cum_meta) { + set_err(err, "unpack: meta table alloc failed", ENOMEM); + goto fail_orch; + } + + if (manifest.nlayers > 0) { + chains = malloc(manifest.nlayers * sizeof(*chains)); + if (!chains) { + set_err(err, "unpack: chain array alloc failed", ENOMEM); + goto fail_orch; + } + const char *prev = NULL; + for (size_t i = 0; i < manifest.nlayers; i++) { + if (oci_chainid_compute(prev, cfg.rootfs_diff_ids[i], chains[i], + sizeof(chains[i])) < 0) { + set_err(err, "unpack: chain compute failed", + errno ? errno : EINVAL); + goto fail_orch; + } + prev = chains[i]; + } + } + + /* Search the stack cache backwards for the longest matching prefix + * snapshot. On hit, clonefile-restore the assembled stage_dir + * straight from cache and continue with the trailing layers only. + * No hit -> stage_dir stays at the empty mkdir_p state and the + * orchestrator iterates over every layer. 
+ */ + size_t start_i = 0; + for (size_t k = manifest.nlayers; k-- > 0;) { + int hit = oci_store_stack_has(store, chains[k]); + if (hit < 0) { + set_err(err, "unpack: stack lookup failed", errno); + goto fail_orch; + } + if (hit != 1) + continue; + char stack_dir[UN_PATH_MAX]; + if (oci_store_stack_resolve(store, chains[k], stack_dir, + sizeof(stack_dir)) < 0) { + set_err(err, "unpack: stack resolve failed", errno); + goto fail_orch; + } + size_t sl = strlen(stack_dir); + if (sl > 0 && stack_dir[sl - 1] == '/') + stack_dir[sl - 1] = '\0'; + /* copyfile with COPYFILE_CLONE prefers an APFS clone (cheap + * COW) and falls back to a recursive byte copy on EXDEV, so + * stack restore works whether the store and stage share a + * volume or not. COPYFILE_CLONE implies an exclusive + * destination; the rm_recursive above prepares an absent + * target for both code paths. + */ + if (rm_recursive(stage_dir) < 0) { + set_err(err, "unpack: stage rm-for-stack failed", errno); + goto fail_orch; + } + if (copyfile(stack_dir, stage_dir, NULL, + COPYFILE_CLONE | COPYFILE_RECURSIVE | COPYFILE_NOFOLLOW | + COPYFILE_ALL) < 0) { + int saved = errno; + set_err(err, "unpack: stack restore copyfile failed", saved); + goto fail_orch; + } + /* Re-load the cumulative meta sidecar the stack snapshot + * persisted so trailing layers accumulate on top. A missing + * sidecar (older snapshot) is benign: cum_meta stays empty. + */ + oci_meta_table_t *restored = NULL; + const char *merr = NULL; + if (oci_meta_read(stage_dir, &restored, &merr) < 0) { + if (errno != ENOENT) { + set_err(err, merr ? 
merr : "unpack: stack meta read failed", + errno); + goto fail_orch; + } + errno = 0; + } else { + int mrc = oci_meta_merge(cum_meta, restored); + int saved = errno; + oci_meta_table_free(restored); + if (mrc < 0) { + set_err(err, "unpack: stack meta merge failed", saved); + goto fail_orch; + } + } + start_i = k + 1; + if (!quiet) + fprintf(stderr, "elfuse oci unpack: stack hit at chain %zu/%zu\n", + start_i, manifest.nlayers); + break; + } + + if (!quiet && manifest.nlayers > 0) + fprintf(stderr, + "elfuse oci unpack: applying %zu layer(s) (cache start %zu)\n", + manifest.nlayers - start_i, start_i); + + for (size_t i = start_i; i < manifest.nlayers; i++) { + char label[32]; + const char *log_label = NULL; + if (!quiet) { + snprintf(label, sizeof(label), "layer %zu", i + 1); + log_label = label; + } + + layer_meta = oci_meta_table_new(); + if (!layer_meta) { + set_err(err, "unpack: layer meta alloc failed", ENOMEM); + goto fail_orch; + } + + const char *diff_id = cfg.rootfs_diff_ids[i]; + char raw_cache_dir[UN_PATH_MAX]; + int raw_hit = oci_store_layer_has(store, diff_id); + if (raw_hit < 0) { + set_err(err, "unpack: raw cache lookup failed", errno); + goto fail_orch; + } + if (raw_hit == 1) { + if (oci_store_layer_resolve(store, diff_id, raw_cache_dir, + sizeof(raw_cache_dir)) < 0) { + set_err(err, "unpack: raw cache resolve failed", errno); + goto fail_orch; + } + size_t rl = strlen(raw_cache_dir); + if (rl > 0 && raw_cache_dir[rl - 1] == '/') + raw_cache_dir[rl - 1] = '\0'; + /* Load the per-layer sidecar so cum_meta picks up the + * uid/gid/mode entries the cache writer recorded at + * populate time. Missing sidecar is benign (older or + * hand-seeded entry). + */ + oci_meta_table_t *loaded = NULL; + const char *merr = NULL; + if (oci_meta_read_named(raw_cache_dir, UN_RAW_META_SIDECAR, &loaded, + &merr) < 0) { + if (errno != ENOENT) { + set_err(err, merr ? 
merr : "unpack: raw meta read failed", + errno); + goto fail_orch; + } + errno = 0; + } else { + oci_meta_table_free(layer_meta); + layer_meta = loaded; + } + if (log_label) + fprintf(stderr, " %s: %s (raw cached)\n", log_label, + manifest.layers[i].digest_str); + } else { + char raw_stage[UN_PATH_MAX]; + if (oci_store_layer_stage_path(store, diff_id, raw_stage, + sizeof(raw_stage)) < 0) { + set_err(err, "unpack: raw stage_path resolve failed", errno); + goto fail_orch; + } + if (mkdir(raw_stage, 0755) < 0) { + set_err(err, "unpack: raw stage mkdir failed", errno); + goto fail_orch; + } + if (oci_unpack_layer_raw(bs, &manifest.layers[i], raw_stage, NULL, + layer_meta, log_label, err) < 0) { + (void) rm_recursive(raw_stage); + goto fail_orch; + } + const char *mwerr = NULL; + if (oci_meta_write_named(layer_meta, raw_stage, UN_RAW_META_SIDECAR, + &mwerr) < 0) { + set_err(err, mwerr ? mwerr : "unpack: raw meta write failed", + errno); + (void) rm_recursive(raw_stage); + goto fail_orch; + } + const char *cerr = NULL; + if (oci_store_layer_commit(store, raw_stage, diff_id, &cerr) < 0) { + int saved = errno; + set_err(err, cerr ? cerr : "unpack: raw cache commit failed", + saved); + (void) rm_recursive(raw_stage); + goto fail_orch; + } + if (oci_store_layer_resolve(store, diff_id, raw_cache_dir, + sizeof(raw_cache_dir)) < 0) { + set_err(err, "unpack: raw cache resolve failed", errno); + goto fail_orch; + } + size_t rl = strlen(raw_cache_dir); + if (rl > 0 && raw_cache_dir[rl - 1] == '/') + raw_cache_dir[rl - 1] = '\0'; + } + + if (oci_unpack_assemble_layer(raw_cache_dir, stage_dir, err) < 0) + goto fail_orch; + + int mrc = oci_meta_merge(cum_meta, layer_meta); + int saved_errno = errno; + oci_meta_table_free(layer_meta); + layer_meta = NULL; + if (mrc < 0) { + set_err(err, "unpack: cum meta merge failed", saved_errno); + goto fail_orch; + } + + /* Snapshot stage_dir into the per-prefix stack cache so future + * unpacks sharing this chain prefix short-circuit. 
Failure here + * is fatal: silently degrading the cache would defeat the + * dedup path. + */ + if (oci_meta_write(cum_meta, stage_dir, err) < 0) + goto fail_orch; + char stack_stage[UN_PATH_MAX]; + if (oci_store_stack_stage_path(store, chains[i], stack_stage, + sizeof(stack_stage)) < 0) { + set_err(err, "unpack: stack stage_path resolve failed", errno); + goto fail_orch; + } + /* Same copyfile + COPYFILE_CLONE rationale as the stack + * restore: prefer APFS clone, fall back to recursive byte + * copy on EXDEV so the cache populates regardless of which + * volume holds the stage. + */ + if (copyfile(stage_dir, stack_stage, NULL, + COPYFILE_CLONE | COPYFILE_RECURSIVE | COPYFILE_NOFOLLOW | + COPYFILE_ALL) < 0) { + int saved = errno; + set_err(err, "unpack: stack snapshot copyfile failed", saved); + goto fail_orch; + } + const char *scerr = NULL; + if (oci_store_stack_commit(store, stack_stage, chains[i], &scerr) < 0) { + int saved = errno; + (void) rm_recursive(stack_stage); + set_err(err, scerr ? scerr : "unpack: stack commit failed", saved); + goto fail_orch; + } + } + + /* The per-iteration writes already produced an up-to-date sidecar + * on disk. On the full-stack-hit path (no iterations ran) the + * clonefile-restored stage_dir also already carries the snapshot's + * sidecar, so a final write would only re-emit identical bytes. + */ + oci_meta_table_free(cum_meta); + cum_meta = NULL; + free(chains); + chains = NULL; + + /* Origin sidecar: records manifest_digest + config_digest + diff_ids + * the Plan 1 keep-set walker reads. A failure here aborts the commit + * because a missing origin file would let prune silently delete layer + * blobs still backing this unpacked tree. 
+ */ + { + char manifest_full[OCI_DIGEST_HEX_MAX + 16]; + if ((size_t) snprintf(manifest_full, sizeof(manifest_full), "sha256:%s", + image_hex) >= sizeof(manifest_full)) { + oci_image_config_free(&cfg); + set_err(err, "unpack: manifest digest overflow", ENAMETOOLONG); + free(image_hex); + oci_manifest_free(&manifest); + goto fail_stage_dir; + } + + const char *origin_err = NULL; + if (oci_origin_write(stage_dir, manifest_full, + manifest.config.digest_str, cfg.rootfs_diff_ids, + &origin_err) < 0) { + set_err(err, + origin_err ? origin_err : "unpack: origin write failed", + errno ? errno : EIO); + oci_image_config_free(&cfg); + free(image_hex); + oci_manifest_free(&manifest); + goto fail_stage_dir; + } + } + oci_image_config_free(&cfg); + + oci_manifest_free(&manifest); + + /* Atomic commit. */ + if (rename(stage_dir, final_dir) < 0) { + set_err(err, "unpack: stage rename failed", errno); + free(image_hex); + goto fail_stage_dir; + } + free(image_hex); + + size_t want = strlen(final_dir) + 2; + char *dup = malloc(want); + if (!dup) { + set_err(err, "unpack: strdup final path failed", ENOMEM); + goto fail_staging; + } + snprintf(dup, want, "%s/", final_dir); + *out_image_dir = dup; + + free(staging_dir); + free(images_dir); + free(volume_root); + return 0; + +fail_orch: + oci_meta_table_free(layer_meta); + oci_meta_table_free(cum_meta); + free(chains); + oci_image_config_free(&cfg); + free(image_hex); + oci_manifest_free(&manifest); +fail_stage_dir: { + char rm[UN_PATH_MAX]; + snprintf(rm, sizeof(rm), "rm -rf '%s'", stage_dir); + (void) system(rm); +} +fail_staging: + free(staging_dir); +fail_images: + free(images_dir); +fail_volume: + free(volume_root); + return -1; +} diff --git a/src/oci/unpack.h b/src/oci/unpack.h new file mode 100644 index 0000000..084c350 --- /dev/null +++ b/src/oci/unpack.h @@ -0,0 +1,162 @@ +/* OCI layer unpack orchestrator + * + * Drives the full Phase 2 pipeline: resolve a ref to a manifest digest + * through oci_store, read the manifest 
from the blob store, walk its + * layers, re-verify each layer blob's digest, decompress, and apply + * via oci_layer_apply into a staging directory under the sysroot + * volume's images/.staging/ subtree. Successful unpack commits via + * atomic rename into images/sha256-<hex>/. + * + * Phase 3 will consume the resulting directory via `elfuse run IMAGE`. + * Phase 2 stops at producing the directory; the user wires it manually + * through `elfuse --sysroot <dir>`. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include <stdbool.h> + +#include "oci/blob-store.h" +#include "oci/layer-apply.h" +#include "oci/layer-meta.h" +#include "oci/manifest.h" +#include "oci/ref.h" +#include "oci/store.h" + +typedef struct { + const char *volume_root; /* NULL -> default sparse APFS volume */ + bool quiet; + bool force_relayer; +} oci_unpack_options_t; + +/* Apply one OCI layer's tar payload into stage_dir as a pure overlay + * extract. + * + * Re-verifies the compressed blob digest against desc, opens the blob + * via the running blob store, decompresses per desc->media_type, then + * drives oci_layer_apply (overlay mode) against stage_dir. stage_dir + * must already exist and be writable; the helper does not mkdir it. + * + * Whiteout and opaque tar entries are processed by oci_layer_apply + * with overlay semantics: ".wh.<name>" deletes upper-layer state in + * stage_dir; ".wh..wh..opq" clears the containing directory. This + * helper has no concern with caches; Plan 3 C3.3c moved all cache + * orchestration up into oci_unpack itself, which drives raw-tar + * populate via oci_unpack_layer_raw and assembles via + * oci_unpack_assemble_layer. + * + * Parameters: + * bs - blob store backing the layer payload. + * desc - layer descriptor; algo / hex / media_type / size used. + * stage_dir - destination directory, absolute path, no trailing '/'. + * stats - optional; per-layer counters are summed when non-NULL.
+ * meta - optional; tar uid/gid/mode entries recorded when non-NULL. + * log_label - optional; when non-NULL the helper prints + * "