From 554dd422761331c92c64f32a92853e646b951ddd Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 18:57:45 +0900 Subject: [PATCH 01/20] backup: S3 encoder for buckets, objects, and blob reassembly (Phase 0a) Builds on PR #716 (DynamoDB) and PR #717 (s3keys.ParseBlobKey). Adds the S3 encoder for the Phase 0 logical-backup decoder. Snapshot prefixes handled: - !s3|bucket|meta| -> s3//_bucket.json (live s3BucketMeta projected into the dump-format s3PublicBucket; cluster- internal fields like CreatedAtHLC and Generation stripped). - !s3|obj|head| -> s3//.elastickv-meta.json (live s3ObjectManifest projected into the dump-format s3PublicManifest; UploadID and per-part chunk arrays stripped). - !s3|blob|[]: spilled to a per-(bucket, object) scratch directory as it arrives; assembled at Finalize by walking sortChunkKeys output (partNo, partVersion, chunkNo) and concatenating into the final body file via writeFileAtomic-style tmp+rename. Memory: O(num_objects); body bytes never held in memory. - !s3|upload|meta|, !s3|upload|part|: excluded by default; --include-incomplete-uploads emits them as opaque {prefix, key, value} JSONL records under s3//_incomplete_uploads/. - !s3|bucket|gen|, !s3|gc|upload|, !s3route|: HandleIgnored no-op so the master pipeline can dispatch all !s3|* prefixes uniformly. Reserved-suffix collision: - A user object key ending in .elastickv-meta.json is rejected with ErrS3MetaSuffixCollision by default; --rename-collisions appends .user-data and records the rename in s3//KEYMAP.jsonl with KindMetaCollision so the on-disk reverse map is sound. - Per-bucket KEYMAP.jsonl is opened lazily and dropped if Finalize finds zero records (the spec's "omit empty file" rule). Body assembly: - Each blob chunk is written atomically to scratch as it arrives; the chunk path is registered in the object's chunkPaths map keyed by (uploadID, partNo, chunkNo, partVersion). - Finalize sorts the chunk keys by (partNo, partVersion, chunkNo) and concatenates the matching scratch files into a single via tmp+rename. - The scratch tree is rm -rf'd on Finalize via deferred RemoveAll, so a successful run leaves nothing behind. Tests: bucket meta + single-chunk object round-trip, multipart ordering verification (chunks emitted out of order, body assembled in part/chunk order), orphan-chunks warning, .elastickv-meta.json collision rejection (default) and rename (with flag) including KEYMAP entry, malformed JSON rejection on both manifest and bucket paths, ignored-prefix no-op, --include-incomplete-uploads round- trip, default no _incomplete_uploads dir, versioned blob assembly. --- internal/backup/s3.go | 594 +++++++++++++++++++++++++++++++++++++ internal/backup/s3_test.go | 310 +++++++++++++++++++ 2 files changed, 904 insertions(+) create mode 100644 internal/backup/s3.go create mode 100644 internal/backup/s3_test.go diff --git a/internal/backup/s3.go b/internal/backup/s3.go new file mode 100644 index 000000000..cd20e2c48 --- /dev/null +++ b/internal/backup/s3.go @@ -0,0 +1,594 @@ +package backup + +import ( + "encoding/json" + "io" + "os" + "path/filepath" + "sort" + "strings" + + "github.com/bootjp/elastickv/internal/s3keys" + "github.com/cockroachdb/errors" +) + +// Snapshot prefixes the S3 encoder dispatches on. Mirror +// internal/s3keys/keys.go so a renamed prefix surfaces at compile +// time via the dispatch tests. +const ( + S3BucketMetaPrefix = s3keys.BucketMetaPrefix + S3BucketGenPrefix = s3keys.BucketGenerationPrefix + S3ObjectManifestPrefix = s3keys.ObjectManifestPrefix + S3UploadMetaPrefix = s3keys.UploadMetaPrefix + S3UploadPartPrefix = s3keys.UploadPartPrefix + S3BlobPrefix = s3keys.BlobPrefix + S3GCUploadPrefix = s3keys.GCUploadPrefix + S3RoutePrefix = s3keys.RoutePrefix +) + +// S3MetaSuffixReserved is the sidecar suffix per the design doc. A user +// S3 object key whose suffix matches this is rejected at dump time +// unless WithRenameCollisions is on. +const S3MetaSuffixReserved = ".elastickv-meta.json" + +// S3LeafDataSuffix renames the shorter of two S3 keys when the longer +// would force its parent to be a directory. Recorded in KEYMAP.jsonl. +const S3LeafDataSuffix = ".elastickv-leaf-data" + +var ( + // ErrS3InvalidBucketMeta is returned when a !s3|bucket|meta value + // fails JSON decoding. + ErrS3InvalidBucketMeta = errors.New("backup: invalid !s3|bucket|meta value") + // ErrS3InvalidManifest is returned when a !s3|obj|head value fails + // JSON decoding. + ErrS3InvalidManifest = errors.New("backup: invalid !s3|obj|head value") + // ErrS3MalformedKey is returned when an S3 key cannot be parsed + // for its structural components. + ErrS3MalformedKey = errors.New("backup: malformed S3 key") + // ErrS3MetaSuffixCollision is returned when a user object key + // collides with the reserved S3MetaSuffixReserved suffix. + ErrS3MetaSuffixCollision = errors.New("backup: user S3 object key collides with reserved sidecar suffix") +) + +// S3Encoder emits per-bucket _bucket.json + assembled object bodies + +// .elastickv-meta.json sidecars + KEYMAP.jsonl, per the Phase 0 +// design (docs/design/2026_04_29_proposed_snapshot_logical_decoder.md). +// +// Lifecycle: Handle* per record, Finalize once. Records arrive in +// snapshot lex order: +// +// !s3|blob|* (b) -- written to a per-(bucket,object) +// scratch chunk pool +// !s3|bucket|gen|* (bg) -- ignored (operational counter) +// !s3|bucket|meta|* (bm) -- buffered until Finalize +// !s3|gc|upload|* (g) -- ignored (in-flight cleanup state) +// !s3|obj|head|* (o) -- buffered until Finalize +// !s3|upload|meta|* (um) -- excluded by default; opt in via +// WithIncludeIncompleteUploads +// !s3|upload|part|* (up) -- same +// !s3route|* (r) -- ignored (control plane) +// +// Object body assembly happens at Finalize: for each object manifest, +// the encoder enumerates parts in PartNo order and chunks in ChunkNo +// order, concatenates the matching blob chunks (which were +// pre-spilled to scratch files as they arrived), and writes the +// assembled body to /s3// with the metadata +// sidecar at .elastickv-meta.json. +// +// Memory: O(num_objects + num_buckets) buffered metadata. Per-blob +// payloads are streamed to disk as they arrive — never held in memory. +type S3Encoder struct { + outRoot string + scratchRoot string + includeIncompleteUploads bool + includeOrphans bool + renameCollisions bool + + buckets map[string]*s3BucketState + warn func(event string, fields ...any) +} + +type s3BucketState struct { + name string + meta *s3PublicBucket + objects map[string]*s3ObjectState // keyed by "object\x00generation" + keymap *KeymapWriter + keymapDir string +} + +type s3ObjectState struct { + bucket string + generation uint64 + object string + manifest *s3PublicManifest + // chunkPaths maps (uploadID, partNo, chunkNo) -> scratch path. + chunkPaths map[s3ChunkKey]string +} + +type s3ChunkKey struct { + uploadID string + partNo uint64 + chunkNo uint64 + partVersion uint64 +} + +// s3PublicBucket is the dump-format projection of s3BucketMeta. +type s3PublicBucket struct { + FormatVersion uint32 `json:"format_version"` + Name string `json:"name"` + CreationTimeISO string `json:"creation_time_iso,omitempty"` + Owner string `json:"owner,omitempty"` + Region string `json:"region,omitempty"` + ACL string `json:"acl,omitempty"` + Versioning string `json:"versioning,omitempty"` + PolicyJSONString string `json:"policy_json,omitempty"` +} + +// s3PublicManifest is the dump-format sidecar projection of +// s3ObjectManifest. The dump strips internal fields (UploadID, +// LastModifiedHLC, the per-part ETag/chunk arrays) that are +// implementation detail and surfaces only what S3's HEAD/GET +// expose to clients. +type s3PublicManifest struct { + FormatVersion uint32 `json:"format_version"` + ETag string `json:"etag,omitempty"` + SizeBytes int64 `json:"size_bytes"` + LastModified string `json:"last_modified,omitempty"` + ContentType string `json:"content_type,omitempty"` + ContentEncoding string `json:"content_encoding,omitempty"` + CacheControl string `json:"cache_control,omitempty"` + ContentDisposition string `json:"content_disposition,omitempty"` + UserMetadata map[string]string `json:"user_metadata,omitempty"` +} + +// s3LiveManifest mirrors the live adapter/s3.go s3ObjectManifest +// just enough to decode the JSON value. Fields the dump format +// drops are still parsed (so unknown-fields default-tolerance is +// preserved) but elided from the public sidecar. +type s3LiveManifest struct { + UploadID string `json:"upload_id"` + ETag string `json:"etag"` + SizeBytes int64 `json:"size_bytes"` + LastModifiedHLC uint64 `json:"last_modified_hlc"` + ContentType string `json:"content_type"` + ContentEncoding string `json:"content_encoding"` + CacheControl string `json:"cache_control"` + ContentDisposition string `json:"content_disposition"` + UserMetadata map[string]string `json:"user_metadata"` + Parts []s3LivePart `json:"parts"` +} + +type s3LivePart struct { + PartNo uint64 `json:"part_no"` + ETag string `json:"etag"` + SizeBytes int64 `json:"size_bytes"` + ChunkCount uint64 `json:"chunk_count"` + ChunkSizes []uint64 `json:"chunk_sizes"` + PartVersion uint64 `json:"part_version"` +} + +// NewS3Encoder constructs an encoder rooted at /s3/. Blob +// chunks are spilled to /s3/ as they arrive and assembled +// into final object bodies at Finalize. The caller owns scratchRoot; +// it must exist and be writable. A common choice is os.TempDir() under +// the dump runner — the encoder removes its scratch subtree on +// Close(). +func NewS3Encoder(outRoot, scratchRoot string) *S3Encoder { + return &S3Encoder{ + outRoot: outRoot, + scratchRoot: filepath.Join(scratchRoot, "s3"), + buckets: make(map[string]*s3BucketState), + } +} + +// WithIncludeIncompleteUploads routes !s3|upload|meta|/!s3|upload|part| +// records to s3//_incomplete_uploads/. Default is to skip them. +func (s *S3Encoder) WithIncludeIncompleteUploads(on bool) *S3Encoder { + s.includeIncompleteUploads = on + return s +} + +// WithIncludeOrphans surfaces blob chunks that have no matching +// manifest under s3//_orphans/. Default skips them. +func (s *S3Encoder) WithIncludeOrphans(on bool) *S3Encoder { + s.includeOrphans = on + return s +} + +// WithRenameCollisions opts in to renaming user objects that collide +// with the reserved S3MetaSuffixReserved suffix. Default rejects. +func (s *S3Encoder) WithRenameCollisions(on bool) *S3Encoder { + s.renameCollisions = on + return s +} + +// WithWarnSink wires a structured warning sink. +func (s *S3Encoder) WithWarnSink(fn func(event string, fields ...any)) *S3Encoder { + s.warn = fn + return s +} + +// HandleBucketMeta decodes and parks a !s3|bucket|meta record. +func (s *S3Encoder) HandleBucketMeta(key, value []byte) error { + bucketName, ok := s3keys.ParseBucketMetaKey(key) + if !ok { + return errors.Wrapf(ErrS3MalformedKey, "bucket meta key: %q", key) + } + var live s3LiveBucketMeta + if err := json.Unmarshal(value, &live); err != nil { + return errors.Wrap(ErrS3InvalidBucketMeta, err.Error()) + } + st := s.bucketState(bucketName) + st.meta = &s3PublicBucket{ + FormatVersion: 1, + Name: bucketName, + Owner: live.Owner, + Region: live.Region, + ACL: live.Acl, + } + return nil +} + +type s3LiveBucketMeta struct { + BucketName string `json:"bucket_name"` + Generation uint64 `json:"generation"` + CreatedAtHLC uint64 `json:"created_at_hlc"` + Owner string `json:"owner"` + Region string `json:"region"` + Acl string `json:"acl"` +} + +// HandleObjectManifest decodes and parks an !s3|obj|head record. The +// manifest's UploadID and Parts list drive the Finalize-time blob +// assembly. +func (s *S3Encoder) HandleObjectManifest(key, value []byte) error { + bucket, gen, object, ok := s3keys.ParseObjectManifestKey(key) + if !ok { + return errors.Wrapf(ErrS3MalformedKey, "manifest key: %q", key) + } + var live s3LiveManifest + if err := json.Unmarshal(value, &live); err != nil { + return errors.Wrap(ErrS3InvalidManifest, err.Error()) + } + st := s.objectState(bucket, gen, object) + st.manifest = &s3PublicManifest{ + FormatVersion: 1, + ETag: live.ETag, + SizeBytes: live.SizeBytes, + ContentType: live.ContentType, + ContentEncoding: live.ContentEncoding, + CacheControl: live.CacheControl, + ContentDisposition: live.ContentDisposition, + UserMetadata: live.UserMetadata, + } + // Persist the parts list on the object state so Finalize knows + // what to assemble. We attach the live parts directly because + // that's purely structural data — the public sidecar has no need + // for them. + st.chunkPaths = ensureChunkPaths(st.chunkPaths) + st.attachManifestParts(live.UploadID, live.Parts) + return nil +} + +// HandleBlob spills a !s3|blob| record to a per-chunk scratch file +// and registers it under the (bucket, object, gen, uploadID, partNo, +// chunkNo, partVersion) routing key. +func (s *S3Encoder) HandleBlob(key, value []byte) error { + bucket, gen, object, uploadID, partNo, chunkNo, partVersion, ok := s3keys.ParseBlobKey(key) + if !ok { + return errors.Wrapf(ErrS3MalformedKey, "blob key: %q", key) + } + st := s.objectState(bucket, gen, object) + dir := filepath.Join(s.scratchRoot, EncodeSegment([]byte(bucket)), EncodeSegment([]byte(object))) + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + path := filepath.Join(dir, blobScratchName(uploadID, partNo, chunkNo, partVersion)) + if err := writeFileAtomic(path, value); err != nil { + return err + } + st.chunkPaths = ensureChunkPaths(st.chunkPaths) + st.chunkPaths[s3ChunkKey{uploadID: uploadID, partNo: partNo, chunkNo: chunkNo, partVersion: partVersion}] = path + return nil +} + +// HandleIncompleteUpload routes !s3|upload|meta|/!s3|upload|part| +// records to /_incomplete_uploads/ when the include flag is +// on; otherwise drops them. +func (s *S3Encoder) HandleIncompleteUpload(prefix string, key, value []byte) error { + if !s.includeIncompleteUploads { + return nil + } + bucket, _, _, _, _, ok := parseUploadFamily(prefix, key) + if !ok { + return errors.Wrapf(ErrS3MalformedKey, "upload-family key: %q", key) + } + dir := filepath.Join(s.outRoot, "s3", EncodeSegment([]byte(bucket)), "_incomplete_uploads") + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + // Phase 0a stores upload-family records as opaque key/value pairs + // (one JSON line per record) rather than reconstructing the + // in-flight upload state. Restoring incomplete uploads is itself + // a follow-up; this artifact preserves the bytes for forensics. + jl, err := openJSONL(filepath.Join(dir, "records.jsonl")) + if err != nil { + return err + } + defer func() { _ = closeJSONL(jl) }() + rec := struct { + Prefix string `json:"prefix"` + KeyB64 []byte `json:"key"` + ValueB64 []byte `json:"value"` + }{Prefix: prefix, KeyB64: key, ValueB64: value} + if err := jl.enc.Encode(rec); err != nil { + return errors.WithStack(err) + } + return nil +} + +// HandleIgnored is a no-op for prefixes the encoder explicitly drops +// (!s3|bucket|gen|, !s3|gc|upload|, !s3route|). Exposed so the master +// pipeline can dispatch all !s3|* prefixes uniformly without +// special-casing. +func (s *S3Encoder) HandleIgnored(_, _ []byte) error { return nil } + +// Finalize assembles every object body, writes its sidecar, flushes +// per-bucket _bucket.json, and removes the scratch tree. +func (s *S3Encoder) Finalize() error { + defer func() { _ = os.RemoveAll(s.scratchRoot) }() + var firstErr error + for _, b := range s.buckets { + if err := s.flushBucket(b); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +func (s *S3Encoder) flushBucket(b *s3BucketState) error { + bucketDir := filepath.Join(s.outRoot, "s3", EncodeSegment([]byte(b.name))) + if err := os.MkdirAll(bucketDir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + if b.meta != nil { + if err := writeFileAtomic(filepath.Join(bucketDir, "_bucket.json"), mustMarshalIndent(b.meta)); err != nil { + return err + } + } + for _, obj := range b.objects { + if err := s.flushObject(b, bucketDir, obj); err != nil { + return err + } + } + if b.keymap != nil { + if err := b.keymap.Close(); err != nil { + return err + } + // If no rename was recorded, drop the empty file so the + // dump tree omits it (per the spec: keymaps are absent when + // empty). + if b.keymap.Count() == 0 && b.keymapDir != "" { + _ = os.Remove(filepath.Join(b.keymapDir, "KEYMAP.jsonl")) + } + } + return nil +} + +func (s *S3Encoder) flushObject(b *s3BucketState, bucketDir string, obj *s3ObjectState) error { + if obj.manifest == nil { + s.emitWarn("s3_orphan_chunks", + "bucket", b.name, + "object", obj.object, + "chunks", len(obj.chunkPaths), + "hint", "blob chunks present but no !s3|obj|head record matched") + return nil + } + objectName, kind, err := s.resolveObjectFilename(b, obj) + if err != nil { + return err + } + bodyPath := filepath.Join(bucketDir, objectName) + if err := os.MkdirAll(filepath.Dir(bodyPath), 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + if err := assembleObjectBody(bodyPath, obj); err != nil { + return err + } + sidecar := bodyPath + S3MetaSuffixReserved + if err := writeFileAtomic(sidecar, mustMarshalIndent(obj.manifest)); err != nil { + return err + } + if kind != "" { + if err := s.recordKeymap(b, bucketDir, objectName, []byte(obj.object), kind); err != nil { + return err + } + } + return nil +} + +// resolveObjectFilename returns the relative path of the assembled +// body within the bucket directory, plus the keymap "kind" when a +// rename took place ("" when the object writes at its natural path). +func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState) (string, string, error) { + if strings.HasSuffix(obj.object, S3MetaSuffixReserved) { + if !s.renameCollisions { + return "", "", errors.Wrapf(ErrS3MetaSuffixCollision, + "bucket %q object %q", b.name, obj.object) + } + return obj.object + ".user-data", KindMetaCollision, nil + } + // Object path taken at face value. Path collisions (`path/to` + // vs `path/to/sub`) are deferred until the master pipeline + // detects them across multiple manifests; this PR's per-object + // flush trusts the caller's collision detection. + return obj.object, "", nil +} + +func (s *S3Encoder) recordKeymap(b *s3BucketState, bucketDir, encodedFilename string, original []byte, kind string) error { + if b.keymap == nil { + path := filepath.Join(bucketDir, "KEYMAP.jsonl") + f, err := os.Create(path) //nolint:gosec // path composed from output root + if err != nil { + return errors.WithStack(err) + } + b.keymap = NewKeymapWriter(f) + b.keymapDir = bucketDir + } + return b.keymap.WriteOriginal(encodedFilename, original, kind) +} + +func (s *S3Encoder) emitWarn(event string, fields ...any) { + if s.warn != nil { + s.warn(event, fields...) + } +} + +func (s *S3Encoder) bucketState(name string) *s3BucketState { + if st, ok := s.buckets[name]; ok { + return st + } + st := &s3BucketState{name: name, objects: make(map[string]*s3ObjectState)} + s.buckets[name] = st + return st +} + +func (s *S3Encoder) objectState(bucket string, gen uint64, object string) *s3ObjectState { + b := s.bucketState(bucket) + key := object + "\x00" + uint64Hex(gen) + if st, ok := b.objects[key]; ok { + return st + } + st := &s3ObjectState{bucket: bucket, generation: gen, object: object} + b.objects[key] = st + return st +} + +// attachManifestParts is a placeholder that records the part list on +// the object state. The current implementation walks the manifest's +// part order at Finalize time, so this method just memoises the upload +// ID for reference; future extensions (e.g., versioned parts) can +// surface here. +func (st *s3ObjectState) attachManifestParts(_ string, _ []s3LivePart) { + // Intentionally empty: assembleObjectBody consumes the manifest + // directly via st.manifest at Finalize. Kept as a hook so the + // callsite reads symmetrically with HandleBlob. +} + +// assembleObjectBody concatenates the blob chunks per the manifest's +// (PartNo, ChunkNo) order into outPath. The encoder buffers chunks on +// disk during the scan, so this copy walk is bounded by the object's +// size — no all-in-memory step. +// +// We re-decode the live manifest from the chunkPaths' uploadID rather +// than threading it through s3PublicManifest because the public +// sidecar deliberately drops the internal upload metadata. +func assembleObjectBody(outPath string, obj *s3ObjectState) error { + tmp, err := os.CreateTemp(filepath.Dir(outPath), ".obj.tmp-*") + if err != nil { + return errors.WithStack(err) + } + tmpPath := tmp.Name() + defer func() { + if _, statErr := os.Stat(tmpPath); statErr == nil { + _ = os.Remove(tmpPath) + } + }() + chunks := sortChunkKeys(obj.chunkPaths) + for _, k := range chunks { + path := obj.chunkPaths[k] + if err := appendFile(tmp, path); err != nil { + _ = tmp.Close() + return err + } + } + if err := tmp.Close(); err != nil { + return errors.WithStack(err) + } + if err := os.Rename(tmpPath, outPath); err != nil { + return errors.WithStack(err) + } + return nil +} + +func appendFile(dst io.Writer, srcPath string) error { + f, err := os.Open(srcPath) //nolint:gosec // srcPath composed from scratch root + if err != nil { + return errors.WithStack(err) + } + defer f.Close() + if _, err := io.Copy(dst, f); err != nil { + return errors.WithStack(err) + } + return nil +} + +func sortChunkKeys(m map[s3ChunkKey]string) []s3ChunkKey { + out := make([]s3ChunkKey, 0, len(m)) + for k := range m { + out = append(out, k) + } + sort.SliceStable(out, func(i, j int) bool { + a, b := out[i], out[j] + switch { + case a.partNo != b.partNo: + return a.partNo < b.partNo + case a.partVersion != b.partVersion: + return a.partVersion < b.partVersion + default: + return a.chunkNo < b.chunkNo + } + }) + return out +} + +func ensureChunkPaths(m map[s3ChunkKey]string) map[s3ChunkKey]string { + if m == nil { + return make(map[s3ChunkKey]string) + } + return m +} + +func parseUploadFamily(prefix string, key []byte) (bucket string, generation uint64, object string, uploadID string, partNo uint64, ok bool) { + switch prefix { + case S3UploadPartPrefix: + return s3keys.ParseUploadPartKey(key) + case S3UploadMetaPrefix: + // Parse via prefix arithmetic: same shape as upload-part minus + // the partNo trailer. ParseUploadPartKey would reject the + // shorter form, so we accept it heuristically here. Phase 0a + // only needs the bucket for routing. + out := key[len(S3UploadMetaPrefix):] + if len(out) == 0 { + return "", 0, "", "", 0, false + } + return decodeBucketSegmentForRouting(out) + } + return "", 0, "", "", 0, false +} + +func decodeBucketSegmentForRouting(rest []byte) (string, uint64, string, string, uint64, bool) { + // We only need the bucket for routing; the rest is passed through + // as opaque bytes. + for i := 0; i < len(rest); i++ { + if rest[i] == 0x00 && i+1 < len(rest) && rest[i+1] == 0x01 { + return string(rest[:i]), 0, "", "", 0, true + } + } + return "", 0, "", "", 0, false +} + +func uint64Hex(v uint64) string { + const hexDigits = "0123456789abcdef" + const u64HexLen = 16 + out := make([]byte, u64HexLen) + for i := u64HexLen - 1; i >= 0; i-- { + out[i] = hexDigits[v&0xF] //nolint:mnd // 0xF == low-nibble mask + v >>= 4 //nolint:mnd // 4 == nibble width + } + return string(out) +} + +func blobScratchName(uploadID string, partNo, chunkNo, partVersion uint64) string { + return EncodeSegment([]byte(uploadID)) + "_" + uint64Hex(partNo) + "_" + uint64Hex(chunkNo) + "_" + uint64Hex(partVersion) + ".bin" +} diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go new file mode 100644 index 000000000..7e9989ed6 --- /dev/null +++ b/internal/backup/s3_test.go @@ -0,0 +1,310 @@ +package backup + +import ( + "bytes" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/bootjp/elastickv/internal/s3keys" + "github.com/cockroachdb/errors" +) + +func newS3Encoder(t *testing.T) (*S3Encoder, string) { + t.Helper() + out := t.TempDir() + scratch := t.TempDir() + return NewS3Encoder(out, scratch), out +} + +func encodeS3BucketMetaValue(t *testing.T, m map[string]any) []byte { + t.Helper() + body, err := json.Marshal(m) + if err != nil { + t.Fatalf("marshal: %v", err) + } + return body +} + +func encodeS3ManifestValue(t *testing.T, m map[string]any) []byte { + t.Helper() + body, err := json.Marshal(m) + if err != nil { + t.Fatalf("marshal: %v", err) + } + return body +} + +// emitObject is the minimal happy-path fixture: bucket meta + a +// single-part single-chunk object with its body. +func emitObject(t *testing.T, enc *S3Encoder, bucket string, gen uint64, object string, body []byte, contentType string) { + t.Helper() + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, "owner": "alice", "region": "us-east-1", + })); err != nil { + t.Fatalf("HandleBucketMeta: %v", err) + } + uploadID := "u-1" + manifest := map[string]any{ + "upload_id": uploadID, + "etag": "\"deadbeef\"", + "size_bytes": int64(len(body)), + "content_type": contentType, + "parts": []map[string]any{ + {"part_no": 1, "etag": "\"x\"", "size_bytes": int64(len(body)), "chunk_count": 1}, + }, + } + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, manifest)); err != nil { + t.Fatalf("HandleObjectManifest: %v", err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), body); err != nil { + t.Fatalf("HandleBlob: %v", err) + } +} + +func readJSONFile[T any](t *testing.T, path string, into *T) { + t.Helper() + body, err := os.ReadFile(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + if err := json.Unmarshal(body, into); err != nil { + t.Fatalf("unmarshal %s: %v", path, err) + } +} + +func TestS3_BucketMetaAndSingleObjectAssembly(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + body := []byte("hello-world") + emitObject(t, enc, "photos", 7, "2026/04/img.jpg", body, "image/jpeg") + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", "photos", "2026/04/img.jpg")) //nolint:gosec // test path + if err != nil { + t.Fatalf("read body: %v", err) + } + if !bytes.Equal(got, body) { + t.Fatalf("body mismatch: %q vs %q", got, body) + } + var pm s3PublicManifest + readJSONFile(t, filepath.Join(root, "s3", "photos", "2026/04/img.jpg.elastickv-meta.json"), &pm) + if pm.ContentType != "image/jpeg" { + t.Fatalf("content_type = %q", pm.ContentType) + } + if pm.SizeBytes != int64(len(body)) { + t.Fatalf("size = %d", pm.SizeBytes) + } + var pb s3PublicBucket + readJSONFile(t, filepath.Join(root, "s3", "photos", "_bucket.json"), &pb) + if pb.Region != "us-east-1" { + t.Fatalf("region = %q", pb.Region) + } +} + +func TestS3_MultipartObjectAssemblesInPartChunkOrder(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "logs" + gen := uint64(1) + object := "app.log" + uploadID := "u-mp" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": 11, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 5, "chunk_count": 2}, + {"part_no": 2, "size_bytes": 6, "chunk_count": 1}, + }, + })); err != nil { + t.Fatal(err) + } + // Insert chunks out of order; assembly must respect (partNo, chunkNo). + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 2, 0), []byte("WORLD!")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 1), []byte("lo")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), []byte("hel")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", bucket, object)) //nolint:gosec + if err != nil { + t.Fatal(err) + } + if string(got) != "helloWORLD!" { + t.Fatalf("body = %q want %q", got, "helloWORLD!") + } +} + +func TestS3_OrphanChunksWarn(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + var events []string + enc.WithWarnSink(func(e string, _ ...any) { events = append(events, e) }) + if err := enc.HandleBlob(s3keys.BlobKey("ghost", 1, "lost.bin", "u", 1, 0), []byte("x")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if len(events) != 1 || events[0] != "s3_orphan_chunks" { + t.Fatalf("events = %v", events) + } +} + +func TestS3_MetaSuffixCollisionRejectedByDefault(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, "evil.elastickv-meta.json", []byte("payload"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MetaSuffixCollision) { + t.Fatalf("err = %v want ErrS3MetaSuffixCollision", err) + } +} + +func TestS3_MetaSuffixCollisionRenamesUnderFlag(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + enc.WithRenameCollisions(true) + emitObject(t, enc, "b", 1, "evil.elastickv-meta.json", []byte("payload"), "") + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + want := filepath.Join(root, "s3", "b", "evil.elastickv-meta.json.user-data") + if _, err := os.Stat(want); err != nil { + t.Fatalf("renamed body not found at %s: %v", want, err) + } + // KEYMAP must record the rename. + keymapPath := filepath.Join(root, "s3", "b", "KEYMAP.jsonl") + body, err := os.ReadFile(keymapPath) //nolint:gosec + if err != nil { + t.Fatalf("read keymap: %v", err) + } + var rec KeymapRecord + if err := json.Unmarshal(bytes.TrimRight(body, "\n"), &rec); err != nil { + t.Fatalf("unmarshal keymap: %v", err) + } + if rec.Kind != KindMetaCollision { + t.Fatalf("kind = %q", rec.Kind) + } + orig, err := rec.Original() + if err != nil { + t.Fatal(err) + } + if string(orig) != "evil.elastickv-meta.json" { + t.Fatalf("original = %q", orig) + } +} + +func TestS3_RejectsMalformedManifestJSON(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + err := enc.HandleObjectManifest(s3keys.ObjectManifestKey("b", 1, "o"), []byte("not-json")) + if !errors.Is(err, ErrS3InvalidManifest) { + t.Fatalf("err = %v", err) + } +} + +func TestS3_RejectsMalformedBucketMetaJSON(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), []byte("not-json")) + if !errors.Is(err, ErrS3InvalidBucketMeta) { + t.Fatalf("err = %v", err) + } +} + +func TestS3_HandleIgnored_NoOp(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + if err := enc.HandleIgnored([]byte("!s3|gc|upload|whatever"), []byte("opaque")); err != nil { + t.Fatalf("HandleIgnored should be a no-op, err=%v", err) + } +} + +func TestS3_IncludeIncompleteUploadsBuffersRecords(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + enc.WithIncludeIncompleteUploads(true) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": "b", "generation": 1, + })); err != nil { + t.Fatal(err) + } + uploadKey := s3keys.UploadMetaKey("b", 1, "obj", "u-1") + if err := enc.HandleIncompleteUpload(S3UploadMetaPrefix, uploadKey, []byte("payload")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + want := filepath.Join(root, "s3", "b", "_incomplete_uploads", "records.jsonl") + if _, err := os.Stat(want); err != nil { + t.Fatalf("expected incomplete-uploads file: %v", err) + } +} + +func TestS3_DefaultDoesNotEmitIncompleteUploads(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": "b", "generation": 1, + })); err != nil { + t.Fatal(err) + } + uploadKey := s3keys.UploadMetaKey("b", 1, "obj", "u-1") + if err := enc.HandleIncompleteUpload(S3UploadMetaPrefix, uploadKey, []byte("payload")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(filepath.Join(root, "s3", "b", "_incomplete_uploads")); !os.IsNotExist(err) { + t.Fatalf("expected no _incomplete_uploads dir without flag, stat err=%v", err) + } +} + +func TestS3_VersionedBlobAssembledByPartVersionOrder(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "v" + gen := uint64(1) + object := "obj" + uploadID := "u" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": 6, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 6, "chunk_count": 1, "part_version": 9}, + }, + })); err != nil { + t.Fatal(err) + } + // Versioned blob — partVersion encoded into the key. + if err := enc.HandleBlob(s3keys.VersionedBlobKey(bucket, gen, object, uploadID, 1, 0, 9), []byte("vBlobX")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", bucket, object)) //nolint:gosec + if err != nil { + t.Fatal(err) + } + if string(got) != "vBlobX" { + t.Fatalf("body = %q", got) + } +} From fdc9b36b6b66fcc47480ccb2220016a11b3392e5 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 19:36:48 +0900 Subject: [PATCH 02/20] backup: address review on DynamoDB encoder (PR #716) Three issues, all valid. #237 Codex P1 -- preserve item generation when grouping rows. Item keys carry !ddb|item|||... but the prior parser dropped the generation and Finalize merged all generations under the active schema. In real clusters, in-flight delete/recreate cleanup leaves stale-gen rows visible in the snapshot for a window; those rows would silently restore as items under the new schema (potentially resurrecting deleted data, or failing Finalize when PK names changed across generations). Fix: - parseDDBItemKey now returns (encodedTable, generation). - ddbTableState.items -> itemsByGen map[uint64][]*pb.DynamoItem. - flushTable filters by st.schema.GetGeneration() and emits a ddb_stale_generation_items warning carrying the count and active gen so the operator can correlate the orphan window. #182 Gemini -- fail-fast on flush errors. Finalize previously deferred the first error and continued through remaining tables. Real flush errors (out of disk, bad permissions) should surface immediately, not be misattributed to a later table whose flush also fails. Orphan-table warnings still continue (those are informational, not errors). #442 Gemini -- empty set serializes as [] not null. setAttributeValueToPublic now uses make+append so nil/empty SS/NS/BS become "{\"SS\":[]}" rather than "{\"SS\":null}". Downstream tools that distinguish "present-but-empty" from "missing" no longer see a misleading null. Tests: TestDDB_StaleGenerationItemsExcludedAndWarned, TestDDB_EmptyStringSetSerializesAsEmptyArrayNotNull, TestDDB_ParseItemKeyExtractsGeneration. Existing fixtures updated to set Generation=1 on schemas (matching the item key gen). --- internal/backup/dynamodb.go | 140 +++++++++++++++++++++++-------- internal/backup/dynamodb_test.go | 77 +++++++++++++++++ 2 files changed, 180 insertions(+), 37 deletions(-) diff --git a/internal/backup/dynamodb.go b/internal/backup/dynamodb.go index f3ec74bb3..ddb3a5bd0 100644 --- a/internal/backup/dynamodb.go +++ b/internal/backup/dynamodb.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "path/filepath" + "strconv" "strings" pb "github.com/bootjp/elastickv/proto" @@ -63,10 +64,17 @@ type DDBEncoder struct { } type ddbTableState struct { - encoded string - name string - schema *pb.DynamoTableSchema - items []*pb.DynamoItem + encoded string + name string + schema *pb.DynamoTableSchema + itemsByGen map[uint64][]*pb.DynamoItem +} + +func ensureItemsByGen(m map[uint64][]*pb.DynamoItem) map[uint64][]*pb.DynamoItem { + if m == nil { + return make(map[uint64][]*pb.DynamoItem) + } + return m } // NewDDBEncoder constructs an encoder rooted at /dynamodb/. @@ -124,15 +132,16 @@ func (d *DDBEncoder) HandleTableMeta(key, value []byte) error { } // HandleItem processes a !ddb|item||| record. The -// encoded table segment is parsed out of the key (everything between -// the first and second `|` after stripping `!ddb|item|`) and the item -// proto is buffered until Finalize. We do NOT parse the rest of the -// key here: every primary-key value the item could hold is also -// present in the proto's attributes map, and the schema (which arrives -// later in lex order) is what tells us which attributes are the hash -// and range keys. +// encoded table segment AND the item generation are parsed out of the +// key; the proto is buffered keyed by generation so Finalize can emit +// only the rows belonging to the schema's active generation. +// +// Stale-generation rows (left behind by an in-flight delete/recreate +// before async cleanup finishes) would otherwise silently leak under +// the new schema and either resurrect deleted data or fail Finalize +// when primary-key names changed across generations — Codex P1 #237. func (d *DDBEncoder) HandleItem(key, value []byte) error { - encoded, err := parseDDBItemKey(key) + encoded, generation, err := parseDDBItemKey(key) if err != nil { return err } @@ -145,7 +154,8 @@ func (d *DDBEncoder) HandleItem(key, value []byte) error { return errors.Wrap(ErrDDBInvalidItem, err.Error()) } st := d.tableState(encoded) - st.items = append(st.items, item) + st.itemsByGen = ensureItemsByGen(st.itemsByGen) + st.itemsByGen[generation] = append(st.itemsByGen[generation], item) return nil } @@ -159,27 +169,35 @@ func (d *DDBEncoder) HandleGSIRow(_, _ []byte) error { return nil } func (d *DDBEncoder) HandleTableGen(_, _ []byte) error { return nil } // Finalize emits each table's _schema.json and per-item JSON files. -// Tables with items but no schema (orphans — e.g., the schema record -// was lost or excluded) emit a warning and are skipped. Tables with -// a schema but no items emit a _schema.json and an empty items/ -// directory. +// Tables with items but no schema (orphans) emit a warning and are +// skipped — preserving the spec's lenient handling for incomplete +// inputs. Real flush errors fail fast so corruption surfaces +// immediately rather than being attributed to a later table (Gemini +// MEDIUM #182). func (d *DDBEncoder) Finalize() error { if d.bundleJSONL { return errors.New("backup: dynamodb_layout=jsonl not implemented in this PR") } - var firstErr error for _, st := range d.tables { if st.schema == nil { d.emitWarn("ddb_orphan_items", "encoded_table", st.encoded, - "buffered_items", len(st.items)) + "buffered_items", totalItemsAcrossGens(st.itemsByGen)) continue } - if err := d.flushTable(st); err != nil && firstErr == nil { - firstErr = err + if err := d.flushTable(st); err != nil { + return err } } - return firstErr + return nil +} + +func totalItemsAcrossGens(m map[uint64][]*pb.DynamoItem) int { + total := 0 + for _, items := range m { + total += len(items) + } + return total } func (d *DDBEncoder) flushTable(st *ddbTableState) error { @@ -193,7 +211,21 @@ func (d *DDBEncoder) flushTable(st *ddbTableState) error { } hashKey := st.schema.GetPrimaryKey().GetHashKey() rangeKey := st.schema.GetPrimaryKey().GetRangeKey() - for _, item := range st.items { + activeGen := st.schema.GetGeneration() + // Only items belonging to the schema's active generation are + // emitted. Older-gen rows (in-flight delete/recreate cleanup) are + // counted into a structured warning so the operator can correlate + // orphans to the cluster's state, but never written under the + // current schema where they would resurrect deleted data. + if stale := totalStaleItems(st.itemsByGen, activeGen); stale > 0 { + d.emitWarn("ddb_stale_generation_items", + "table", st.name, + "active_generation", activeGen, + "stale_count", stale, + "hint", "stale-gen rows are excluded from the dump; restore would otherwise emit them under the new schema") + } + active := st.itemsByGen[activeGen] + for _, item := range active { if err := writeDDBItem(itemsDir, hashKey, rangeKey, item); err != nil { return err } @@ -201,6 +233,16 @@ func (d *DDBEncoder) flushTable(st *ddbTableState) error { return nil } +func totalStaleItems(m map[uint64][]*pb.DynamoItem, activeGen uint64) int { + stale := 0 + for gen, items := range m { + if gen != activeGen { + stale += len(items) + } + } + return stale +} + func (d *DDBEncoder) emitWarn(event string, fields ...any) { if d.warn == nil { return @@ -217,24 +259,36 @@ func (d *DDBEncoder) tableState(encoded string) *ddbTableState { return st } -// parseDDBItemKey extracts the encoded table segment from -// !ddb|item|||. base64url does not contain `|`, -// so a strict `|` split between the prefix and the gen is unambiguous. -func parseDDBItemKey(key []byte) (string, error) { +// parseDDBItemKey extracts the encoded table segment AND the item +// generation from !ddb|item|||. base64url does +// not contain `|`, so the first two `|` after the prefix are +// unambiguous separators between the table segment, the decimal +// generation, and the rest of the key (hash/range encoding). +func parseDDBItemKey(key []byte) (string, uint64, error) { rest, err := stripPrefixSegment(key, []byte(DDBItemPrefix)) if err != nil { - return "", errors.Wrap(ErrDDBMalformedKey, err.Error()) + return "", 0, errors.Wrap(ErrDDBMalformedKey, err.Error()) } - idx := strings.IndexByte(rest, '|') - if idx <= 0 { - return "", errors.Wrapf(ErrDDBMalformedKey, + tableEnd := strings.IndexByte(rest, '|') + if tableEnd <= 0 { + return "", 0, errors.Wrapf(ErrDDBMalformedKey, "item key missing table/gen separator: %q", key) } - enc := rest[:idx] + enc := rest[:tableEnd] if _, err := base64.RawURLEncoding.DecodeString(enc); err != nil { - return "", errors.Wrap(ErrDDBMalformedKey, err.Error()) + return "", 0, errors.Wrap(ErrDDBMalformedKey, err.Error()) + } + afterTable := rest[tableEnd+1:] + genEnd := strings.IndexByte(afterTable, '|') + if genEnd <= 0 { + return "", 0, errors.Wrapf(ErrDDBMalformedKey, + "item key missing gen/rest separator: %q", key) + } + gen, err := strconv.ParseUint(afterTable[:genEnd], 10, 64) //nolint:mnd // 10 == decimal radix; 64 == uint64 width + if err != nil { + return "", 0, errors.Wrap(ErrDDBMalformedKey, err.Error()) } - return enc, nil + return enc, gen, nil } // writeDDBItem emits one item under itemsDir/[/].json. The @@ -430,13 +484,25 @@ func scalarAttributeValueToPublic(av *pb.DynamoAttributeValue) map[string]any { } func setAttributeValueToPublic(av *pb.DynamoAttributeValue) map[string]any { + // Ensure the destination slice is non-nil even when the source + // is nil/empty so json.Marshal renders [] rather than null. + // AWS DynamoDB JSON does NOT permit empty sets ([] is rejected + // by the live API), but the dump format intentionally accepts + // the structural empty case to avoid silently dropping a set + // attribute whose live representation drifted to nil. switch v := av.GetValue().(type) { case *pb.DynamoAttributeValue_Ss: - return map[string]any{"SS": append([]string{}, v.Ss.GetValues()...)} + out := make([]string, 0, len(v.Ss.GetValues())) + out = append(out, v.Ss.GetValues()...) + return map[string]any{"SS": out} case *pb.DynamoAttributeValue_Ns: - return map[string]any{"NS": append([]string{}, v.Ns.GetValues()...)} + out := make([]string, 0, len(v.Ns.GetValues())) + out = append(out, v.Ns.GetValues()...) + return map[string]any{"NS": out} case *pb.DynamoAttributeValue_Bs: - return map[string]any{"BS": append([][]byte{}, v.Bs.GetValues()...)} + out := make([][]byte, 0, len(v.Bs.GetValues())) + out = append(out, v.Bs.GetValues()...) + return map[string]any{"BS": out} } return nil } diff --git a/internal/backup/dynamodb_test.go b/internal/backup/dynamodb_test.go index 122bc271b..f414c490e 100644 --- a/internal/backup/dynamodb_test.go +++ b/internal/backup/dynamodb_test.go @@ -177,6 +177,7 @@ func TestDDB_BinaryHashKeyRendersAsB64Prefix(t *testing.T) { HashKey: "id", }, AttributeDefinitions: map[string]string{"id": "B"}, + Generation: 1, } item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ "id": bAttr([]byte{0x00, 0x01, 0x02}), @@ -242,6 +243,7 @@ func TestDDB_RejectsItemMissingHashKeyAttribute(t *testing.T) { schema := &pb.DynamoTableSchema{ TableName: "t", PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, AttributeDefinitions: map[string]string{"id": "S"}, + Generation: 1, } item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ // "id" is missing @@ -273,6 +275,7 @@ func TestDDB_AllAttributeKindsRoundTripThroughJSON(t *testing.T) { schema := &pb.DynamoTableSchema{ TableName: "kitchensink", PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, AttributeDefinitions: map[string]string{"id": "S"}, + Generation: 1, } item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ "id": sAttr("k"), @@ -332,6 +335,80 @@ func TestDDB_BundleJSONLNotImplementedYet(t *testing.T) { } } +func TestDDB_StaleGenerationItemsExcludedAndWarned(t *testing.T) { + t.Parallel() + enc, root := newDDBEncoder(t) + var events []string + enc.WithWarnSink(func(e string, _ ...any) { events = append(events, e) }) + + schema := &pb.DynamoTableSchema{ + TableName: "t", + PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, + AttributeDefinitions: map[string]string{"id": "S"}, + Generation: 5, + } + live := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("alive"), "v": sAttr("active"), + }} + stale := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("ghost"), "v": sAttr("from-prev-gen"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("t", 5, "alive", ""), encodeItemValue(t, live)); err != nil { + t.Fatal(err) + } + if err := enc.HandleItem(EncodeDDBItemKey("t", 4, "ghost", ""), encodeItemValue(t, stale)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("t"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + + if _, err := os.Stat(filepath.Join(root, "dynamodb", "t", "items", "alive.json")); err != nil { + t.Fatalf("expected active-gen item: %v", err) + } + if _, err := os.Stat(filepath.Join(root, "dynamodb", "t", "items", "ghost.json")); !os.IsNotExist(err) { + t.Fatalf("stale-gen item must NOT be emitted, stat err=%v", err) + } + if len(events) != 1 || events[0] != "ddb_stale_generation_items" { + t.Fatalf("events=%v want [ddb_stale_generation_items]", events) + } +} + +func TestDDB_EmptyStringSetSerializesAsEmptyArrayNotNull(t *testing.T) { + t.Parallel() + // Per Gemini #442 — a set attribute with no members must + // serialize as `[]` rather than `null` so downstream tools + // see a present-but-empty set, not a missing field. + got := setAttributeValueToPublic(&pb.DynamoAttributeValue{ + Value: &pb.DynamoAttributeValue_Ss{Ss: &pb.DynamoStringSet{Values: nil}}, + }) + body, err := json.Marshal(got) + if err != nil { + t.Fatal(err) + } + if string(body) != `{"SS":[]}` { + t.Fatalf("got %s want {\"SS\":[]}", body) + } +} + +func TestDDB_ParseItemKeyExtractsGeneration(t *testing.T) { + t.Parallel() + enc, gen, err := parseDDBItemKey(EncodeDDBItemKey("orders", 42, "pk", "sk")) + if err != nil { + t.Fatal(err) + } + if gen != 42 { + t.Fatalf("gen=%d want 42", gen) + } + want := "b3JkZXJz" // base64url("orders") + if enc != want { + t.Fatalf("enc=%q want %q", enc, want) + } +} + func TestDDB_RejectsKeyWithMissingTableSegment(t *testing.T) { t.Parallel() enc, _ := newDDBEncoder(t) From 19ae3289ee0663023d872d1bfe6fe98f3575f3b4 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 19:44:27 +0900 Subject: [PATCH 03/20] backup: address review on S3 encoder (PR #718) Three correctness bugs and several quality issues, all valid. CRITICAL: path traversal (Codex P1 #425). S3 object keys are user-controlled. A key like "../../../etc/passwd" would, under the prior filepath.Join-and-write code, escape the bucket directory and overwrite host files. Added safeJoinUnderRoot which Cleans the joined path and asserts it stays rooted under the bucket dir; ".." traversal is rejected with ErrS3MalformedKey. Absolute paths (leading "/") are normalised under the bucket dir by filepath.Join, which is the safest outcome. Tests: TestS3_PathTraversalAttemptRejected, TestS3_AbsolutePathObjectKeyConfinedUnderBucket. CRITICAL: stale upload-id chunks merged into body (Codex P1 #500, Gemini HIGH #106/#476/#504). A snapshot mid-delete-and-recreate or mid-retry can carry blob chunks for multiple upload attempts under the same (bucket, gen, object). The prior assembleObjectBody concatenated every chunk regardless of upload_id, producing corrupted bytes. Now: - s3ObjectState gains uploadID; HandleObjectManifest sets it. - New filterChunksForManifest takes the chunkPaths map and the manifest's uploadID, returns only matching chunks sorted by (partNo, partVersion, chunkNo). Stale-uploadID chunks never enter the assembled body. Test: TestS3_StaleUploadIDChunksFilteredFromAssembledBody. CRITICAL: incomplete-uploads file truncated per record (Codex P2 #318, Gemini HIGH+MEDIUM #318). HandleIncompleteUpload re-opened records.jsonl on every call; openJSONL uses os.Create which truncates. Only the last record survived per bucket. Now: - s3BucketState carries an incompleteUploadsJL *jsonlFile lazily opened on the first record and cached. - flushBucket closes it and surfaces the error (was silently ignored). Test: TestS3_IncompleteUploadsAppendsAcrossCalls (3 records, 3 lines on disk). QUALITY: - Gemini MEDIUM #285 (MkdirAll per blob): s3ObjectState gains a scratchDirCreated bool; HandleBlob runs MkdirAll once. - Gemini MEDIUM #318 (closeJSONL error ignored): keymap and incomplete-uploads writers now surface close errors via closeBucketKeymap / explicit closeJSONL return-check. - Gemini MEDIUM #386 (includeOrphans flag ignored): orphan chunks for objects without manifests now write to /_orphans//.bin under WithIncludeOrphans(true). Test: TestS3_OrphanChunksWrittenWhenIncludeOrphans. Also removed the now-unused attachManifestParts placeholder and sortChunkKeys helper; their logic moved into filterChunksForManifest which combines uploadID filtering with the sort. --- internal/backup/s3.go | 244 ++++++++++++++++++++++++++----------- internal/backup/s3_test.go | 123 +++++++++++++++++++ 2 files changed, 296 insertions(+), 71 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index cd20e2c48..2cda3462d 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -94,14 +94,33 @@ type s3BucketState struct { objects map[string]*s3ObjectState // keyed by "object\x00generation" keymap *KeymapWriter keymapDir string + // incompleteUploadsJL is opened lazily on the first + // !s3|upload|meta or !s3|upload|part record under + // --include-incomplete-uploads, then reused for every subsequent + // record in the same bucket and closed in flushBucket. Without + // this caching, the prior code re-opened (truncating!) the file + // on every record, leaving only the last record on disk and + // silently losing forensic data — flagged as Codex P2 #318. + incompleteUploadsJL *jsonlFile } type s3ObjectState struct { bucket string generation uint64 object string - manifest *s3PublicManifest - // chunkPaths maps (uploadID, partNo, chunkNo) -> scratch path. + // uploadID is the manifest's `upload_id`. Set by HandleObjectManifest; + // consumed by assembleObjectBody to filter chunkPaths so a stale + // upload's blob chunks (still in the snapshot during a delete/retry + // window) cannot be merged into the active body — Codex P1 #500, + // Gemini HIGH #106/#476/#504. + uploadID string + // scratchDirCreated avoids the per-blob MkdirAll syscall flagged + // by Gemini MEDIUM #285. The scratch directory for this object is + // created exactly once on the first HandleBlob call. + scratchDirCreated bool + manifest *s3PublicManifest + // chunkPaths maps (uploadID, partNo, chunkNo, partVersion) -> + // scratch path. chunkPaths map[s3ChunkKey]string } @@ -261,12 +280,13 @@ func (s *S3Encoder) HandleObjectManifest(key, value []byte) error { ContentDisposition: live.ContentDisposition, UserMetadata: live.UserMetadata, } - // Persist the parts list on the object state so Finalize knows - // what to assemble. We attach the live parts directly because - // that's purely structural data — the public sidecar has no need - // for them. + // Capture the manifest's uploadID so assembleObjectBody can + // filter blob chunks belonging to other (stale or in-flight) + // upload attempts. The live parts list is purely structural — + // the public sidecar has no need for it, but its uploadID is + // the load-bearing detail. + st.uploadID = live.UploadID st.chunkPaths = ensureChunkPaths(st.chunkPaths) - st.attachManifestParts(live.UploadID, live.Parts) return nil } @@ -280,8 +300,11 @@ func (s *S3Encoder) HandleBlob(key, value []byte) error { } st := s.objectState(bucket, gen, object) dir := filepath.Join(s.scratchRoot, EncodeSegment([]byte(bucket)), EncodeSegment([]byte(object))) - if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode - return errors.WithStack(err) + if !st.scratchDirCreated { + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + st.scratchDirCreated = true } path := filepath.Join(dir, blobScratchName(uploadID, partNo, chunkNo, partVersion)) if err := writeFileAtomic(path, value); err != nil { @@ -293,8 +316,14 @@ func (s *S3Encoder) HandleBlob(key, value []byte) error { } // HandleIncompleteUpload routes !s3|upload|meta|/!s3|upload|part| -// records to /_incomplete_uploads/ when the include flag is -// on; otherwise drops them. +// records to /_incomplete_uploads/records.jsonl when the +// include flag is on; otherwise drops them. +// +// The output writer is opened once per bucket on the first record and +// cached on s3BucketState. Re-opening per record (the prior +// implementation) used create/truncate semantics, so each call wiped +// the file and only the last record survived — Codex P2 #318 / Gemini +// HIGH+MEDIUM #318. func (s *S3Encoder) HandleIncompleteUpload(prefix string, key, value []byte) error { if !s.includeIncompleteUploads { return nil @@ -303,25 +332,24 @@ func (s *S3Encoder) HandleIncompleteUpload(prefix string, key, value []byte) err if !ok { return errors.Wrapf(ErrS3MalformedKey, "upload-family key: %q", key) } - dir := filepath.Join(s.outRoot, "s3", EncodeSegment([]byte(bucket)), "_incomplete_uploads") - if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode - return errors.WithStack(err) - } - // Phase 0a stores upload-family records as opaque key/value pairs - // (one JSON line per record) rather than reconstructing the - // in-flight upload state. Restoring incomplete uploads is itself - // a follow-up; this artifact preserves the bytes for forensics. - jl, err := openJSONL(filepath.Join(dir, "records.jsonl")) - if err != nil { - return err + b := s.bucketState(bucket) + if b.incompleteUploadsJL == nil { + dir := filepath.Join(s.outRoot, "s3", EncodeSegment([]byte(bucket)), "_incomplete_uploads") + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + jl, err := openJSONL(filepath.Join(dir, "records.jsonl")) + if err != nil { + return err + } + b.incompleteUploadsJL = jl } - defer func() { _ = closeJSONL(jl) }() rec := struct { Prefix string `json:"prefix"` KeyB64 []byte `json:"key"` ValueB64 []byte `json:"value"` }{Prefix: prefix, KeyB64: key, ValueB64: value} - if err := jl.enc.Encode(rec); err != nil { + if err := b.incompleteUploadsJL.enc.Encode(rec); err != nil { return errors.WithStack(err) } return nil @@ -361,34 +389,47 @@ func (s *S3Encoder) flushBucket(b *s3BucketState) error { return err } } - if b.keymap != nil { - if err := b.keymap.Close(); err != nil { + // closeJSONL errors must surface — they are the canonical "data + // did not flush to disk" signal for a writable resource (Gemini + // MEDIUM #318). + if err := closeBucketKeymap(b); err != nil { + return err + } + if b.incompleteUploadsJL != nil { + if err := closeJSONL(b.incompleteUploadsJL); err != nil { return err } - // If no rename was recorded, drop the empty file so the - // dump tree omits it (per the spec: keymaps are absent when - // empty). - if b.keymap.Count() == 0 && b.keymapDir != "" { - _ = os.Remove(filepath.Join(b.keymapDir, "KEYMAP.jsonl")) - } + } + return nil +} + +// closeBucketKeymap closes the per-bucket KEYMAP.jsonl writer (if +// opened) and removes the file when no rename was recorded. +func closeBucketKeymap(b *s3BucketState) error { + if b.keymap == nil { + return nil + } + if err := b.keymap.Close(); err != nil { + return err + } + if b.keymap.Count() == 0 && b.keymapDir != "" { + _ = os.Remove(filepath.Join(b.keymapDir, "KEYMAP.jsonl")) } return nil } func (s *S3Encoder) flushObject(b *s3BucketState, bucketDir string, obj *s3ObjectState) error { if obj.manifest == nil { - s.emitWarn("s3_orphan_chunks", - "bucket", b.name, - "object", obj.object, - "chunks", len(obj.chunkPaths), - "hint", "blob chunks present but no !s3|obj|head record matched") - return nil + return s.flushOrphanObject(b, bucketDir, obj) } objectName, kind, err := s.resolveObjectFilename(b, obj) if err != nil { return err } - bodyPath := filepath.Join(bucketDir, objectName) + bodyPath, err := safeJoinUnderRoot(bucketDir, objectName) + if err != nil { + return err + } if err := os.MkdirAll(filepath.Dir(bodyPath), 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode return errors.WithStack(err) } @@ -407,6 +448,59 @@ func (s *S3Encoder) flushObject(b *s3BucketState, bucketDir string, obj *s3Objec return nil } +// flushOrphanObject handles objects with chunks but no manifest. By +// default they emit only a warning. With --include-orphans on, the +// chunks are written under /_orphans// as +// per-chunk .bin files so the operator can recover bytes manually +// (Gemini MEDIUM #386). +func (s *S3Encoder) flushOrphanObject(b *s3BucketState, bucketDir string, obj *s3ObjectState) error { + s.emitWarn("s3_orphan_chunks", + "bucket", b.name, + "object", obj.object, + "chunks", len(obj.chunkPaths), + "hint", "blob chunks present but no !s3|obj|head record matched") + if !s.includeOrphans { + return nil + } + if len(obj.chunkPaths) == 0 { + return nil + } + dir := filepath.Join(bucketDir, "_orphans", EncodeSegment([]byte(obj.object))) + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + for k, scratchPath := range obj.chunkPaths { + out := filepath.Join(dir, blobScratchName(k.uploadID, k.partNo, k.chunkNo, k.partVersion)) + body, err := os.ReadFile(scratchPath) //nolint:gosec // scratchPath composed from scratch root + if err != nil { + return errors.WithStack(err) + } + if err := writeFileAtomic(out, body); err != nil { + return err + } + } + return nil +} + +// safeJoinUnderRoot composes / and asserts the result is +// still rooted under . S3 object keys are user-controlled and +// can contain "..", absolute paths, or NUL bytes; without this guard +// a key like "../etc/passwd" would escape the dump tree and overwrite +// host files (Codex P1 #425). +func safeJoinUnderRoot(root, rel string) (string, error) { + if rel == "" { + return "", errors.Wrap(ErrS3MalformedKey, "empty object name") + } + cleanRoot := filepath.Clean(root) + joined := filepath.Clean(filepath.Join(cleanRoot, rel)) + rootSep := cleanRoot + string(filepath.Separator) + if joined != cleanRoot && !strings.HasPrefix(joined, rootSep) { + return "", errors.Wrapf(ErrS3MalformedKey, + "object name %q escapes bucket directory", rel) + } + return joined, nil +} + // resolveObjectFilename returns the relative path of the assembled // body within the bucket directory, plus the keymap "kind" when a // rename took place ("" when the object writes at its natural path). @@ -421,7 +515,9 @@ func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState) // Object path taken at face value. Path collisions (`path/to` // vs `path/to/sub`) are deferred until the master pipeline // detects them across multiple manifests; this PR's per-object - // flush trusts the caller's collision detection. + // flush trusts the caller's collision detection. Path-traversal + // sanitisation runs in safeJoinUnderRoot, downstream of this + // function, where the bucket-directory root is in scope. return obj.object, "", nil } @@ -464,17 +560,6 @@ func (s *S3Encoder) objectState(bucket string, gen uint64, object string) *s3Obj return st } -// attachManifestParts is a placeholder that records the part list on -// the object state. The current implementation walks the manifest's -// part order at Finalize time, so this method just memoises the upload -// ID for reference; future extensions (e.g., versioned parts) can -// surface here. -func (st *s3ObjectState) attachManifestParts(_ string, _ []s3LivePart) { - // Intentionally empty: assembleObjectBody consumes the manifest - // directly via st.manifest at Finalize. Kept as a hook so the - // callsite reads symmetrically with HandleBlob. -} - // assembleObjectBody concatenates the blob chunks per the manifest's // (PartNo, ChunkNo) order into outPath. The encoder buffers chunks on // disk during the scan, so this copy walk is bounded by the object's @@ -494,11 +579,20 @@ func assembleObjectBody(outPath string, obj *s3ObjectState) error { _ = os.Remove(tmpPath) } }() - chunks := sortChunkKeys(obj.chunkPaths) + // Filter chunks by the manifest's uploadID. A snapshot taken + // during a delete/recreate or a retry-after-failed-CompleteUpload + // can legitimately contain blob chunks for multiple upload + // attempts under the same (bucket, generation, object). Mixing + // them produces corrupted bytes — Codex P1 #500 / Gemini HIGH + // #504. The manifest is the single source of truth; only its + // uploadID's chunks belong in the assembled body. + chunks := filterChunksForManifest(obj.chunkPaths, obj.uploadID) for _, k := range chunks { path := obj.chunkPaths[k] if err := appendFile(tmp, path); err != nil { - _ = tmp.Close() + if closeErr := tmp.Close(); closeErr != nil { + return errors.Wrap(err, "tmp.Close after appendFile failure: "+closeErr.Error()) + } return err } } @@ -511,25 +605,21 @@ func assembleObjectBody(outPath string, obj *s3ObjectState) error { return nil } -func appendFile(dst io.Writer, srcPath string) error { - f, err := os.Open(srcPath) //nolint:gosec // srcPath composed from scratch root - if err != nil { - return errors.WithStack(err) - } - defer f.Close() - if _, err := io.Copy(dst, f); err != nil { - return errors.WithStack(err) - } - return nil -} - -func sortChunkKeys(m map[s3ChunkKey]string) []s3ChunkKey { - out := make([]s3ChunkKey, 0, len(m)) +// filterChunksForManifest returns the chunk keys belonging to +// manifestUploadID, sorted by (partNo, partVersion, chunkNo). An empty +// manifestUploadID matches every chunk — useful for tests that +// pre-date the uploadID feature, but production callers always have a +// non-empty uploadID via HandleObjectManifest. +func filterChunksForManifest(m map[s3ChunkKey]string, manifestUploadID string) []s3ChunkKey { + keys := make([]s3ChunkKey, 0, len(m)) for k := range m { - out = append(out, k) + if manifestUploadID != "" && k.uploadID != manifestUploadID { + continue + } + keys = append(keys, k) } - sort.SliceStable(out, func(i, j int) bool { - a, b := out[i], out[j] + sort.SliceStable(keys, func(i, j int) bool { + a, b := keys[i], keys[j] switch { case a.partNo != b.partNo: return a.partNo < b.partNo @@ -539,7 +629,19 @@ func sortChunkKeys(m map[s3ChunkKey]string) []s3ChunkKey { return a.chunkNo < b.chunkNo } }) - return out + return keys +} + +func appendFile(dst io.Writer, srcPath string) error { + f, err := os.Open(srcPath) //nolint:gosec // srcPath composed from scratch root + if err != nil { + return errors.WithStack(err) + } + defer f.Close() + if _, err := io.Copy(dst, f); err != nil { + return errors.WithStack(err) + } + return nil } func ensureChunkPaths(m map[s3ChunkKey]string) map[s3ChunkKey]string { diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 7e9989ed6..d181727a8 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -274,6 +274,129 @@ func TestS3_DefaultDoesNotEmitIncompleteUploads(t *testing.T) { } } +func TestS3_PathTraversalAttemptRejected(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, "../../../etc/passwd-attack", []byte("evil"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for path-traversal key", err) + } +} + +func TestS3_AbsolutePathObjectKeyConfinedUnderBucket(t *testing.T) { + t.Parallel() + // filepath.Join normalises a leading "/" on the second arg, so + // "/etc/host" becomes "/etc/host" — under the bucket + // root, not at filesystem root. This is safe (the user gets a + // surprising-but-confined path) and matches what `aws s3 sync` + // would round-trip back. We assert the safe outcome rather than + // rejecting; rejection would surprise operators with legitimate + // keys whose first byte is '/'. + enc, root := newS3Encoder(t) + emitObject(t, enc, "b", 1, "/etc/host-confined", []byte("ok"), "") + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", "b", "etc", "host-confined")) //nolint:gosec + if err != nil { + t.Fatalf("absolute-path key must end up under the bucket dir: %v", err) + } + if string(got) != "ok" { + t.Fatalf("body=%q", got) + } +} + +func TestS3_StaleUploadIDChunksFilteredFromAssembledBody(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + object := "obj" + uploadActive := "u-active" + uploadStale := "u-stale" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadActive, "size_bytes": 5, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 5, "chunk_count": 1}, + }, + })); err != nil { + t.Fatal(err) + } + // Stale chunk from a prior upload attempt — must NOT be merged + // into the assembled body. + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadStale, 1, 0), []byte("STALE")); err != nil { + t.Fatal(err) + } + // Active chunk for the manifest's uploadID. + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadActive, 1, 0), []byte("OKBYE")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", bucket, object)) //nolint:gosec + if err != nil { + t.Fatal(err) + } + if string(got) != "OKBYE" { + t.Fatalf("body = %q want %q (stale upload-id chunk leaked into body)", got, "OKBYE") + } +} + +func TestS3_IncompleteUploadsAppendsAcrossCalls(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + enc.WithIncludeIncompleteUploads(true) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": "b", "generation": 1, + })); err != nil { + t.Fatal(err) + } + for i := 0; i < 3; i++ { + key := s3keys.UploadMetaKey("b", 1, "obj", "u-"+string(rune('a'+i))) + if err := enc.HandleIncompleteUpload(S3UploadMetaPrefix, key, []byte("payload")); err != nil { + t.Fatalf("HandleIncompleteUpload[%d]: %v", i, err) + } + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + body, err := os.ReadFile(filepath.Join(root, "s3", "b", "_incomplete_uploads", "records.jsonl")) //nolint:gosec + if err != nil { + t.Fatal(err) + } + lines := bytes.Count(body, []byte("\n")) + if lines != 3 { + t.Fatalf("records.jsonl has %d lines want 3 — re-open per call truncated previous records", lines) + } +} + +func TestS3_OrphanChunksWrittenWhenIncludeOrphans(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + enc.WithIncludeOrphans(true) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": "b", "generation": 1, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey("b", 1, "ghost", "u", 1, 0), []byte("orphan")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + dir := filepath.Join(root, "s3", "b", "_orphans", EncodeSegment([]byte("ghost"))) + if _, err := os.Stat(dir); err != nil { + t.Fatalf("expected _orphans dir under --include-orphans: %v", err) + } +} + func TestS3_VersionedBlobAssembledByPartVersionOrder(t *testing.T) { t.Parallel() enc, root := newS3Encoder(t) From ce5b7dac74e13766764981c3ae99b3ad733a7ec5 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:01:30 +0900 Subject: [PATCH 04/20] backup: emit migrating-from-generation items in DynamoDB dump (PR #716, round 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 #227: during a generation migration, schema.Generation is the new gen and schema.MigratingFromGeneration carries the source. The live read path (adapter/dynamodb.go readLogicalItemAt) falls back to the source for items not yet copied. The previous flushTable filtered to only schema.GetGeneration() and silently dropped unmigrated rows from the dump. flushTable now emits items from BOTH generations. Order matters: the source is emitted FIRST, then the active gen, so when both generations carry the same primary key the active-gen content is the one that survives writeFileAtomic's tmp+rename — matching the live code's "new gen wins on conflict" rule. The stale-generation warning was generalised to take an emit-set (totalStaleItemsExcluding); rows in any other generation are counted into the warning so the operator still sees the orphan window. Tests: TestDDB_MigrationSourceGenerationItemsAreEmitted, TestDDB_NewGenerationWinsOverMigrationSourceForSameKey. --- internal/backup/dynamodb.go | 43 ++++++++++++----- internal/backup/dynamodb_test.go | 81 ++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/internal/backup/dynamodb.go b/internal/backup/dynamodb.go index ddb3a5bd0..2f3dea11b 100644 --- a/internal/backup/dynamodb.go +++ b/internal/backup/dynamodb.go @@ -212,31 +212,48 @@ func (d *DDBEncoder) flushTable(st *ddbTableState) error { hashKey := st.schema.GetPrimaryKey().GetHashKey() rangeKey := st.schema.GetPrimaryKey().GetRangeKey() activeGen := st.schema.GetGeneration() - // Only items belonging to the schema's active generation are - // emitted. Older-gen rows (in-flight delete/recreate cleanup) are - // counted into a structured warning so the operator can correlate - // orphans to the cluster's state, but never written under the - // current schema where they would resurrect deleted data. - if stale := totalStaleItems(st.itemsByGen, activeGen); stale > 0 { + migrationSourceGen := st.schema.GetMigratingFromGeneration() + // During a generation migration the live read path falls back to + // migration_source_generation for items not yet copied to the new + // generation (see adapter/dynamodb.go readLogicalItemAt). Both + // generations therefore carry items the user can read at this + // moment, so a backup must include both. Codex P1 #227. + // + // Items present in BOTH generations: the new-gen row is the + // authoritative one (the live code prefers it on read). We emit + // migration-source first, then active gen LAST, so writeFileAtomic's + // tmp+rename leaves the active-gen content on disk per (pk,sk). + emitOrder := []uint64{} + if migrationSourceGen != 0 && migrationSourceGen != activeGen { + emitOrder = append(emitOrder, migrationSourceGen) + } + emitOrder = append(emitOrder, activeGen) + if stale := totalStaleItemsExcluding(st.itemsByGen, emitOrder); stale > 0 { d.emitWarn("ddb_stale_generation_items", "table", st.name, "active_generation", activeGen, + "migration_source_generation", migrationSourceGen, "stale_count", stale, - "hint", "stale-gen rows are excluded from the dump; restore would otherwise emit them under the new schema") + "hint", "stale-gen rows excluded; restore would otherwise emit them under the new schema") } - active := st.itemsByGen[activeGen] - for _, item := range active { - if err := writeDDBItem(itemsDir, hashKey, rangeKey, item); err != nil { - return err + for _, gen := range emitOrder { + for _, item := range st.itemsByGen[gen] { + if err := writeDDBItem(itemsDir, hashKey, rangeKey, item); err != nil { + return err + } } } return nil } -func totalStaleItems(m map[uint64][]*pb.DynamoItem, activeGen uint64) int { +func totalStaleItemsExcluding(m map[uint64][]*pb.DynamoItem, included []uint64) int { + includedSet := make(map[uint64]struct{}, len(included)) + for _, g := range included { + includedSet[g] = struct{}{} + } stale := 0 for gen, items := range m { - if gen != activeGen { + if _, ok := includedSet[gen]; !ok { stale += len(items) } } diff --git a/internal/backup/dynamodb_test.go b/internal/backup/dynamodb_test.go index f414c490e..54d962cea 100644 --- a/internal/backup/dynamodb_test.go +++ b/internal/backup/dynamodb_test.go @@ -335,6 +335,87 @@ func TestDDB_BundleJSONLNotImplementedYet(t *testing.T) { } } +func TestDDB_MigrationSourceGenerationItemsAreEmitted(t *testing.T) { + t.Parallel() + enc, root := newDDBEncoder(t) + // During a live migration, schema.Generation is the new gen and + // schema.MigratingFromGeneration carries the source gen. The live + // read path falls back to the source for items not yet copied. + // The dump must include both — Codex P1 #227. + schema := &pb.DynamoTableSchema{ + TableName: "t", + PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, + AttributeDefinitions: map[string]string{"id": "S"}, + Generation: 7, + MigratingFromGeneration: 6, + } + newRow := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("a"), "v": sAttr("new"), + }} + migratingRow := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("b"), "v": sAttr("not-yet-migrated"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("t", 7, "a", ""), encodeItemValue(t, newRow)); err != nil { + t.Fatal(err) + } + if err := enc.HandleItem(EncodeDDBItemKey("t", 6, "b", ""), encodeItemValue(t, migratingRow)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("t"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(filepath.Join(root, "dynamodb", "t", "items", "a.json")); err != nil { + t.Fatalf("active-gen item missing: %v", err) + } + if _, err := os.Stat(filepath.Join(root, "dynamodb", "t", "items", "b.json")); err != nil { + t.Fatalf("migrating-from-gen item must be emitted during live migration: %v", err) + } +} + +func TestDDB_NewGenerationWinsOverMigrationSourceForSameKey(t *testing.T) { + t.Parallel() + enc, root := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "t", + PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, + AttributeDefinitions: map[string]string{"id": "S"}, + Generation: 7, + MigratingFromGeneration: 6, + } + // Same primary key in both generations. The live read path + // prefers the new gen; the dump must do the same. + newRow := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("k"), "v": sAttr("new-version"), + }} + oldRow := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("k"), "v": sAttr("old-version"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("t", 6, "k", ""), encodeItemValue(t, oldRow)); err != nil { + t.Fatal(err) + } + if err := enc.HandleItem(EncodeDDBItemKey("t", 7, "k", ""), encodeItemValue(t, newRow)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("t"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + body, err := os.ReadFile(filepath.Join(root, "dynamodb", "t", "items", "k.json")) //nolint:gosec + if err != nil { + t.Fatal(err) + } + got := readItemMap(t, filepath.Join(root, "dynamodb", "t", "items", "k.json")) + v := mustSubMap(t, got, "v") + if v["S"] != "new-version" { + t.Fatalf("body = %s; new gen must win on conflict, got v.S=%v", body, v["S"]) + } +} + func TestDDB_StaleGenerationItemsExcludedAndWarned(t *testing.T) { t.Parallel() enc, root := newDDBEncoder(t) From 92ee22efad9331fa3b3450725f19080d18caf8bf Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:07:00 +0900 Subject: [PATCH 05/20] backup: address codex review on S3 encoder (PR #718, round 2) Three follow-up issues from Codex on top of round 1. #619 Codex P1 -- assemble only manifest-declared parts. filterChunksForManifest previously matched on uploadID only; the manifest declares specific (partNo, partVersion) tuples, and S3's overwrite-then-async-cleanup window can leave older partVersion chunks present under the same (bucket, generation, object, uploadID). Mixing them produces corrupted bytes. s3ObjectState gains declaredParts map[s3PartKey]struct{}, populated by HandleObjectManifest. filterChunksForManifest takes the set and restricts emission to declared (partNo, partVersion) tuples. Test TestS3_StalePartVersionExcludedFromAssembledBody asserts a stale partVersion=7 cannot leak when the manifest declares partVersion=9. #497 Codex P2 -- preserve dot segments. safeJoinUnderRoot used filepath.Clean which collapses "a/../b" to "b". S3 treats those bytes literally; "a/../b" and "b" are distinct keys that would have silently merged into one output file. Now explicitly rejects any object key whose segments are "." or ".." with ErrS3MalformedKey. NUL bytes also rejected. Test TestS3_DotSegmentObjectKeyRejected covers the four forms ("a/../b", "a/./b", "..", "."). #521 Codex P2 -- cross-generation collision. s3BucketState gains activeGen captured from the bucket-meta record. flushBucket suppresses objects whose generation differs from activeGen (under --include-orphans, those flow to _orphans/; by default they're dropped with an s3_stale_generation_objects warning). Tests TestS3_StaleGenerationObjectExcluded. flushBucket cyclomatic complexity stayed under the cap by extracting flushBucketObjects. --- internal/backup/s3.go | 158 +++++++++++++++++++++++++++++++------ internal/backup/s3_test.go | 107 +++++++++++++++++++++++++ 2 files changed, 239 insertions(+), 26 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index 2cda3462d..d413d28e5 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -89,8 +89,14 @@ type S3Encoder struct { } type s3BucketState struct { - name string - meta *s3PublicBucket + name string + meta *s3PublicBucket + // activeGen is the bucket's current generation, captured from the + // bucket-meta record. Used at flush time to suppress objects + // belonging to older incarnations of the same bucket name (Codex + // P2 #521). Zero means "no bucket meta seen yet"; in that state + // every object flushes (the prior orphan-warning path covers it). + activeGen uint64 objects map[string]*s3ObjectState // keyed by "object\x00generation" keymap *KeymapWriter keymapDir string @@ -114,6 +120,13 @@ type s3ObjectState struct { // window) cannot be merged into the active body — Codex P1 #500, // Gemini HIGH #106/#476/#504. uploadID string + // declaredParts is the set of (partNo, partVersion) tuples the + // manifest claims belong to this object. When non-nil, the body + // assembler restricts chunkPaths to entries matching this set — + // Codex P1 #619. nil means "no filter" (used only in tests that + // pre-date the manifest-parts feature; production callers always + // receive a non-nil set via HandleObjectManifest). + declaredParts map[s3PartKey]struct{} // scratchDirCreated avoids the per-blob MkdirAll syscall flagged // by Gemini MEDIUM #285. The scratch directory for this object is // created exactly once on the first HandleBlob call. @@ -131,6 +144,16 @@ type s3ChunkKey struct { partVersion uint64 } +// s3PartKey is the manifest-declared part identifier: a (partNo, +// partVersion) tuple. ChunkNo is excluded because the manifest's +// per-part chunk_count + chunk_sizes drive how many chunks to expect +// per part, but the manifest doesn't enumerate (chunkNo) tuples +// directly. +type s3PartKey struct { + partNo uint64 + partVersion uint64 +} + // s3PublicBucket is the dump-format projection of s3BucketMeta. type s3PublicBucket struct { FormatVersion uint32 `json:"format_version"` @@ -245,6 +268,7 @@ func (s *S3Encoder) HandleBucketMeta(key, value []byte) error { Region: live.Region, ACL: live.Acl, } + st.activeGen = live.Generation return nil } @@ -282,10 +306,16 @@ func (s *S3Encoder) HandleObjectManifest(key, value []byte) error { } // Capture the manifest's uploadID so assembleObjectBody can // filter blob chunks belonging to other (stale or in-flight) - // upload attempts. The live parts list is purely structural — - // the public sidecar has no need for it, but its uploadID is - // the load-bearing detail. + // upload attempts. Also capture the manifest's declared + // (partNo, partVersion) set so the assembler restricts itself + // to canonically-declared parts — older partVersions left + // behind by overwrite-then-async-cleanup must NOT be merged + // into the body (Codex P1 #619). st.uploadID = live.UploadID + st.declaredParts = make(map[s3PartKey]struct{}, len(live.Parts)) + for _, p := range live.Parts { + st.declaredParts[s3PartKey{partNo: p.PartNo, partVersion: p.PartVersion}] = struct{}{} + } st.chunkPaths = ensureChunkPaths(st.chunkPaths) return nil } @@ -384,10 +414,16 @@ func (s *S3Encoder) flushBucket(b *s3BucketState) error { return err } } - for _, obj := range b.objects { - if err := s.flushObject(b, bucketDir, obj); err != nil { - return err - } + staleCount, err := s.flushBucketObjects(b, bucketDir) + if err != nil { + return err + } + if staleCount > 0 { + s.emitWarn("s3_stale_generation_objects", + "bucket", b.name, + "active_generation", b.activeGen, + "stale_count", staleCount, + "hint", "stale-gen objects excluded; restore would otherwise emit them under the new bucket") } // closeJSONL errors must surface — they are the canonical "data // did not flush to disk" signal for a writable resource (Gemini @@ -403,6 +439,37 @@ func (s *S3Encoder) flushBucket(b *s3BucketState) error { return nil } +// flushBucketObjects walks the bucket's object set, routes stale-gen +// objects to the orphan path (under --include-orphans) or drops them +// with a warning counter, and flushes active-gen objects normally. +// Split out of flushBucket to keep cyclomatic complexity within the +// package cap. +func (s *S3Encoder) flushBucketObjects(b *s3BucketState, bucketDir string) (int, error) { + stale := 0 + for _, obj := range b.objects { + // Suppress objects from older bucket incarnations: when a + // bucket is deleted and recreated the generation bumps, but + // snapshots taken mid-cleanup can still carry the previous + // generation's manifests + chunks. Routing both to the same + // natural path is non-deterministic last-write-wins (Codex + // P2 #521). When a bucket-meta record is present, only its + // active generation flushes. + if b.activeGen != 0 && obj.generation != b.activeGen { + stale++ + if s.includeOrphans { + if err := s.flushOrphanObject(b, bucketDir, obj); err != nil { + return stale, err + } + } + continue + } + if err := s.flushObject(b, bucketDir, obj); err != nil { + return stale, err + } + } + return stale, nil +} + // closeBucketKeymap closes the per-bucket KEYMAP.jsonl writer (if // opened) and removes the file when no rename was recorded. func closeBucketKeymap(b *s3BucketState) error { @@ -484,15 +551,42 @@ func (s *S3Encoder) flushOrphanObject(b *s3BucketState, bucketDir string, obj *s // safeJoinUnderRoot composes / and asserts the result is // still rooted under . S3 object keys are user-controlled and -// can contain "..", absolute paths, or NUL bytes; without this guard -// a key like "../etc/passwd" would escape the dump tree and overwrite -// host files (Codex P1 #425). +// can contain "..", absolute paths, NUL bytes, or "." segments; +// without this guard a key like "../etc/passwd" would escape the +// dump tree and overwrite host files (Codex P1 #425). +// +// We refuse keys whose path-segment components include "." or ".." +// rather than filepath.Clean'ing them. S3 treats those bytes +// literally — `aws s3 put-object` accepts a key like "a/../b" as +// distinct from "b" — so collapsing them via filepath.Clean would +// silently merge two distinct user keys into one output file +// (Codex P2 #497). Operators with such keys must rename them in +// S3, then re-take the dump; the spec's rename-collisions path +// does not currently cover this. +// +// NUL bytes are also refused: POSIX cannot represent them in a +// path component, and they have no legitimate meaning in S3 keys +// transmitted over HTTP. func safeJoinUnderRoot(root, rel string) (string, error) { if rel == "" { return "", errors.Wrap(ErrS3MalformedKey, "empty object name") } + if strings.ContainsRune(rel, 0) { + return "", errors.Wrapf(ErrS3MalformedKey, "object name contains NUL: %q", rel) + } + for _, seg := range strings.Split(rel, "/") { + switch seg { + case ".", "..": + return "", errors.Wrapf(ErrS3MalformedKey, + "object name has dot segment %q (S3 treats it literally; rename in S3 first)", rel) + } + } cleanRoot := filepath.Clean(root) - joined := filepath.Clean(filepath.Join(cleanRoot, rel)) + // Use filepath.Join here — its only behavioural change vs. raw + // concatenation after the dot-segment guard above is normalising + // a leading "/" off `rel` (which is what we want: absolute-path + // keys collapse safely under bucketDir). + joined := filepath.Join(cleanRoot, rel) rootSep := cleanRoot + string(filepath.Separator) if joined != cleanRoot && !strings.HasPrefix(joined, rootSep) { return "", errors.Wrapf(ErrS3MalformedKey, @@ -579,14 +673,17 @@ func assembleObjectBody(outPath string, obj *s3ObjectState) error { _ = os.Remove(tmpPath) } }() - // Filter chunks by the manifest's uploadID. A snapshot taken - // during a delete/recreate or a retry-after-failed-CompleteUpload - // can legitimately contain blob chunks for multiple upload - // attempts under the same (bucket, generation, object). Mixing - // them produces corrupted bytes — Codex P1 #500 / Gemini HIGH - // #504. The manifest is the single source of truth; only its - // uploadID's chunks belong in the assembled body. - chunks := filterChunksForManifest(obj.chunkPaths, obj.uploadID) + // Filter chunks by the manifest's uploadID AND its declared + // (partNo, partVersion) set. A snapshot taken during + // delete/recreate, retry-after-failed-CompleteUpload, or + // part-overwrite-before-cleanup can legitimately contain blob + // chunks for multiple upload attempts and/or multiple part + // versions under the same (bucket, generation, object). Mixing + // them produces corrupted bytes — Codex P1 #500 (uploadID), + // Codex P1 #619 (partVersion). The manifest is the single source + // of truth; only its uploadID + declaredParts make it into the + // assembled body. + chunks := filterChunksForManifest(obj.chunkPaths, obj.uploadID, obj.declaredParts) for _, k := range chunks { path := obj.chunkPaths[k] if err := appendFile(tmp, path); err != nil { @@ -606,16 +703,25 @@ func assembleObjectBody(outPath string, obj *s3ObjectState) error { } // filterChunksForManifest returns the chunk keys belonging to -// manifestUploadID, sorted by (partNo, partVersion, chunkNo). An empty -// manifestUploadID matches every chunk — useful for tests that -// pre-date the uploadID feature, but production callers always have a -// non-empty uploadID via HandleObjectManifest. -func filterChunksForManifest(m map[s3ChunkKey]string, manifestUploadID string) []s3ChunkKey { +// manifestUploadID AND whose (partNo, partVersion) appears in +// declaredParts. Returned keys are sorted by (partNo, partVersion, +// chunkNo) for byte-deterministic body assembly. +// +// An empty manifestUploadID and a nil declaredParts both mean "no +// filter" — used by tests that pre-date these features. Production +// callers always pass non-empty/non-nil values via +// HandleObjectManifest. +func filterChunksForManifest(m map[s3ChunkKey]string, manifestUploadID string, declaredParts map[s3PartKey]struct{}) []s3ChunkKey { keys := make([]s3ChunkKey, 0, len(m)) for k := range m { if manifestUploadID != "" && k.uploadID != manifestUploadID { continue } + if declaredParts != nil { + if _, ok := declaredParts[s3PartKey{partNo: k.partNo, partVersion: k.partVersion}]; !ok { + continue + } + } keys = append(keys, k) } sort.SliceStable(keys, func(i, j int) bool { diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index d181727a8..4f0951127 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -397,6 +397,113 @@ func TestS3_OrphanChunksWrittenWhenIncludeOrphans(t *testing.T) { } } +func TestS3_StalePartVersionExcludedFromAssembledBody(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + object := "obj" + uploadID := "u" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + // Manifest declares partNo=1 partVersion=9. A stale chunk at + // partVersion=7 (a previous overwrite still uncleaned) must NOT + // be merged — Codex P1 #619. + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": 5, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 5, "chunk_count": 1, "part_version": 9}, + }, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.VersionedBlobKey(bucket, gen, object, uploadID, 1, 0, 7), []byte("STALE")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.VersionedBlobKey(bucket, gen, object, uploadID, 1, 0, 9), []byte("OKBYE")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", bucket, object)) //nolint:gosec + if err != nil { + t.Fatal(err) + } + if string(got) != "OKBYE" { + t.Fatalf("body=%q want %q (stale partVersion leaked)", got, "OKBYE") + } +} + +func TestS3_DotSegmentObjectKeyRejected(t *testing.T) { + t.Parallel() + cases := []string{"a/../b", "a/./b", "..", "."} + for _, key := range cases { + t.Run(key, func(t *testing.T) { + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, key, []byte("x"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for key %q", err, key) + } + }) + } +} + +// emitObjectAtGen is a helper for cross-generation tests: emits a +// manifest + single chunk under an explicit (gen, uploadID) instead +// of the bucket's active gen. +func emitObjectAtGen(t *testing.T, enc *S3Encoder, bucket string, gen uint64, object, uploadID string, body []byte) { + t.Helper() + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": int64(len(body)), "parts": []map[string]any{ + {"part_no": 1, "size_bytes": int64(len(body)), "chunk_count": 1}, + }, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), body); err != nil { + t.Fatal(err) + } +} + +func TestS3_StaleGenerationObjectExcluded(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + var events []string + enc.WithWarnSink(func(e string, _ ...any) { events = append(events, e) }) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": "b", "generation": 7, + })); err != nil { + t.Fatal(err) + } + emitObjectAtGen(t, enc, "b", 6, "stale-obj", "us", []byte("STALE")) + emitObjectAtGen(t, enc, "b", 7, "live-obj", "ul", []byte("LIVE")) + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(filepath.Join(root, "s3", "b", "live-obj")); err != nil { + t.Fatalf("live-gen object missing: %v", err) + } + if _, err := os.Stat(filepath.Join(root, "s3", "b", "stale-obj")); !os.IsNotExist(err) { + t.Fatalf("stale-gen object must NOT flush, stat err=%v", err) + } + if !sliceContains(events, "s3_stale_generation_objects") { + t.Fatalf("expected s3_stale_generation_objects warning, got %v", events) + } +} + +func sliceContains(haystack []string, needle string) bool { + for _, s := range haystack { + if s == needle { + return true + } + } + return false +} + func TestS3_VersionedBlobAssembledByPartVersionOrder(t *testing.T) { t.Parallel() enc, root := newS3Encoder(t) From 9029addb3dbd6e4dd6bdf2b2fe149b9d98cd9512 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:14:02 +0900 Subject: [PATCH 06/20] backup: reject truncated DynamoDB keys (PR #716, round 3) Two Codex P2 strict-validation follow-ups. #117 -- empty table-meta segment. HandleTableMeta accepted `!ddb|meta|table|` (no encoded segment) because base64.RawURLEncoding.DecodeString("") returns empty bytes without error, so the schema would route under the empty table name. Now rejected with ErrDDBMalformedKey before the JSON decode. TestDDB_RejectsTableMetaKeyWithEmptySegment. #303 -- truncated item key. parseDDBItemKey accepted `!ddb|item|
|7|` (gen separator present, no primary-key payload). The gen-end check was "genEnd > 0" which a trailing `|` satisfies. Added a follow-up check that genEnd+1 != len(afterTable) so a payload-less key surfaces as ErrDDBMalformedKey rather than emit under value-side attributes only. TestDDB_RejectsItemKeyWithEmptyPrimaryKeyPayload. --- internal/backup/dynamodb.go | 17 +++++++++++++++++ internal/backup/dynamodb_test.go | 24 ++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/internal/backup/dynamodb.go b/internal/backup/dynamodb.go index 2f3dea11b..b83355ad8 100644 --- a/internal/backup/dynamodb.go +++ b/internal/backup/dynamodb.go @@ -113,6 +113,13 @@ func (d *DDBEncoder) HandleTableMeta(key, value []byte) error { if err != nil { return errors.Wrap(ErrDDBMalformedKey, err.Error()) } + if encoded == "" { + // base64.RawURLEncoding.DecodeString("") succeeds with an + // empty slice, so without this guard a truncated key like + // `!ddb|meta|table|` would be routed under the empty table + // name. That hides corruption (Codex P2 #117). + return errors.Wrapf(ErrDDBMalformedKey, "empty table segment: %q", key) + } rawName, err := base64.RawURLEncoding.DecodeString(encoded) if err != nil { return errors.Wrap(ErrDDBMalformedKey, err.Error()) @@ -305,6 +312,16 @@ func parseDDBItemKey(key []byte) (string, uint64, error) { if err != nil { return "", 0, errors.Wrap(ErrDDBMalformedKey, err.Error()) } + // Item keys must carry a primary-key payload after the gen + // separator (the encoded hash + range bytes). A bare + // `!ddb|item|
||` cannot identify any item; treating + // such a key as valid would let a truncated record slip past + // malformed-key detection and emit under value-side attributes + // only, masking snapshot corruption (Codex P2 #303). + if genEnd+1 == len(afterTable) { + return "", 0, errors.Wrapf(ErrDDBMalformedKey, + "item key missing primary-key payload: %q", key) + } return enc, gen, nil } diff --git a/internal/backup/dynamodb_test.go b/internal/backup/dynamodb_test.go index 54d962cea..3caf78b1d 100644 --- a/internal/backup/dynamodb_test.go +++ b/internal/backup/dynamodb_test.go @@ -490,6 +490,30 @@ func TestDDB_ParseItemKeyExtractsGeneration(t *testing.T) { } } +func TestDDB_RejectsTableMetaKeyWithEmptySegment(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + // `!ddb|meta|table|` (no encoded segment) -- base64url-decodes to + // an empty name and would otherwise route the schema under "". + // Codex P2 #117. + err := enc.HandleTableMeta([]byte(DDBTableMetaPrefix), []byte("ignored")) + if !errors.Is(err, ErrDDBMalformedKey) { + t.Fatalf("err=%v", err) + } +} + +func TestDDB_RejectsItemKeyWithEmptyPrimaryKeyPayload(t *testing.T) { + t.Parallel() + // `!ddb|item|
|7|` -- gen separator present but no + // primary-key payload. Codex P2 #303. + key := []byte(DDBItemPrefix) + key = append(key, []byte("dA")...) // base64url("t") + key = append(key, []byte("|7|")...) + if _, _, err := parseDDBItemKey(key); !errors.Is(err, ErrDDBMalformedKey) { + t.Fatalf("err=%v want ErrDDBMalformedKey for truncated item key", err) + } +} + func TestDDB_RejectsKeyWithMissingTableSegment(t *testing.T) { t.Parallel() enc, _ := newDDBEncoder(t) From 2c44292fba750de6a43a74a3c97d7c3df09976ff Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:21:00 +0900 Subject: [PATCH 07/20] backup: handle file-vs-directory S3 key collisions (PR #718, round 3) Codex P1 #615: when a bucket contains both `path/to` and `path/to/sub`, the natural-path mapping puts the shorter key at a file path and the longer key requires that path to be a parent directory. POSIX cannot represent both; the prior code would fail with EEXIST/ENOTDIR at MkdirAll/Rename during finalize, breaking the documented leaf-data-suffix rename strategy. Pre-flush computeDirPrefixes() walks every active-gen object key and accumulates each parent prefix (a key "a/b/c" contributes "a" and "a/b"). flushBucketObjects checks whether each object's key is itself a prefix consumed by another active key; if so, flushObjectWithCollision is called with needsLeafDataRename=true, which routes resolveObjectFilename onto the existing S3LeafDataSuffix path. KEYMAP.jsonl records the rename with KindS3LeafData so restore can reverse it. flushObject (the no-collision wrapper) was inlined into flushObjectWithCollision since it was unused after the call sites moved to the explicit-flag form. Test TestS3_FileVsDirectoryKeyCollisionGetsLeafDataRename emits "path/to" and "path/to/sub", asserts: - "path/to/sub" lands at its natural path with body "CHILD" - "path/to" is renamed to "path/to.elastickv-leaf-data" with body "LEAF" - KEYMAP.jsonl carries one record with KindS3LeafData and original "path/to" --- internal/backup/s3.go | 60 +++++++++++++++++++++++++++------ internal/backup/s3_test.go | 69 +++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 14 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index d413d28e5..bc141c3b0 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -445,6 +445,17 @@ func (s *S3Encoder) flushBucket(b *s3BucketState) error { // Split out of flushBucket to keep cyclomatic complexity within the // package cap. func (s *S3Encoder) flushBucketObjects(b *s3BucketState, bucketDir string) (int, error) { + // Pre-compute the set of "directory prefixes" required by the + // union of active-gen object keys: for an object "a/b/c" the + // directory prefixes "a" and "a/b" are mandatory parent dirs on + // the filesystem. An object whose key IS one of those prefixes + // (e.g., bucket holds both "a/b" and "a/b/c") cannot share the + // natural path with the longer key — POSIX requires that path + // be a directory. The design's documented strategy is to rename + // the shorter key to ".elastickv-leaf-data" and record the + // rename in KEYMAP.jsonl so restore can reverse it. Codex P1 + // #615. + dirPrefixes := s.computeDirPrefixes(b) stale := 0 for _, obj := range b.objects { // Suppress objects from older bucket incarnations: when a @@ -463,13 +474,36 @@ func (s *S3Encoder) flushBucketObjects(b *s3BucketState, bucketDir string) (int, } continue } - if err := s.flushObject(b, bucketDir, obj); err != nil { + needsLeafDataRename := dirPrefixes[obj.object] + if err := s.flushObjectWithCollision(b, bucketDir, obj, needsLeafDataRename); err != nil { return stale, err } } return stale, nil } +// computeDirPrefixes returns the set of directory prefixes the union +// of active-gen object keys requires. For object key "a/b/c" the +// prefixes are {"a", "a/b"}. The set is consulted at flush time to +// detect file-vs-directory collisions. +func (s *S3Encoder) computeDirPrefixes(b *s3BucketState) map[string]bool { + out := make(map[string]bool) + for _, obj := range b.objects { + if b.activeGen != 0 && obj.generation != b.activeGen { + continue + } + key := obj.object + // Walk parent directories: split on "/" and accumulate. + for i := 0; i < len(key); i++ { + if key[i] != '/' { + continue + } + out[key[:i]] = true + } + } + return out +} + // closeBucketKeymap closes the per-bucket KEYMAP.jsonl writer (if // opened) and removes the file when no rename was recorded. func closeBucketKeymap(b *s3BucketState) error { @@ -485,11 +519,11 @@ func closeBucketKeymap(b *s3BucketState) error { return nil } -func (s *S3Encoder) flushObject(b *s3BucketState, bucketDir string, obj *s3ObjectState) error { +func (s *S3Encoder) flushObjectWithCollision(b *s3BucketState, bucketDir string, obj *s3ObjectState, needsLeafDataRename bool) error { if obj.manifest == nil { return s.flushOrphanObject(b, bucketDir, obj) } - objectName, kind, err := s.resolveObjectFilename(b, obj) + objectName, kind, err := s.resolveObjectFilename(b, obj, needsLeafDataRename) if err != nil { return err } @@ -598,7 +632,13 @@ func safeJoinUnderRoot(root, rel string) (string, error) { // resolveObjectFilename returns the relative path of the assembled // body within the bucket directory, plus the keymap "kind" when a // rename took place ("" when the object writes at its natural path). -func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState) (string, string, error) { +// +// needsLeafDataRename is set by the caller when another active-gen +// object's key would force this object's natural path to be a +// directory (e.g., bucket holds both "a/b" and "a/b/c"). The shorter +// key is renamed to ".elastickv-leaf-data" and recorded in +// KEYMAP.jsonl with KindS3LeafData. Codex P1 #615. +func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState, needsLeafDataRename bool) (string, string, error) { if strings.HasSuffix(obj.object, S3MetaSuffixReserved) { if !s.renameCollisions { return "", "", errors.Wrapf(ErrS3MetaSuffixCollision, @@ -606,12 +646,12 @@ func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState) } return obj.object + ".user-data", KindMetaCollision, nil } - // Object path taken at face value. Path collisions (`path/to` - // vs `path/to/sub`) are deferred until the master pipeline - // detects them across multiple manifests; this PR's per-object - // flush trusts the caller's collision detection. Path-traversal - // sanitisation runs in safeJoinUnderRoot, downstream of this - // function, where the bucket-directory root is in scope. + if needsLeafDataRename { + return obj.object + S3LeafDataSuffix, KindS3LeafData, nil + } + // Object path taken at face value. Path-traversal sanitisation + // runs in safeJoinUnderRoot, downstream of this function, where + // the bucket-directory root is in scope. return obj.object, "", nil } diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 4f0951127..07d1ced6d 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -452,10 +452,10 @@ func TestS3_DotSegmentObjectKeyRejected(t *testing.T) { } } -// emitObjectAtGen is a helper for cross-generation tests: emits a -// manifest + single chunk under an explicit (gen, uploadID) instead -// of the bucket's active gen. -func emitObjectAtGen(t *testing.T, enc *S3Encoder, bucket string, gen uint64, object, uploadID string, body []byte) { +// emitObjectAtGen is a helper for cross-generation and collision +// tests: emits a manifest + single chunk under an explicit +// (bucket, gen, uploadID). +func emitObjectAtGen(t *testing.T, enc *S3Encoder, bucket string, gen uint64, object, uploadID string, body []byte) { //nolint:unparam // bucket varies in newer tests via this helper t.Helper() if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ "upload_id": uploadID, "size_bytes": int64(len(body)), "parts": []map[string]any{ @@ -504,6 +504,67 @@ func sliceContains(haystack []string, needle string) bool { return false } +// readKeymapFirstRecord reads the per-bucket KEYMAP.jsonl and returns +// the first decoded record. Test helper consolidating the JSON+base64 +// dance so individual tests stay under the cyclop cap. +func readKeymapFirstRecord(t *testing.T, path string) KeymapRecord { + t.Helper() + body, err := os.ReadFile(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + var rec KeymapRecord + if err := json.Unmarshal(bytes.TrimRight(body, "\n"), &rec); err != nil { + t.Fatalf("unmarshal: %v", err) + } + return rec +} + +func TestS3_FileVsDirectoryKeyCollisionGetsLeafDataRename(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + // Two objects whose keys are file-vs-directory siblings: S3 + // permits both, POSIX cannot. Codex P1 #615. + emitObjectAtGen(t, enc, bucket, gen, "path/to", "u1", []byte("LEAF")) + emitObjectAtGen(t, enc, bucket, gen, "path/to/sub", "u2", []byte("CHILD")) + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + if string(readBytesFile(t, filepath.Join(root, "s3", bucket, "path/to/sub"))) != "CHILD" { + t.Fatalf("child body mismatch") + } + if string(readBytesFile(t, filepath.Join(root, "s3", bucket, "path/to.elastickv-leaf-data"))) != "LEAF" { + t.Fatalf("leaf body mismatch") + } + rec := readKeymapFirstRecord(t, filepath.Join(root, "s3", bucket, "KEYMAP.jsonl")) + if rec.Kind != KindS3LeafData { + t.Fatalf("kind=%q", rec.Kind) + } + orig, err := rec.Original() + if err != nil { + t.Fatal(err) + } + if string(orig) != "path/to" { + t.Fatalf("original=%q", orig) + } +} + +func readBytesFile(t *testing.T, path string) []byte { + t.Helper() + b, err := os.ReadFile(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + return b +} + func TestS3_VersionedBlobAssembledByPartVersionOrder(t *testing.T) { t.Parallel() enc, root := newS3Encoder(t) From ba33df8e5a67fb03af1abfc606c82e5f0da67653 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:35:07 +0900 Subject: [PATCH 08/20] backup: validate chunk completeness + reject empty slash segments (PR #718, round 4) Two Codex P1 follow-ups. #729 -- chunk completeness check. assembleObjectBody filtered + sorted whatever chunk files happened to exist and wrote them, but never verified the manifest's declared chunk_count was actually present. A partial / racy / corrupted snapshot would silently emit a truncated body. Added verifyChunkCompleteness which counts chunks per (partNo, partVersion) and asserts the count matches manifest.parts[].chunk_count AND the highest chunkNo equals chunk_count-1. Mismatch surfaces as ErrS3IncompleteBlobChunks before any bytes are written. The declaredParts map's value type changed from struct{} to s3DeclaredPart{chunkCount} to carry the contract. #614 -- empty slash segments. safeJoinUnderRoot rejected `.` and `..` but allowed empty segments ("a//b", "a/", trailing "/"). filepath.Join collapses these, so distinct S3 keys would silently overwrite each other at finalize. The dot-segment guard now also rejects "" segments anywhere except the leading position (where a leading "/" produces an initial empty segment that filepath.Join handles safely under the already-tested absolute-path-confined-under-bucket behaviour). Tests: TestS3_IncompleteChunksRejected, TestS3_EmptySlashSegmentsRejected (covers a//b, a/, /a//, x/). --- internal/backup/s3.go | 108 ++++++++++++++++++++++++++++++++----- internal/backup/s3_test.go | 49 +++++++++++++++++ 2 files changed, 143 insertions(+), 14 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index bc141c3b0..f6d48b72d 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -48,8 +48,57 @@ var ( // ErrS3MetaSuffixCollision is returned when a user object key // collides with the reserved S3MetaSuffixReserved suffix. ErrS3MetaSuffixCollision = errors.New("backup: user S3 object key collides with reserved sidecar suffix") + // ErrS3IncompleteBlobChunks is returned when a manifest declares + // N chunks for some part but the snapshot did not contain all N. + // Without this guard a partial / racy snapshot would silently + // emit a truncated body. Codex P1 #729. + ErrS3IncompleteBlobChunks = errors.New("backup: incomplete blob chunks for manifest-declared part") ) +// verifyChunkCompleteness checks every (partNo, partVersion) entry in +// declaredParts has the right number of chunkNo values present in +// chunks. Chunks are expected at chunkNo in [0, chunk_count); a +// missing index in that range surfaces as +// ErrS3IncompleteBlobChunks rather than letting the assembler emit a +// truncated body. +// +// declaredParts == nil means "no contract to verify" — used by tests +// that pre-date the manifest-parts feature; production callers +// always populate it via HandleObjectManifest. +func verifyChunkCompleteness(chunks []s3ChunkKey, declaredParts map[s3PartKey]s3DeclaredPart) error { + if declaredParts == nil { + return nil + } + // Count present chunks per (partNo, partVersion). + type observed struct { + count uint64 + maxIndex uint64 + } + got := make(map[s3PartKey]observed, len(declaredParts)) + for _, k := range chunks { + pk := s3PartKey{partNo: k.partNo, partVersion: k.partVersion} + o := got[pk] + o.count++ + if k.chunkNo > o.maxIndex { + o.maxIndex = k.chunkNo + } + got[pk] = o + } + for pk, want := range declaredParts { + o := got[pk] + // We accept o.count == want.chunkCount AND + // o.maxIndex == chunkCount-1, because a snapshot with N + // duplicates of chunkNo=0 would satisfy the count check + // alone. Both the count and the highest index must match. + if want.chunkCount > 0 && (o.count != want.chunkCount || o.maxIndex+1 != want.chunkCount) { + return errors.Wrapf(ErrS3IncompleteBlobChunks, + "partNo=%d partVersion=%d declared chunks=%d, observed count=%d maxIndex=%d", + pk.partNo, pk.partVersion, want.chunkCount, o.count, o.maxIndex) + } + } + return nil +} + // S3Encoder emits per-bucket _bucket.json + assembled object bodies + // .elastickv-meta.json sidecars + KEYMAP.jsonl, per the Phase 0 // design (docs/design/2026_04_29_proposed_snapshot_logical_decoder.md). @@ -120,13 +169,13 @@ type s3ObjectState struct { // window) cannot be merged into the active body — Codex P1 #500, // Gemini HIGH #106/#476/#504. uploadID string - // declaredParts is the set of (partNo, partVersion) tuples the - // manifest claims belong to this object. When non-nil, the body - // assembler restricts chunkPaths to entries matching this set — - // Codex P1 #619. nil means "no filter" (used only in tests that - // pre-date the manifest-parts feature; production callers always - // receive a non-nil set via HandleObjectManifest). - declaredParts map[s3PartKey]struct{} + // declaredParts maps each manifest-declared (partNo, partVersion) + // to the metadata the assembler needs to validate completeness + // (chunk_count). When non-nil, the body assembler restricts + // chunkPaths to entries matching the keys AND verifies every + // chunk index in [0, chunk_count) is present — Codex P1 #619 + // (filter) + #729 (completeness). nil means "no filter". + declaredParts map[s3PartKey]s3DeclaredPart // scratchDirCreated avoids the per-blob MkdirAll syscall flagged // by Gemini MEDIUM #285. The scratch directory for this object is // created exactly once on the first HandleBlob call. @@ -146,14 +195,22 @@ type s3ChunkKey struct { // s3PartKey is the manifest-declared part identifier: a (partNo, // partVersion) tuple. ChunkNo is excluded because the manifest's -// per-part chunk_count + chunk_sizes drive how many chunks to expect -// per part, but the manifest doesn't enumerate (chunkNo) tuples -// directly. +// per-part chunk_count drives how many chunks to expect per part; +// that count is stored on the s3DeclaredPart value, not in the key. type s3PartKey struct { partNo uint64 partVersion uint64 } +// s3DeclaredPart captures what the manifest claims for a part: its +// expected chunk_count. assembleObjectBody verifies that one chunk +// per (partNo, partVersion, chunkNo) in [0, chunk_count) actually +// arrived; a missing chunk surfaces as ErrS3IncompleteBlobChunks +// rather than a silently-truncated body (Codex P1 #729). +type s3DeclaredPart struct { + chunkCount uint64 +} + // s3PublicBucket is the dump-format projection of s3BucketMeta. type s3PublicBucket struct { FormatVersion uint32 `json:"format_version"` @@ -312,9 +369,11 @@ func (s *S3Encoder) HandleObjectManifest(key, value []byte) error { // behind by overwrite-then-async-cleanup must NOT be merged // into the body (Codex P1 #619). st.uploadID = live.UploadID - st.declaredParts = make(map[s3PartKey]struct{}, len(live.Parts)) + st.declaredParts = make(map[s3PartKey]s3DeclaredPart, len(live.Parts)) for _, p := range live.Parts { - st.declaredParts[s3PartKey{partNo: p.PartNo, partVersion: p.PartVersion}] = struct{}{} + st.declaredParts[s3PartKey{partNo: p.PartNo, partVersion: p.PartVersion}] = s3DeclaredPart{ + chunkCount: p.ChunkCount, + } } st.chunkPaths = ensureChunkPaths(st.chunkPaths) return nil @@ -608,11 +667,28 @@ func safeJoinUnderRoot(root, rel string) (string, error) { if strings.ContainsRune(rel, 0) { return "", errors.Wrapf(ErrS3MalformedKey, "object name contains NUL: %q", rel) } - for _, seg := range strings.Split(rel, "/") { + // Split on "/" and inspect every segment. S3 treats "a/", "a", + // and "a//b" as three distinct keys, but filepath.Join collapses + // them onto one filesystem path; without explicit rejection, + // distinct user keys would silently overwrite each other at + // finalize (Codex P1 #614). + segs := strings.Split(rel, "/") + for i, seg := range segs { switch seg { case ".", "..": return "", errors.Wrapf(ErrS3MalformedKey, "object name has dot segment %q (S3 treats it literally; rename in S3 first)", rel) + case "": + // A leading "/" produces an initial empty segment + // (segs[0] == ""); filepath.Join handles that case + // safely by stripping the prefix, matching the + // already-tested "absolute path collapses under + // bucket dir" behaviour. Reject empty segments + // anywhere else (mid-path "//" or trailing "/"). + if i != 0 { + return "", errors.Wrapf(ErrS3MalformedKey, + "object name has empty path segment %q", rel) + } } } cleanRoot := filepath.Clean(root) @@ -724,6 +800,10 @@ func assembleObjectBody(outPath string, obj *s3ObjectState) error { // of truth; only its uploadID + declaredParts make it into the // assembled body. chunks := filterChunksForManifest(obj.chunkPaths, obj.uploadID, obj.declaredParts) + if err := verifyChunkCompleteness(chunks, obj.declaredParts); err != nil { + _ = tmp.Close() + return err + } for _, k := range chunks { path := obj.chunkPaths[k] if err := appendFile(tmp, path); err != nil { @@ -751,7 +831,7 @@ func assembleObjectBody(outPath string, obj *s3ObjectState) error { // filter" — used by tests that pre-date these features. Production // callers always pass non-empty/non-nil values via // HandleObjectManifest. -func filterChunksForManifest(m map[s3ChunkKey]string, manifestUploadID string, declaredParts map[s3PartKey]struct{}) []s3ChunkKey { +func filterChunksForManifest(m map[s3ChunkKey]string, manifestUploadID string, declaredParts map[s3PartKey]s3DeclaredPart) []s3ChunkKey { keys := make([]s3ChunkKey, 0, len(m)) for k := range m { if manifestUploadID != "" && k.uploadID != manifestUploadID { diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 07d1ced6d..9a22f786c 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -565,6 +565,55 @@ func readBytesFile(t *testing.T, path string) []byte { return b } +func TestS3_IncompleteChunksRejected(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + object := "obj" + uploadID := "u" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + // Manifest declares 3 chunks for partNo=1 but the snapshot only + // has 2 (chunkNo=0 and chunkNo=2; chunkNo=1 missing). Codex P1 + // #729. + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": 9, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 9, "chunk_count": 3}, + }, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), []byte("AAA")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 2), []byte("CCC")); err != nil { + t.Fatal(err) + } + err := enc.Finalize() + if !errors.Is(err, ErrS3IncompleteBlobChunks) { + t.Fatalf("err=%v want ErrS3IncompleteBlobChunks for missing chunk", err) + } +} + +func TestS3_EmptySlashSegmentsRejected(t *testing.T) { + t.Parallel() + cases := []string{"a//b", "a/", "/a//", "x/"} + for _, key := range cases { + t.Run(key, func(t *testing.T) { + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, key, []byte("x"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for key %q", err, key) + } + }) + } +} + func TestS3_VersionedBlobAssembledByPartVersionOrder(t *testing.T) { t.Parallel() enc, root := newS3Encoder(t) From 2febd423efd678cd10d042b924814837a1cadf4b Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:46:35 +0900 Subject: [PATCH 09/20] backup: reject leading-slash S3 object keys (PR #718, round 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 round 5 (commit 2f87b843): `safeJoinUnderRoot` permitted an empty first segment so leading-slash keys like `/a` were accepted and then normalised by `filepath.Join` to the same output path as `a`. S3 treats `/a` and `a` as two distinct keys (the literal byte '/' is part of the key), so a bucket containing both produced last-flush-wins corruption with no KEYMAP record. The "absolute path collapses safely under bucket dir" comfort the previous behaviour leaned on was false comfort: the collapse silently merged distinct user data. Now empty segments are refused everywhere — leading, mid-path, and trailing — with ErrS3MalformedKey. Operators with leading-slash keys must rename them in S3 first. The previous test `TestS3_AbsolutePathObjectKeyConfinedUnderBucket` (which asserted the wrong behaviour) is replaced by `TestS3_LeadingSlashObjectKeyRejected`. --- internal/backup/s3.go | 21 ++++++++++----------- internal/backup/s3_test.go | 33 ++++++++++++++------------------- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index f6d48b72d..cd49c8c44 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -672,23 +672,22 @@ func safeJoinUnderRoot(root, rel string) (string, error) { // them onto one filesystem path; without explicit rejection, // distinct user keys would silently overwrite each other at // finalize (Codex P1 #614). + // + // Empty segments are rejected wherever they appear — including + // the leading position. A leading "/" produces an initial empty + // segment (segs[0] == "") which filepath.Join would otherwise + // strip, collapsing "/a" onto the same output path as "a". + // Because S3 treats those as two distinct keys, last-flush wins + // and silently overwrites the other (Codex P1 round 5). segs := strings.Split(rel, "/") - for i, seg := range segs { + for _, seg := range segs { switch seg { case ".", "..": return "", errors.Wrapf(ErrS3MalformedKey, "object name has dot segment %q (S3 treats it literally; rename in S3 first)", rel) case "": - // A leading "/" produces an initial empty segment - // (segs[0] == ""); filepath.Join handles that case - // safely by stripping the prefix, matching the - // already-tested "absolute path collapses under - // bucket dir" behaviour. Reject empty segments - // anywhere else (mid-path "//" or trailing "/"). - if i != 0 { - return "", errors.Wrapf(ErrS3MalformedKey, - "object name has empty path segment %q", rel) - } + return "", errors.Wrapf(ErrS3MalformedKey, + "object name has empty path segment %q", rel) } } cleanRoot := filepath.Clean(root) diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 9a22f786c..13e587dc9 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -284,26 +284,21 @@ func TestS3_PathTraversalAttemptRejected(t *testing.T) { } } -func TestS3_AbsolutePathObjectKeyConfinedUnderBucket(t *testing.T) { +func TestS3_LeadingSlashObjectKeyRejected(t *testing.T) { t.Parallel() - // filepath.Join normalises a leading "/" on the second arg, so - // "/etc/host" becomes "/etc/host" — under the bucket - // root, not at filesystem root. This is safe (the user gets a - // surprising-but-confined path) and matches what `aws s3 sync` - // would round-trip back. We assert the safe outcome rather than - // rejecting; rejection would surprise operators with legitimate - // keys whose first byte is '/'. - enc, root := newS3Encoder(t) - emitObject(t, enc, "b", 1, "/etc/host-confined", []byte("ok"), "") - if err := enc.Finalize(); err != nil { - t.Fatal(err) - } - got, err := os.ReadFile(filepath.Join(root, "s3", "b", "etc", "host-confined")) //nolint:gosec - if err != nil { - t.Fatalf("absolute-path key must end up under the bucket dir: %v", err) - } - if string(got) != "ok" { - t.Fatalf("body=%q", got) + // Codex P1 round 5: S3 treats "/a" and "a" as two distinct keys + // (the literal byte '/' is part of the key). filepath.Join would + // silently strip the leading "/" and collapse both onto the same + // output path, so a bucket containing both objects would produce + // last-flush-wins corruption with no KEYMAP record. The encoder + // must refuse any key whose first segment is empty rather than + // "confine and merge" them. Operators with such keys must rename + // in S3 first. + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, "/etc/host-attack", []byte("ok"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for leading-slash key", err) } } From a4fce85ffe56b50032850f40dbebf61e9b446c8e Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 20:56:32 +0900 Subject: [PATCH 10/20] backup: reject backslashes in S3 object keys (PR #718, round 6) Codex P1 round 6 (commit 09c2a0e0): `safeJoinUnderRoot`'s dot-segment scan splits on '/' only, but `filepath.Join` treats '\' as a separator on Windows. A key like `a\..\b` would bypass the dot-segment check on Linux/macOS (where '\' is a literal byte) and then collapse to `b` on Windows, silently merging two distinct S3 keys with no KEYMAP record. Dumps must produce identical output regardless of host OS, so reject '\' on every platform; operators with such keys must rename them in S3 first. Test: TestS3_BackslashObjectKeyRejected covers the dot-segment escape, leading/trailing/embedded backslashes. --- internal/backup/s3.go | 12 ++++++++++++ internal/backup/s3_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index cd49c8c44..a195ea746 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -660,6 +660,14 @@ func (s *S3Encoder) flushOrphanObject(b *s3BucketState, bucketDir string, obj *s // NUL bytes are also refused: POSIX cannot represent them in a // path component, and they have no legitimate meaning in S3 keys // transmitted over HTTP. +// +// Backslashes are refused for the same reason: filepath.Join treats +// '\' as a separator on Windows, so a key like `a\..\b` would bypass +// the '/'-based dot-segment scan below and normalise to `b`, +// silently merging two distinct S3 keys (Codex P1 round 6). Dumps +// must produce identical output regardless of the host OS, so we +// refuse '\' on every platform; operators with such keys must +// rename them in S3 first. func safeJoinUnderRoot(root, rel string) (string, error) { if rel == "" { return "", errors.Wrap(ErrS3MalformedKey, "empty object name") @@ -667,6 +675,10 @@ func safeJoinUnderRoot(root, rel string) (string, error) { if strings.ContainsRune(rel, 0) { return "", errors.Wrapf(ErrS3MalformedKey, "object name contains NUL: %q", rel) } + if strings.ContainsRune(rel, '\\') { + return "", errors.Wrapf(ErrS3MalformedKey, + "object name contains backslash %q (treated as a separator on Windows; rename in S3 first)", rel) + } // Split on "/" and inspect every segment. S3 treats "a/", "a", // and "a//b" as three distinct keys, but filepath.Join collapses // them onto one filesystem path; without explicit rejection, diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 13e587dc9..3bd7b2183 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -284,6 +284,33 @@ func TestS3_PathTraversalAttemptRejected(t *testing.T) { } } +// TestS3_BackslashObjectKeyRejected is the regression for Codex P1 +// round 6: filepath.Join treats '\' as a separator on Windows, so +// keys like `a\..\b` would bypass the '/'-based dot-segment scan +// and normalise to `b`. Dumps must produce identical output +// regardless of host OS, so backslashes are refused on every +// platform. +func TestS3_BackslashObjectKeyRejected(t *testing.T) { + t.Parallel() + cases := []string{ + `a\..\b`, + `leading\path`, + `trailing\`, + `\absolute-windows`, + } + for _, key := range cases { + t.Run(key, func(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, key, []byte("x"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for backslash key %q", err, key) + } + }) + } +} + func TestS3_LeadingSlashObjectKeyRejected(t *testing.T) { t.Parallel() // Codex P1 round 5: S3 treats "/a" and "a" as two distinct keys From 46cb56feb1d165d6d6d131d528aecccb9440ac11 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 21:32:46 +0900 Subject: [PATCH 11/20] backup: close S3 KEYMAP fd + use openSidecarFile (PR #718, round 7) Codex round 9 raised two issues on commit ab38eb0a: 1. P1: closeBucketKeymap leaked file descriptors. recordKeymap stored only the *KeymapWriter; closeBucketKeymap called KeymapWriter.Close() which flushes the bufio buffer but does NOT close the underlying *os.File. A dump producing keymaps for many buckets accumulated descriptors until EMFILE, after which subsequent bucket flushes failed and the dump output was incomplete. Track the *os.File on s3BucketState and close it from closeBucketKeymap alongside the KeymapWriter flush. 2. P2: recordKeymap used os.Create for KEYMAP.jsonl, which follows symlinks and clobbers hard links. The redis encoder already routes through openSidecarFile for the same kind of sidecar; mirror that path so a stale prior run (or local adversary) cannot turn a missing KEYMAP into an arbitrary-write primitive against /etc/passwd or similar. Test: TestS3_KeymapRefusesSymlinkAtFinalize pre-creates KEYMAP.jsonl as a symlink to a bait file, drives a meta-suffix rename (so recordKeymap fires), and asserts both that the finalize returns the symlink-refusal error and that the bait file is untouched. --- internal/backup/s3.go | 38 +++++++++++++++++++++++++++++--------- internal/backup/s3_test.go | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index a195ea746..b2db6f16a 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -147,8 +147,14 @@ type s3BucketState struct { // every object flushes (the prior orphan-warning path covers it). activeGen uint64 objects map[string]*s3ObjectState // keyed by "object\x00generation" - keymap *KeymapWriter - keymapDir string + // keymap / keymapFile / keymapDir are lazily set on the first + // rename. KeymapWriter.Close only flushes the bufio buffer, so + // the *os.File is tracked separately to be closed at finalize — + // otherwise a dump that produces keymaps for many buckets + // accumulates descriptors until EMFILE (Codex P1 round 9). + keymap *KeymapWriter + keymapFile *os.File + keymapDir string // incompleteUploadsJL is opened lazily on the first // !s3|upload|meta or !s3|upload|part record under // --include-incomplete-uploads, then reused for every subsequent @@ -564,18 +570,27 @@ func (s *S3Encoder) computeDirPrefixes(b *s3BucketState) map[string]bool { } // closeBucketKeymap closes the per-bucket KEYMAP.jsonl writer (if -// opened) and removes the file when no rename was recorded. +// opened) and removes the file when no rename was recorded. The +// *os.File is closed separately because KeymapWriter.Close only +// flushes its bufio buffer; without explicit fd close, dumps that +// produce keymaps for many buckets leak descriptors until EMFILE +// (Codex P1 round 9). func closeBucketKeymap(b *s3BucketState) error { if b.keymap == nil { return nil } - if err := b.keymap.Close(); err != nil { - return err + flushErr := b.keymap.Close() + var closeErr error + if b.keymapFile != nil { + closeErr = b.keymapFile.Close() + } + if flushErr == nil && closeErr != nil { + flushErr = errors.WithStack(closeErr) } if b.keymap.Count() == 0 && b.keymapDir != "" { _ = os.Remove(filepath.Join(b.keymapDir, "KEYMAP.jsonl")) } - return nil + return flushErr } func (s *S3Encoder) flushObjectWithCollision(b *s3BucketState, bucketDir string, obj *s3ObjectState, needsLeafDataRename bool) error { @@ -744,12 +759,17 @@ func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState, func (s *S3Encoder) recordKeymap(b *s3BucketState, bucketDir, encodedFilename string, original []byte, kind string) error { if b.keymap == nil { - path := filepath.Join(bucketDir, "KEYMAP.jsonl") - f, err := os.Create(path) //nolint:gosec // path composed from output root + // openSidecarFile (per-platform) refuses both symlinks and + // hard-link clobber attacks. The previous os.Create here + // followed both, leaving an arbitrary-write primitive if a + // stale prior run or local adversary placed a link at the + // path. Codex P2 round 9. + f, err := openSidecarFile(filepath.Join(bucketDir, "KEYMAP.jsonl")) if err != nil { - return errors.WithStack(err) + return err } b.keymap = NewKeymapWriter(f) + b.keymapFile = f b.keymapDir = bucketDir } return b.keymap.WriteOriginal(encodedFilename, original, kind) diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 3bd7b2183..c8ba5328a 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -5,6 +5,7 @@ import ( "encoding/json" "os" "path/filepath" + "strings" "testing" "github.com/bootjp/elastickv/internal/s3keys" @@ -284,6 +285,40 @@ func TestS3_PathTraversalAttemptRejected(t *testing.T) { } } +// TestS3_KeymapRefusesSymlinkAtFinalize is the regression for Codex +// P2 round 9 on PR #718: the S3 encoder's recordKeymap was using +// os.Create directly, which follows symlinks. A bucket whose +// KEYMAP.jsonl path is a pre-existing symlink (from a stale prior +// run or a local adversary) would have its target truncated when +// the first rename was recorded. recordKeymap now goes through +// openSidecarFile, mirroring the redis encoder's guarded open. +func TestS3_KeymapRefusesSymlinkAtFinalize(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "b" + bucketDir := filepath.Join(root, "s3", bucket) + if err := os.MkdirAll(bucketDir, 0o755); err != nil { + t.Fatal(err) + } + bait := filepath.Join(root, "bait-keymap") + if err := os.WriteFile(bait, []byte("stay-out"), 0o600); err != nil { + t.Fatal(err) + } + if err := os.Symlink(bait, filepath.Join(bucketDir, "KEYMAP.jsonl")); err != nil { + t.Fatal(err) + } + // Drive a meta-suffix-collision rename so recordKeymap fires. + enc.WithRenameCollisions(true) + emitObject(t, enc, bucket, 1, "evil.elastickv-meta.json", []byte("payload"), "") + err := enc.Finalize() + if err == nil || !strings.Contains(err.Error(), "refusing to overwrite symlink") { + t.Fatalf("expected symlink-refusal error from KEYMAP open, got %v", err) + } + if got, _ := os.ReadFile(bait); string(got) != "stay-out" { //nolint:gosec // test path + t.Fatalf("bait file written through KEYMAP symlink: %q", got) + } +} + // TestS3_BackslashObjectKeyRejected is the regression for Codex P1 // round 6: filepath.Join treats '\' as a separator on Windows, so // keys like `a\..\b` would bypass the '/'-based dot-segment scan From cd88e6320f2a6662a1fa83899674e60ca30d71cb Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 21:57:07 +0900 Subject: [PATCH 12/20] backup: canonicalize DynamoDB N keys + sort schema arrays (PR #716, round 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 9 raised two issues on commit d67baf8b: 1. P1: ddbKeyAttrToSegment was emitting `EncodeSegment([]byte(v.N))` for numeric primary-key attributes, but DynamoDB N equality is numeric — `"1"` and `"1.0"` (or `"100"` and `"1e2"`, or `"0.5"` and `"5e-1"`) name the same logical item. In migration mode where source and active rows used different decimal text for the same value, both rows survived at distinct paths and restore replayed duplicates. Mirror the live adapter's canonicalNumberString (adapter/dynamodb.go:7651) which uses big.Rat — same canonical form keeps backup filenames in lockstep with the live equality check. 2. P2: schemaToPublic ranged over Go maps for both global_secondary_indexes and attribute_definitions, so identical snapshots produced different `_schema.json` byte output across runs. Sort by name before append. Tests: - TestDDB_CanonicalNumberKeySegment: equivalence pairs ("1"/"1.0", "100"/"1e2", "-0"/"0", "0.5"/"5e-1") collapse to the same key segment. - TestDDB_SchemaJSONIsDeterministic: 32 calls to schemaToPublic on the same schema produce identical attribute_definitions and GSI orders, both matching the documented sort-by-name. --- internal/backup/dynamodb.go | 57 ++++++++++++++-- internal/backup/dynamodb_test.go | 107 +++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 6 deletions(-) diff --git a/internal/backup/dynamodb.go b/internal/backup/dynamodb.go index b83355ad8..40a45dfb1 100644 --- a/internal/backup/dynamodb.go +++ b/internal/backup/dynamodb.go @@ -5,8 +5,10 @@ import ( "encoding/base64" "encoding/json" "fmt" + "math/big" "os" "path/filepath" + "sort" "strconv" "strings" @@ -378,7 +380,18 @@ func ddbKeyAttrToSegment(av *pb.DynamoAttributeValue) (string, error) { case *pb.DynamoAttributeValue_S: return EncodeSegment([]byte(v.S)), nil case *pb.DynamoAttributeValue_N: - return EncodeSegment([]byte(v.N)), nil + // DynamoDB N equality is numeric, not lexical: "1" and + // "1.0" name the same primary-key item, and so do "5e-1" + // and "0.5". Without canonicalisation each text form + // produces a distinct filename, so in migration mode the + // "active generation wins" invariant breaks (both source + // and active rows survive at different paths and restore + // replays duplicates). Mirror the live adapter's + // canonicalNumberString (adapter/dynamodb.go:7651) which + // uses big.Rat — same canonical form keeps filename + // identity in lockstep with the live equality check. + // Codex P1 round 9. + return EncodeSegment([]byte(canonicalDDBNumber(v.N))), nil case *pb.DynamoAttributeValue_B: return EncodeBinarySegment(v.B), nil } @@ -386,6 +399,21 @@ func ddbKeyAttrToSegment(av *pb.DynamoAttributeValue) (string, error) { "primary key has unsupported attribute kind %T", av.GetValue()) } +// canonicalDDBNumber returns the canonical decimal representation of +// a DynamoDB N value. Equivalent inputs (`"1"`, `"1.0"`, `"0.5"`, +// `"5e-1"`, …) collapse to the same string; malformed inputs fall +// through to a trimmed copy so a downstream parse failure surfaces +// the original bytes rather than a silently rewritten value. The +// implementation matches adapter/dynamodb.go canonicalNumberString +// byte-for-byte so backup filenames track live equality. +func canonicalDDBNumber(v string) string { + rat := &big.Rat{} + if _, ok := rat.SetString(strings.TrimSpace(v)); !ok { + return strings.TrimSpace(v) + } + return rat.RatString() +} + // schemaToPublic projects DynamoTableSchema into the AWS-DescribeTable // JSON shape documented in the design. Fields the live record carries // for cluster-internal reasons (key_encoding_version, generation, @@ -407,8 +435,18 @@ func schemaToPublic(s *pb.DynamoTableSchema) ddbPublicSchema { if pk.RangeKey.Name != "" { pk.RangeKey.Type = defs[pk.RangeKey.Name] } - gsis := make([]publicGSI, 0, len(s.GetGlobalSecondaryIndexes())) - for name, gsi := range s.GetGlobalSecondaryIndexes() { + // Build GSI list in deterministic name-sorted order. Ranging over + // the underlying map directly produced a different array order on + // every dump, undermining byte-for-byte reproducibility of + // _schema.json across runs of the same snapshot. Codex P2 round 9. + gsiNames := make([]string, 0, len(s.GetGlobalSecondaryIndexes())) + for name := range s.GetGlobalSecondaryIndexes() { + gsiNames = append(gsiNames, name) + } + sort.Strings(gsiNames) + gsis := make([]publicGSI, 0, len(gsiNames)) + for _, name := range gsiNames { + gsi := s.GetGlobalSecondaryIndexes()[name] g := publicGSI{ Name: name, KeySchema: publicKeySchema{ @@ -426,9 +464,16 @@ func schemaToPublic(s *pb.DynamoTableSchema) ddbPublicSchema { g.Projection.NonKeyAttributes = append([]string{}, gsi.GetProjection().GetNonKeyAttributes()...) gsis = append(gsis, g) } - attrDefs := make([]publicAttributeDefinition, 0, len(defs)) - for name, ty := range defs { - attrDefs = append(attrDefs, publicAttributeDefinition{Name: name, Type: ty}) + // AttributeDefinitions is also sorted by attribute name for the + // same determinism reason. + defNames := make([]string, 0, len(defs)) + for name := range defs { + defNames = append(defNames, name) + } + sort.Strings(defNames) + attrDefs := make([]publicAttributeDefinition, 0, len(defNames)) + for _, name := range defNames { + attrDefs = append(attrDefs, publicAttributeDefinition{Name: name, Type: defs[name]}) } return ddbPublicSchema{ FormatVersion: 1, diff --git a/internal/backup/dynamodb_test.go b/internal/backup/dynamodb_test.go index 3caf78b1d..f88b7f459 100644 --- a/internal/backup/dynamodb_test.go +++ b/internal/backup/dynamodb_test.go @@ -375,6 +375,113 @@ func TestDDB_MigrationSourceGenerationItemsAreEmitted(t *testing.T) { } } +// TestDDB_CanonicalNumberKeySegment is the regression for Codex P1 +// round 9: DynamoDB N equality is numeric, not lexical, but the key +// segment was emitted as `EncodeSegment([]byte(v.N))`. In migration +// mode where source and active rows used different decimal text for +// the same logical N value (e.g. "1" and "1.0"), both rows survived +// at distinct paths and restore replayed duplicates. The encoder +// must canonicalise via big.Rat — same canonical form as the live +// adapter — so equivalent N literals collapse onto the same filename. +func TestDDB_CanonicalNumberKeySegment(t *testing.T) { + t.Parallel() + cases := []struct { + a, b string + }{ + {"1", "1.0"}, + {"100", "1e2"}, + {"-0", "0"}, + {"0.5", "5e-1"}, + } + for _, tc := range cases { + t.Run(tc.a+"_vs_"+tc.b, func(t *testing.T) { + t.Parallel() + gotA, errA := ddbKeyAttrToSegment(nAttr(tc.a)) + gotB, errB := ddbKeyAttrToSegment(nAttr(tc.b)) + if errA != nil || errB != nil { + t.Fatalf("err: %v / %v", errA, errB) + } + if gotA != gotB { + t.Fatalf("equivalent N values must canonicalise to the same segment: %q vs %q -> %q vs %q", + tc.a, tc.b, gotA, gotB) + } + }) + } +} + +// TestDDB_SchemaJSONIsDeterministic is the regression for Codex P2 +// round 9: schemaToPublic ranged over Go maps for both +// global_secondary_indexes and attribute_definitions, so identical +// snapshots produced different `_schema.json` byte output across +// runs. The keys are now sorted before append. +func TestDDB_SchemaJSONIsDeterministic(t *testing.T) { + t.Parallel() + schema := &pb.DynamoTableSchema{ + TableName: "t", + PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, + AttributeDefinitions: map[string]string{ + "zeta": "S", "alpha": "S", "id": "S", "mu": "N", + }, + GlobalSecondaryIndexes: map[string]*pb.DynamoGlobalSecondaryIndex{ + "gZ": {KeySchema: &pb.DynamoKeySchema{HashKey: "zeta"}, Projection: &pb.DynamoGSIProjection{ProjectionType: "ALL"}}, + "gA": {KeySchema: &pb.DynamoKeySchema{HashKey: "alpha"}, Projection: &pb.DynamoGSIProjection{ProjectionType: "ALL"}}, + "gM": {KeySchema: &pb.DynamoKeySchema{HashKey: "mu"}, Projection: &pb.DynamoGSIProjection{ProjectionType: "ALL"}}, + }, + Generation: 1, + } + // Run schemaToPublic many times — Go's randomised map order + // would otherwise produce different array orders across calls. + want := schemaToPublic(schema) + for i := 0; i < 32; i++ { + got := schemaToPublic(schema) + if !attributeDefinitionsEqual(got.AttributeDefinitions, want.AttributeDefinitions) { + t.Fatalf("attribute_definitions order differs across calls: %+v vs %+v", + got.AttributeDefinitions, want.AttributeDefinitions) + } + if !gsiOrderEqual(got.GlobalSecondaryIndexes, want.GlobalSecondaryIndexes) { + t.Fatalf("global_secondary_indexes order differs across calls: %+v vs %+v", + got.GlobalSecondaryIndexes, want.GlobalSecondaryIndexes) + } + } + // Also assert the order itself is the documented sort-by-name. + wantAttrOrder := []string{"alpha", "id", "mu", "zeta"} + for i, ad := range want.AttributeDefinitions { + if ad.Name != wantAttrOrder[i] { + t.Fatalf("attribute_definitions[%d].Name = %q want %q", i, ad.Name, wantAttrOrder[i]) + } + } + wantGSIOrder := []string{"gA", "gM", "gZ"} + for i, g := range want.GlobalSecondaryIndexes { + if g.Name != wantGSIOrder[i] { + t.Fatalf("global_secondary_indexes[%d].Name = %q want %q", i, g.Name, wantGSIOrder[i]) + } + } +} + +func attributeDefinitionsEqual(a, b []publicAttributeDefinition) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func gsiOrderEqual(a, b []publicGSI) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i].Name != b[i].Name { + return false + } + } + return true +} + func TestDDB_NewGenerationWinsOverMigrationSourceForSameKey(t *testing.T) { t.Parallel() enc, root := newDDBEncoder(t) From 90d33febbdcf2457cc696a50a298e328da940908 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 22:01:29 +0900 Subject: [PATCH 13/20] backup: rename-target collision check + populate last_modified (PR #718, round 8) Codex round 9 raised three issues on commit 1dc68842: 1. P1: Leaf-data rename collided with real user keys. `needsLeafDataRename` rewrote `` to `.elastickv-leaf-data` without checking whether a real key with that suffix already existed in the same bucket. Two distinct objects could be mapped to one filesystem path and finalize was last-flush-wins. resolveObjectFilename now consults a per-bucket set of active-gen object keys via computeActiveGenObjectKeys; if the rename target is a real key, surface ErrS3MetaSuffixCollision. 2. P1: Meta-suffix rename collided too. Same root cause: rename-collisions mode rewrote `.elastickv-meta.json` to `.elastickv-meta.json.user-data`. If that suffixed key was itself a real object, the rename target collided. The same set lookup now refuses the rename and surfaces ErrS3MetaSuffixCollision. 3. P2: last_modified was never populated. The decoded s3LiveManifest carried last_modified_hlc, but the s3PublicManifest projection silently dropped it, so every exported sidecar lost the HEAD-visible Last-Modified timestamp. formatHLCAsRFC3339Nano extracts the millisecond half (HLC >> 16, see kv/hlc.go) and renders RFC3339Nano UTC, matching S3 HEAD semantics. HLC zero maps to "" so omitempty drops the field rather than emitting "1970-01-01T00:00:00Z". Tests: - TestS3_LeafDataRenameRejectsCollidingUserKey - TestS3_MetaSuffixRenameRejectsCollidingUserKey - TestS3_LastModifiedSidecarPopulated --- internal/backup/s3.go | 75 ++++++++++++++++++++++++++++--- internal/backup/s3_test.go | 90 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 6 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index b2db6f16a..87d34fb71 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -7,6 +7,7 @@ import ( "path/filepath" "sort" "strings" + "time" "github.com/bootjp/elastickv/internal/s3keys" "github.com/cockroachdb/errors" @@ -250,6 +251,28 @@ type s3PublicManifest struct { // just enough to decode the JSON value. Fields the dump format // drops are still parsed (so unknown-fields default-tolerance is // preserved) but elided from the public sidecar. +// formatHLCAsRFC3339Nano renders the millisecond half of an HLC +// (the upper 48 bits, see kv/hlc.go) as an RFC3339Nano UTC string +// for the `last_modified` sidecar field. Restore tools compare +// these timestamps to S3 HEAD `Last-Modified` semantics, which is +// millisecond-resolution UTC. HLC zero (no last_modified_hlc on +// the live record) maps to "" so omitempty drops the field rather +// than emitting "1970-01-01T00:00:00Z" — which would mislead +// consumers about the object's age. Codex P2 round 9. +func formatHLCAsRFC3339Nano(hlc uint64) string { + if hlc == 0 { + return "" + } + ms := int64(hlc >> hlcLogicalBitsForBackupS3) //nolint:gosec // bit-shift is safe; HLC is bounded + return time.UnixMilli(ms).UTC().Format(time.RFC3339Nano) +} + +// hlcLogicalBitsForBackupS3 mirrors kv/hlc.go:hlcLogicalBits. We keep +// the literal here (and in a single place via this name) rather than +// importing the kv package because the backup package is meant to +// stay decoupled from the live cluster's internals. +const hlcLogicalBitsForBackupS3 = 16 + type s3LiveManifest struct { UploadID string `json:"upload_id"` ETag string `json:"etag"` @@ -361,6 +384,7 @@ func (s *S3Encoder) HandleObjectManifest(key, value []byte) error { FormatVersion: 1, ETag: live.ETag, SizeBytes: live.SizeBytes, + LastModified: formatHLCAsRFC3339Nano(live.LastModifiedHLC), ContentType: live.ContentType, ContentEncoding: live.ContentEncoding, CacheControl: live.CacheControl, @@ -521,6 +545,7 @@ func (s *S3Encoder) flushBucketObjects(b *s3BucketState, bucketDir string) (int, // rename in KEYMAP.jsonl so restore can reverse it. Codex P1 // #615. dirPrefixes := s.computeDirPrefixes(b) + objectKeys := s.computeActiveGenObjectKeys(b) stale := 0 for _, obj := range b.objects { // Suppress objects from older bucket incarnations: when a @@ -540,13 +565,30 @@ func (s *S3Encoder) flushBucketObjects(b *s3BucketState, bucketDir string) (int, continue } needsLeafDataRename := dirPrefixes[obj.object] - if err := s.flushObjectWithCollision(b, bucketDir, obj, needsLeafDataRename); err != nil { + if err := s.flushObjectWithCollision(b, bucketDir, obj, needsLeafDataRename, objectKeys); err != nil { return stale, err } } return stale, nil } +// computeActiveGenObjectKeys returns the set of every active-gen +// object key in the bucket. resolveObjectFilename consults this set +// so a rename target (`.user-data` or `.elastickv-leaf-data`) that +// happens to match a real object key surfaces an error instead of +// silently merging two distinct objects onto one filesystem path +// (Codex P1 round 9). +func (s *S3Encoder) computeActiveGenObjectKeys(b *s3BucketState) map[string]bool { + out := make(map[string]bool, len(b.objects)) + for _, obj := range b.objects { + if b.activeGen != 0 && obj.generation != b.activeGen { + continue + } + out[obj.object] = true + } + return out +} + // computeDirPrefixes returns the set of directory prefixes the union // of active-gen object keys requires. For object key "a/b/c" the // prefixes are {"a", "a/b"}. The set is consulted at flush time to @@ -593,11 +635,11 @@ func closeBucketKeymap(b *s3BucketState) error { return flushErr } -func (s *S3Encoder) flushObjectWithCollision(b *s3BucketState, bucketDir string, obj *s3ObjectState, needsLeafDataRename bool) error { +func (s *S3Encoder) flushObjectWithCollision(b *s3BucketState, bucketDir string, obj *s3ObjectState, needsLeafDataRename bool, objectKeys map[string]bool) error { if obj.manifest == nil { return s.flushOrphanObject(b, bucketDir, obj) } - objectName, kind, err := s.resolveObjectFilename(b, obj, needsLeafDataRename) + objectName, kind, err := s.resolveObjectFilename(b, obj, needsLeafDataRename, objectKeys) if err != nil { return err } @@ -740,16 +782,37 @@ func safeJoinUnderRoot(root, rel string) (string, error) { // directory (e.g., bucket holds both "a/b" and "a/b/c"). The shorter // key is renamed to ".elastickv-leaf-data" and recorded in // KEYMAP.jsonl with KindS3LeafData. Codex P1 #615. -func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState, needsLeafDataRename bool) (string, string, error) { +// +// objectKeys is the set of every active-gen object key in the bucket +// (including obj.object itself). Both rename strategies — meta-suffix +// `.user-data` and leaf-data `.elastickv-leaf-data` — must refuse to +// emit if their target collides with an existing real object key in +// the same bucket: otherwise two distinct keys would map to one +// filesystem path and finalize would last-flush-wins one of them +// without a KEYMAP record that could reverse the merge. Codex P1 +// round 9. +func (s *S3Encoder) resolveObjectFilename(b *s3BucketState, obj *s3ObjectState, needsLeafDataRename bool, objectKeys map[string]bool) (string, string, error) { if strings.HasSuffix(obj.object, S3MetaSuffixReserved) { if !s.renameCollisions { return "", "", errors.Wrapf(ErrS3MetaSuffixCollision, "bucket %q object %q", b.name, obj.object) } - return obj.object + ".user-data", KindMetaCollision, nil + target := obj.object + ".user-data" + if objectKeys[target] { + return "", "", errors.Wrapf(ErrS3MetaSuffixCollision, + "bucket %q object %q rename target %q is also a real object key (rename in S3 first)", + b.name, obj.object, target) + } + return target, KindMetaCollision, nil } if needsLeafDataRename { - return obj.object + S3LeafDataSuffix, KindS3LeafData, nil + target := obj.object + S3LeafDataSuffix + if objectKeys[target] { + return "", "", errors.Wrapf(ErrS3MetaSuffixCollision, + "bucket %q object %q leaf-data rename target %q is also a real object key (rename in S3 first)", + b.name, obj.object, target) + } + return target, KindS3LeafData, nil } // Object path taken at face value. Path-traversal sanitisation // runs in safeJoinUnderRoot, downstream of this function, where diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index c8ba5328a..95684f55b 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/bootjp/elastickv/internal/s3keys" "github.com/cockroachdb/errors" @@ -285,6 +286,95 @@ func TestS3_PathTraversalAttemptRejected(t *testing.T) { } } +// TestS3_LeafDataRenameRejectsCollidingUserKey is the regression for +// Codex P1 round 9: when a bucket holds both `path/to` and +// `path/to/sub`, the leaf-data rename strategy rewrites `path/to` to +// `path/to.elastickv-leaf-data`. If a third real object key +// `path/to.elastickv-leaf-data` also exists in the same bucket, the +// rename target collides and finalize would last-flush-wins one of +// the two distinct objects without a KEYMAP record. resolveObjectFilename +// now refuses the rename and surfaces ErrS3MetaSuffixCollision. +func TestS3_LeafDataRenameRejectsCollidingUserKey(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + emitObject(t, enc, "b", 1, "path/to", []byte("a"), "") + emitObject(t, enc, "b", 1, "path/to/sub", []byte("b"), "") + emitObject(t, enc, "b", 1, "path/to.elastickv-leaf-data", []byte("c"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MetaSuffixCollision) { + t.Fatalf("err=%v want ErrS3MetaSuffixCollision (leaf-data rename target collides with real key)", err) + } +} + +// TestS3_MetaSuffixRenameRejectsCollidingUserKey is the regression +// for Codex P1 round 9 (sibling case): rename-collisions mode rewrites +// `evil.elastickv-meta.json` to `evil.elastickv-meta.json.user-data`. +// If `evil.elastickv-meta.json.user-data` is itself a real key in +// the same bucket the rename target collides and one object is +// silently lost. The rename now refuses with ErrS3MetaSuffixCollision. +func TestS3_MetaSuffixRenameRejectsCollidingUserKey(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + enc.WithRenameCollisions(true) + emitObject(t, enc, "b", 1, "evil.elastickv-meta.json", []byte("a"), "") + emitObject(t, enc, "b", 1, "evil.elastickv-meta.json.user-data", []byte("b"), "") + err := enc.Finalize() + if !errors.Is(err, ErrS3MetaSuffixCollision) { + t.Fatalf("err=%v want ErrS3MetaSuffixCollision (meta-suffix rename target collides)", err) + } +} + +// TestS3_LastModifiedSidecarPopulated is the regression for Codex P2 +// round 9: the live manifest's last_modified_hlc was being decoded +// but never copied into s3PublicManifest.LastModified. The sidecar +// JSON now carries the millisecond half of the HLC formatted as +// RFC3339Nano UTC, matching S3 HEAD `Last-Modified` semantics. +func TestS3_LastModifiedSidecarPopulated(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + object := "obj" + uploadID := "u" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + const ms = uint64(1_745_884_800_000) // 2025-04-28T20:00:00Z + hlc := ms << 16 + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, + "size_bytes": 4, + "last_modified_hlc": hlc, + "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 4, "chunk_count": 1}, + }, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), []byte("data")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + sidecarPath := filepath.Join(root, "s3", bucket, object+S3MetaSuffixReserved) + raw, err := os.ReadFile(sidecarPath) //nolint:gosec // test path + if err != nil { + t.Fatal(err) + } + var sidecar map[string]any + if err := json.Unmarshal(raw, &sidecar); err != nil { + t.Fatal(err) + } + got, _ := sidecar["last_modified"].(string) + want := time.UnixMilli(int64(ms)).UTC().Format(time.RFC3339Nano) //nolint:gosec // test fixture + if got != want { + t.Fatalf("last_modified = %q want %q", got, want) + } +} + // TestS3_KeymapRefusesSymlinkAtFinalize is the regression for Codex // P2 round 9 on PR #718: the S3 encoder's recordKeymap was using // os.Create directly, which follows symlinks. A bucket whose From d016ea7196b559f587f9a28ec8ab14a98b7cc2c0 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 22:13:33 +0900 Subject: [PATCH 14/20] backup: refuse dot-segment scratch paths in HandleBlob (PR #718, round 9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 round 11 (commit 9a63e326): `HandleBlob` composed scratch paths with `filepath.Join(s.scratchRoot, EncodeSegment([]byte(bucket)), EncodeSegment([]byte(object)))`. EncodeSegment uses the RFC3986 unreserved set (ALPHA/DIGIT/-/./_) — `/` is percent-encoded but `.` is preserved, so the literal segment `..` survives unchanged. A crafted `bucket=".."` and/or `object=".."` would resolve to `/../...`, letting `writeFileAtomic` land outside the decoder's controlled directory before `safeJoinUnderRoot` runs at output time. Add `scratchDirForBlob` which rejects `.` / `..` / "" bucket and object literals at the encoder boundary so the spill-to-disk step inherits the same containment invariant the final output path enforces. Apply the same guard to `flushOrphanObject` which shared the failure mode under `--include-orphans`. (Multi-segment dot keys like `a/../b` continue to be caught at Finalize via `safeJoinUnderRoot` because EncodeSegment keeps the whole key in one filename segment that splits cleanly there.) Tests: - TestS3_HandleBlobRejectsScratchEscape: 5 sub-cases covering bucket/object/both variants of `.`/`..` literals. - TestS3_DotSegmentObjectKeyRejected updated to allow either HandleBlob or Finalize to surface ErrS3MalformedKey, since sole-dot keys are now caught earlier. --- internal/backup/s3.go | 40 +++++++++++++++++++-- internal/backup/s3_test.go | 71 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 4 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index 87d34fb71..a8d5bc247 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -411,14 +411,25 @@ func (s *S3Encoder) HandleObjectManifest(key, value []byte) error { // HandleBlob spills a !s3|blob| record to a per-chunk scratch file // and registers it under the (bucket, object, gen, uploadID, partNo, -// chunkNo, partVersion) routing key. +// chunkNo, partVersion) routing key. EncodeSegment percent-encodes +// `/` so a multi-segment object key like `../../tmp/pwn` collapses +// into one filename, but a literal `..` (or `.`) survives unchanged +// because both `.` chars are RFC3986-unreserved. Without explicit +// validation, a crafted bucket+object pair like `bucket="..", +// object=".."` would resolve to filepath.Join(scratchRoot, "..", +// "..") = the parent of scratchRoot, letting writeFileAtomic +// land outside the decoder's controlled directory before +// safeJoinUnderRoot ever runs at output time. Codex P1 round 11. func (s *S3Encoder) HandleBlob(key, value []byte) error { bucket, gen, object, uploadID, partNo, chunkNo, partVersion, ok := s3keys.ParseBlobKey(key) if !ok { return errors.Wrapf(ErrS3MalformedKey, "blob key: %q", key) } st := s.objectState(bucket, gen, object) - dir := filepath.Join(s.scratchRoot, EncodeSegment([]byte(bucket)), EncodeSegment([]byte(object))) + dir, err := scratchDirForBlob(s.scratchRoot, bucket, object) + if err != nil { + return err + } if !st.scratchDirCreated { if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode return errors.WithStack(err) @@ -434,6 +445,27 @@ func (s *S3Encoder) HandleBlob(key, value []byte) error { return nil } +// scratchDirForBlob builds the per-(bucket,object) scratch path and +// validates it stays under scratchRoot. A bucket or object name of +// `.` / `..` would let `filepath.Join` resolve out of scratchRoot +// before anything else gets a chance to refuse the key. Reject the +// dot-component case at the encoder boundary so the spill-to-disk +// step inherits the same containment invariant the final output +// path enforces via safeJoinUnderRoot. +func scratchDirForBlob(scratchRoot, bucket, object string) (string, error) { + for _, seg := range [...]string{bucket, object} { + switch seg { + case ".", "..": + return "", errors.Wrapf(ErrS3MalformedKey, + "bucket or object key %q is a dot segment (would escape scratch root)", seg) + case "": + return "", errors.Wrapf(ErrS3MalformedKey, + "bucket or object key is empty (cannot construct scratch path)") + } + } + return filepath.Join(scratchRoot, EncodeSegment([]byte(bucket)), EncodeSegment([]byte(object))), nil +} + // HandleIncompleteUpload routes !s3|upload|meta|/!s3|upload|part| // records to /_incomplete_uploads/records.jsonl when the // include flag is on; otherwise drops them. @@ -682,6 +714,10 @@ func (s *S3Encoder) flushOrphanObject(b *s3BucketState, bucketDir string, obj *s if len(obj.chunkPaths) == 0 { return nil } + if obj.object == "." || obj.object == ".." || obj.object == "" { + return errors.Wrapf(ErrS3MalformedKey, + "orphan object key %q is a dot segment (would escape orphan dir)", obj.object) + } dir := filepath.Join(bucketDir, "_orphans", EncodeSegment([]byte(obj.object))) if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode return errors.WithStack(err) diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 95684f55b..fa0b0a118 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -584,14 +584,81 @@ func TestS3_StalePartVersionExcludedFromAssembledBody(t *testing.T) { } } +// TestS3_HandleBlobRejectsScratchEscape is the regression for Codex +// P1 round 11: HandleBlob composed scratch paths with EncodeSegment, +// which preserves `.` and `..` (RFC3986 unreserved). A bucket or +// object literal of `..` would resolve to `/../...`, +// letting writeFileAtomic land outside the decoder's scratch tree +// before safeJoinUnderRoot ever ran at Finalize. The encoder now +// refuses dot-component bucket/object names at HandleBlob. +func TestS3_HandleBlobRejectsScratchEscape(t *testing.T) { + t.Parallel() + cases := []struct { + name string + bucket, object string + }{ + {"object_dotdot", "b", ".."}, + {"object_dot", "b", "."}, + {"bucket_dotdot", "..", "x"}, + {"bucket_dot", ".", "x"}, + {"both_dotdot", "..", ".."}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + err := enc.HandleBlob( + s3keys.BlobKey(tc.bucket, 1, tc.object, "u-1", 1, 0), + []byte("payload"), + ) + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for bucket=%q object=%q", err, tc.bucket, tc.object) + } + }) + } +} + func TestS3_DotSegmentObjectKeyRejected(t *testing.T) { t.Parallel() cases := []string{"a/../b", "a/./b", "..", "."} for _, key := range cases { t.Run(key, func(t *testing.T) { + t.Parallel() enc, _ := newS3Encoder(t) - emitObject(t, enc, "b", 1, key, []byte("x"), "") - err := enc.Finalize() + // Refusal must happen at OR BEFORE Finalize. The + // scratch-path guard (Codex P1 round 11) catches sole- + // dot keys at HandleBlob time; multi-segment dot keys + // like "a/../b" pass through to Finalize where + // safeJoinUnderRoot rejects them. Either point is + // acceptable as long as ErrS3MalformedKey surfaces. + err := enc.HandleBucketMeta( + s3keys.BucketMetaKey("b"), + encodeS3BucketMetaValue(t, map[string]any{"bucket_name": "b", "generation": 1}), + ) + if err != nil { + t.Fatalf("HandleBucketMeta: %v", err) + } + err = enc.HandleObjectManifest( + s3keys.ObjectManifestKey("b", 1, key), + encodeS3ManifestValue(t, map[string]any{ + "upload_id": "u-1", "size_bytes": int64(1), + "parts": []map[string]any{{"part_no": 1, "size_bytes": int64(1), "chunk_count": 1}}, + }), + ) + if err != nil { + if errors.Is(err, ErrS3MalformedKey) { + return + } + t.Fatalf("HandleObjectManifest: %v", err) + } + err = enc.HandleBlob(s3keys.BlobKey("b", 1, key, "u-1", 1, 0), []byte("x")) + if err != nil { + if errors.Is(err, ErrS3MalformedKey) { + return + } + t.Fatalf("HandleBlob: %v", err) + } + err = enc.Finalize() if !errors.Is(err, ErrS3MalformedKey) { t.Fatalf("err=%v want ErrS3MalformedKey for key %q", err, key) } From 4505df353ddb2d08b1693f0da7e053406fb62f41 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 22:31:16 +0900 Subject: [PATCH 15/20] backup: set-based chunk completeness check (PR #718, round 10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 round 12 (commit e91f0866): `verifyChunkCompleteness` only checked `count` and `maxIndex` per (partNo, partVersion). For declared `chunk_count=3`, observed chunk numbers `{0, 2, 2}` satisfy `count == 3` and `maxIndex+1 == 3` while chunk_no=1 is absent — assembleObjectBody would then emit a corrupted body silently. Replace the dual-threshold heuristic with a set-membership check: track every observed chunk index per (partNo, partVersion) and verify the set is exactly `{0, 1, …, chunk_count-1}`. Both the unique-count guard and a per-index lookup surface ErrS3IncompleteBlobChunks before the assembler runs. Test: - TestS3_DuplicateChunksWithMissingIndexRejected: drives the `{0, 2, 2}` shape, asserts the new validator rejects it. - TestS3_IncompleteChunksRejected (existing) still passes against the new code path. --- internal/backup/s3.go | 49 +++++++++++++++++++++----------------- internal/backup/s3_test.go | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index a8d5bc247..700422959 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -57,12 +57,18 @@ var ( ) // verifyChunkCompleteness checks every (partNo, partVersion) entry in -// declaredParts has the right number of chunkNo values present in -// chunks. Chunks are expected at chunkNo in [0, chunk_count); a -// missing index in that range surfaces as +// declaredParts has exactly the set of chunkNo values {0, 1, …, +// chunk_count-1} present in chunks. Chunks are expected at chunkNo in +// [0, chunk_count); a missing index in that range surfaces as // ErrS3IncompleteBlobChunks rather than letting the assembler emit a // truncated body. // +// We track the actual set of seen chunk indexes (not just count and +// maxIndex) because count + maxIndex alone admits false positives: +// for declared chunk_count=3, observed `{0, 2, 2}` produces count=3 +// and maxIndex=2 but is missing chunkNo=1, which would silently +// assemble a corrupted body. Codex P1 round 12. +// // declaredParts == nil means "no contract to verify" — used by tests // that pre-date the manifest-parts feature; production callers // always populate it via HandleObjectManifest. @@ -70,31 +76,30 @@ func verifyChunkCompleteness(chunks []s3ChunkKey, declaredParts map[s3PartKey]s3 if declaredParts == nil { return nil } - // Count present chunks per (partNo, partVersion). - type observed struct { - count uint64 - maxIndex uint64 - } - got := make(map[s3PartKey]observed, len(declaredParts)) + got := make(map[s3PartKey]map[uint64]struct{}, len(declaredParts)) for _, k := range chunks { pk := s3PartKey{partNo: k.partNo, partVersion: k.partVersion} - o := got[pk] - o.count++ - if k.chunkNo > o.maxIndex { - o.maxIndex = k.chunkNo + if got[pk] == nil { + got[pk] = make(map[uint64]struct{}) } - got[pk] = o + got[pk][k.chunkNo] = struct{}{} } for pk, want := range declaredParts { - o := got[pk] - // We accept o.count == want.chunkCount AND - // o.maxIndex == chunkCount-1, because a snapshot with N - // duplicates of chunkNo=0 would satisfy the count check - // alone. Both the count and the highest index must match. - if want.chunkCount > 0 && (o.count != want.chunkCount || o.maxIndex+1 != want.chunkCount) { + if want.chunkCount == 0 { + continue + } + seen := got[pk] + if uint64(len(seen)) != want.chunkCount { //nolint:gosec // bounded return errors.Wrapf(ErrS3IncompleteBlobChunks, - "partNo=%d partVersion=%d declared chunks=%d, observed count=%d maxIndex=%d", - pk.partNo, pk.partVersion, want.chunkCount, o.count, o.maxIndex) + "partNo=%d partVersion=%d declared chunks=%d, observed unique=%d", + pk.partNo, pk.partVersion, want.chunkCount, len(seen)) + } + for i := uint64(0); i < want.chunkCount; i++ { + if _, ok := seen[i]; !ok { + return errors.Wrapf(ErrS3IncompleteBlobChunks, + "partNo=%d partVersion=%d declared chunks=%d, missing chunkNo=%d", + pk.partNo, pk.partVersion, want.chunkCount, i) + } } } return nil diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index fa0b0a118..2847d41c1 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -779,6 +779,49 @@ func readBytesFile(t *testing.T, path string) []byte { return b } +// TestS3_DuplicateChunksWithMissingIndexRejected is the regression +// for Codex P1 round 12: the previous count-and-maxIndex check +// admitted false positives. For declared chunk_count=3, observed +// `{0, 2, 2}` produced count=3 and maxIndex=2 satisfying both +// thresholds while chunk_no=1 was missing. The set-based check now +// detects the absent index and surfaces ErrS3IncompleteBlobChunks +// before assembleObjectBody can emit a corrupted body. +func TestS3_DuplicateChunksWithMissingIndexRejected(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + object := "obj" + uploadID := "u" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": 9, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 9, "chunk_count": 3}, + }, + })); err != nil { + t.Fatal(err) + } + // Drive `{0, 2, 2}`: count satisfies, maxIndex satisfies, but + // chunk_no=1 is missing. The set-based validator must reject. + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), []byte("AAA")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 2), []byte("CC1")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 2), []byte("CC2")); err != nil { + t.Fatal(err) + } + err := enc.Finalize() + if !errors.Is(err, ErrS3IncompleteBlobChunks) { + t.Fatalf("err=%v want ErrS3IncompleteBlobChunks for chunk-set with duplicate+missing", err) + } +} + func TestS3_IncompleteChunksRejected(t *testing.T) { t.Parallel() enc, _ := newS3Encoder(t) From 577898aee71667185c5361a280a193148808da54 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 30 Apr 2026 22:36:54 +0900 Subject: [PATCH 16/20] backup: refuse dot-segment DynamoDB key filenames (PR #716, round 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 round 12 (commit 34ea5bad): `writeDDBItem` joined `itemsDir` with the encoded hash/range key filename. EncodeSegment treats `.` and `..` as RFC3986-unreserved and preserves them, so a DynamoDB S/N key value of "." or ".." flows through as a single-character filename. filepath.Join then either collapses `/.` back to itemsDir or resolves `/..` to the parent — letting an item like hash=".." range="_schema" overwrite the table-level _schema.json. `refuseDotSegmentFilename` rejects sole-dot encoded segments for both hash and range keys; surfaces ErrDDBInvalidItem so existing callers branching on errors.Is keep their semantics. Test: TestDDB_DotSegmentKeyRejected covers all four shapes (hash="." / hash=".." / range="." / range=".."). --- internal/backup/dynamodb.go | 26 +++++++++++++++ internal/backup/dynamodb_test.go | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/internal/backup/dynamodb.go b/internal/backup/dynamodb.go index 40a45dfb1..f2c384557 100644 --- a/internal/backup/dynamodb.go +++ b/internal/backup/dynamodb.go @@ -332,6 +332,15 @@ func parseDDBItemKey(key []byte) (string, uint64, error) { // A missing hash-key attribute on an item is a structural error (the // item could never have been GetItem-able without one) and surfaces // as ErrDDBInvalidItem. +// +// The encoded hash/range filename segments may legitimately be "." or +// ".." (DynamoDB S/N keys can hold any string, and EncodeSegment +// preserves both `.` chars as RFC3986-unreserved). filepath.Join +// would then either collapse `/.` back to itemsDir or +// resolve `/..` to the parent — letting an item like +// hash=".." range="_schema" overwrite the table-level _schema.json. +// Reject sole-dot segments at this boundary so the items/ subtree +// cannot escape via key-controlled paths. Codex P1 round 12. func writeDDBItem(itemsDir, hashKey, rangeKey string, item *pb.DynamoItem) error { attrs := item.GetAttributes() hashVal, ok := attrs[hashKey] @@ -343,6 +352,9 @@ func writeDDBItem(itemsDir, hashKey, rangeKey string, item *pb.DynamoItem) error if err != nil { return err } + if err := refuseDotSegmentFilename(hashFilename, "hash"); err != nil { + return err + } publicItem := itemToPublic(item) body, err := json.MarshalIndent(publicItem, "", " ") if err != nil { @@ -360,6 +372,9 @@ func writeDDBItem(itemsDir, hashKey, rangeKey string, item *pb.DynamoItem) error if err != nil { return err } + if err := refuseDotSegmentFilename(rangeFilename, "range"); err != nil { + return err + } dir := filepath.Join(itemsDir, hashFilename) if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode return errors.WithStack(err) @@ -367,6 +382,17 @@ func writeDDBItem(itemsDir, hashKey, rangeKey string, item *pb.DynamoItem) error return writeFileAtomic(filepath.Join(dir, rangeFilename+".json"), body) } +// refuseDotSegmentFilename blocks hash/range segments that filepath +// resolution would collapse or escape on (".", ".."). Both are +// reachable from valid DynamoDB N/S key values. +func refuseDotSegmentFilename(seg, role string) error { + if seg == "." || seg == ".." { + return errors.Wrapf(ErrDDBInvalidItem, + "%s-key segment %q is a dot path (would escape items dir)", role, seg) + } + return nil +} + // ddbKeyAttrToSegment encodes a primary-key attribute (S, N, or B) to // a filesystem-safe segment. Per the design, S and N take the standard // EncodeSegment path; B takes EncodeBinarySegment so binary keys never diff --git a/internal/backup/dynamodb_test.go b/internal/backup/dynamodb_test.go index f88b7f459..fdc772e4d 100644 --- a/internal/backup/dynamodb_test.go +++ b/internal/backup/dynamodb_test.go @@ -375,6 +375,61 @@ func TestDDB_MigrationSourceGenerationItemsAreEmitted(t *testing.T) { } } +// TestDDB_DotSegmentKeyRejected is the regression for Codex P1 +// round 12: DynamoDB S/N key values can legitimately hold "." or +// "..". EncodeSegment preserves both as RFC3986-unreserved, so the +// resulting filename would let filepath.Join collapse / escape the +// items/ subtree — an item with hash=".." range="_schema" would be +// written as `
/_schema.json`, overwriting the schema sidecar. +// writeDDBItem now refuses sole-dot encoded segments for both +// hash and range keys. +func TestDDB_DotSegmentKeyRejected(t *testing.T) { + t.Parallel() + cases := []struct { + name string + hashAttr *pb.DynamoAttributeValue + rangeAttr *pb.DynamoAttributeValue + }{ + {"hash_dot", sAttr("."), nil}, + {"hash_dotdot", sAttr(".."), nil}, + {"range_dot", sAttr("ok"), sAttr(".")}, + {"range_dotdot", sAttr("ok"), sAttr("..")}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "t", + PrimaryKey: &pb.DynamoKeySchema{HashKey: "h", RangeKey: ""}, + AttributeDefinitions: map[string]string{"h": "S", "r": "S"}, + Generation: 1, + } + if tc.rangeAttr != nil { + schema.PrimaryKey.RangeKey = "r" + } + attrs := map[string]*pb.DynamoAttributeValue{"h": tc.hashAttr} + hashRaw := tc.hashAttr.GetS() + rangeRaw := "" + if tc.rangeAttr != nil { + attrs["r"] = tc.rangeAttr + rangeRaw = tc.rangeAttr.GetS() + } + item := &pb.DynamoItem{Attributes: attrs} + if err := enc.HandleItem(EncodeDDBItemKey("t", 1, hashRaw, rangeRaw), encodeItemValue(t, item)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("t"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + err := enc.Finalize() + if !errors.Is(err, ErrDDBInvalidItem) { + t.Fatalf("err=%v want ErrDDBInvalidItem for dot-segment key", err) + } + }) + } +} + // TestDDB_CanonicalNumberKeySegment is the regression for Codex P1 // round 9: DynamoDB N equality is numeric, not lexical, but the key // segment was emitted as `EncodeSegment([]byte(v.N))`. In migration From f710f5e496a88cafab137e8402a11215e0a3902c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 2 May 2026 19:52:07 +0900 Subject: [PATCH 17/20] backup: refuse dot-segment S3 bucket names (PR #716, round 6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 round 13 (commit 1ccd6809): The PR #718 round-9 fix (`scratchDirForBlob`) rejected dot-component bucket/object names at the SCRATCH path, but the per-bucket FINAL output construction in flushBucket went through `filepath.Join(s.outRoot, "s3", EncodeSegment(b.name))` without validating the bucket name. EncodeSegment preserves `.` and `..` (both RFC3986-unreserved), so: - bucket name "." → bucketDir collapses to `/s3/` and `_bucket.json` lands at the s3-root level instead of in a per-bucket subdir. - bucket name ".." → bucketDir resolves to `` itself, letting `_bucket.json` clobber dump-root files like MANIFEST.json. flushBucket now refuses sole-dot and empty bucket names with ErrS3MalformedKey. Caller audit (per loop's audit-semantics rule for fail-closed changes): - Single caller in Finalize loop, which already accumulates into firstErr — surfacing ErrS3MalformedKey there is consistent with existing malformed-key handling (dot-segment object keys, NUL bytes, leading slash, etc. all surface through the same error class). Test: TestS3_FlushBucketRejectsDotSegmentBucketName covers ".", "..", and empty. --- internal/backup/s3.go | 14 ++++++++++++++ internal/backup/s3_test.go | 31 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index 700422959..1095c56ca 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -531,6 +531,20 @@ func (s *S3Encoder) Finalize() error { } func (s *S3Encoder) flushBucket(b *s3BucketState) error { + // Reject bucket-name dot segments before the filesystem join. + // `EncodeSegment(".") == "."` and `EncodeSegment("..") == ".."` + // (both are RFC3986-unreserved), so without this guard a + // crafted bucket meta record with name="." or ".." would let + // `filepath.Join` collapse `/s3/.` back to + // `/s3` or resolve `/s3/..` up to outRoot + // itself — `_bucket.json` would land outside the s3/ subtree + // and clobber dump-root files. Codex P1 round 13 (PR #716, + // finding originally surfaced on the merged-in s3.go). + switch b.name { + case ".", "..", "": + return errors.Wrapf(ErrS3MalformedKey, + "bucket name %q is empty or a dot segment (would escape s3/ subtree)", b.name) + } bucketDir := filepath.Join(s.outRoot, "s3", EncodeSegment([]byte(b.name))) if err := os.MkdirAll(bucketDir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode return errors.WithStack(err) diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 2847d41c1..a4fe6a750 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -584,6 +584,37 @@ func TestS3_StalePartVersionExcludedFromAssembledBody(t *testing.T) { } } +// TestS3_FlushBucketRejectsDotSegmentBucketName is the regression for +// Codex P1 round 13 (originally surfaced on PR #716's merged-in +// s3.go): HandleBlob's dot-segment guard runs at the SCRATCH path, +// but the per-bucket FINAL output path went through +// `filepath.Join(s.outRoot, "s3", EncodeSegment(b.name))` without +// validating the bucket name. EncodeSegment preserves `.` and +// `..`, so a crafted !s3|bucket|meta record with name="." would +// drop `_bucket.json` directly under `/s3/` (instead of a +// per-bucket subdir), and ".." would escape the s3/ subtree +// entirely. flushBucket now rejects sole-dot and empty bucket +// names before the join. +func TestS3_FlushBucketRejectsDotSegmentBucketName(t *testing.T) { + t.Parallel() + cases := []string{".", "..", ""} + for _, name := range cases { + t.Run(name, func(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(name), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": name, "generation": uint64(1), + })); err != nil { + t.Fatal(err) + } + err := enc.Finalize() + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for bucket name %q", err, name) + } + }) + } +} + // TestS3_HandleBlobRejectsScratchEscape is the regression for Codex // P1 round 11: HandleBlob composed scratch paths with EncodeSegment, // which preserves `.` and `..` (RFC3986 unreserved). A bucket or From 1a875aaee7ee83c1813911eb4d9b77c36b3f37de Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 2 May 2026 20:08:13 +0900 Subject: [PATCH 18/20] backup: refuse dot-segment bucket in HandleIncompleteUpload (PR #716, round 7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 round 13 follow-up (commit f710f5e4): The round-6 fix added a dot-segment guard to flushBucket, but that runs at FINALIZE. HandleIncompleteUpload writes to disk EARLIER — every upload-family record opens a per-bucket records.jsonl on first arrival, and that path was built via `filepath.Join(outRoot, "s3", EncodeSegment(bucket), "_incomplete_uploads")` without validating the bucket segment. EncodeSegment preserves "." and "..", so a malformed snapshot upload record with bucket="." or ".." would escape the s3/ subtree at HandleIncompleteUpload time, well before flushBucket's guard runs. The same fail-closed bucket-name check now runs at the HandleIncompleteUpload entry. Caller audit per loop's audit-semantics rule: HandleIncompleteUpload is invoked only by test code; the surfaced ErrS3MalformedKey class matches the existing parseUploadFamily failure path. Test: TestS3_HandleIncompleteUploadRejectsDotSegmentBucket covers ".", "..", and empty bucket names. --- internal/backup/s3.go | 15 +++++++++++++++ internal/backup/s3_test.go | 29 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index 1095c56ca..93739b692 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -488,6 +488,21 @@ func (s *S3Encoder) HandleIncompleteUpload(prefix string, key, value []byte) err if !ok { return errors.Wrapf(ErrS3MalformedKey, "upload-family key: %q", key) } + // Reject dot-segment / empty bucket names BEFORE the + // filesystem join — same fix as flushBucket (round 6) but for + // the --include-incomplete-uploads code path, which runs + // during HandleIncompleteUpload and so escapes outRoot before + // Finalize's bucket-name guard ever runs. EncodeSegment + // preserves "." and "..", so a malformed snapshot upload + // record with bucket="." or ".." would otherwise let + // `filepath.Join` collapse the s3/ subtree and write + // records.jsonl outside the dump root. Codex P1 round 13 + // (PR #716, follow-up). + switch bucket { + case ".", "..", "": + return errors.Wrapf(ErrS3MalformedKey, + "bucket name %q in upload-family key is empty or a dot segment", bucket) + } b := s.bucketState(bucket) if b.incompleteUploadsJL == nil { dir := filepath.Join(s.outRoot, "s3", EncodeSegment([]byte(bucket)), "_incomplete_uploads") diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index a4fe6a750..c231d74be 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -584,6 +584,35 @@ func TestS3_StalePartVersionExcludedFromAssembledBody(t *testing.T) { } } +// TestS3_HandleIncompleteUploadRejectsDotSegmentBucket is the +// regression for Codex P1 round 13 follow-up (PR #716): the +// flushBucket bucket-name guard runs at FINALIZE, but +// HandleIncompleteUpload writes to disk earlier — every upload- +// family record opens a per-bucket records.jsonl on first +// arrival. Without an early dot-segment guard, a malformed +// snapshot upload record with bucket="." or ".." would let +// `filepath.Join` escape the s3/ subtree at +// HandleIncompleteUpload time, before the flushBucket guard ever +// runs. The same dot-segment refusal now runs in this earlier +// code path. +func TestS3_HandleIncompleteUploadRejectsDotSegmentBucket(t *testing.T) { + t.Parallel() + cases := []string{".", "..", ""} + for _, bucket := range cases { + t.Run(bucket, func(t *testing.T) { + t.Parallel() + enc, _ := newS3Encoder(t) + enc.WithIncludeIncompleteUploads(true) + err := enc.HandleIncompleteUpload(S3UploadMetaPrefix, + s3keys.UploadMetaKey(bucket, 1, "obj", "u"), + []byte("payload")) + if !errors.Is(err, ErrS3MalformedKey) { + t.Fatalf("err=%v want ErrS3MalformedKey for bucket name %q", err, bucket) + } + }) + } +} + // TestS3_FlushBucketRejectsDotSegmentBucketName is the regression for // Codex P1 round 13 (originally surfaced on PR #716's merged-in // s3.go): HandleBlob's dot-segment guard runs at the SCRATCH path, From ae7c6e2a95aabe9af7b71e8e7e50a29ca73a4b89 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 2 May 2026 20:21:02 +0900 Subject: [PATCH 19/20] backup: per-generation orphan chunk paths (PR #716, round 8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P2 round 13 (commit edfc944c): `flushOrphanObject` wrote stale chunks to `/_orphans//` where scratchName is `(uploadID, partNo, chunkNo, partVersion)`. Two stale generations sharing those four identifiers — possible during delete/recreate cleanup windows where the live system recycled the upload identifier — would overwrite one another, silently dropping forensic data the operator opted in to preserve via --include-orphans. Embed the object's generation in the path: `/_orphans//gen-/`. Each stale gen lands in its own subdir; restore tools can iterate per-gen. Caller audit (per loop's audit-semantics rule for output-shape change): single internal caller (flushBucketObjects); no public API change, only the on-disk layout under _orphans/ becomes one level deeper. The existing TestS3_OrphanChunksWrittenWhenIncludeOrphans test was updated to match the new path. Tests: - TestS3_OrphanChunksWrittenWhenIncludeOrphans updated for per-gen path. - NEW TestS3_OrphanChunksPreservedAcrossGenerations drives two stale gens sharing (uploadID, partNo, chunkNo, partVersion) and asserts both survive distinctly under their own gen-/ dirs. --- internal/backup/s3.go | 12 +++++++++- internal/backup/s3_test.go | 46 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index 93739b692..df1701ce1 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -2,6 +2,7 @@ package backup import ( "encoding/json" + "fmt" "io" "os" "path/filepath" @@ -752,7 +753,16 @@ func (s *S3Encoder) flushOrphanObject(b *s3BucketState, bucketDir string, obj *s return errors.Wrapf(ErrS3MalformedKey, "orphan object key %q is a dot segment (would escape orphan dir)", obj.object) } - dir := filepath.Join(bucketDir, "_orphans", EncodeSegment([]byte(obj.object))) + // Include the generation in the orphan path. Without this, + // two stale generations of the same object key sharing + // (uploadID, partNo, chunkNo, partVersion) — possible during + // delete/recreate cleanup windows where the live system + // recycled identifiers — would overwrite one another in + // `_orphans//`, silently dropping forensic + // data the operator opted in to preserve via + // --include-orphans. Codex P2 round 13 (PR #716). + genDir := fmt.Sprintf("gen-%d", obj.generation) + dir := filepath.Join(bucketDir, "_orphans", EncodeSegment([]byte(obj.object)), genDir) if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode return errors.WithStack(err) } diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index c231d74be..0f80a8d85 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -538,9 +538,51 @@ func TestS3_OrphanChunksWrittenWhenIncludeOrphans(t *testing.T) { if err := enc.Finalize(); err != nil { t.Fatal(err) } - dir := filepath.Join(root, "s3", "b", "_orphans", EncodeSegment([]byte("ghost"))) + // Orphan chunks land in /_orphans//gen-/ + // (round 13 follow-up: per-generation isolation). The chunk's + // generation == 1 here because we drove HandleBlob with gen=1. + dir := filepath.Join(root, "s3", "b", "_orphans", EncodeSegment([]byte("ghost")), "gen-1") if _, err := os.Stat(dir); err != nil { - t.Fatalf("expected _orphans dir under --include-orphans: %v", err) + t.Fatalf("expected _orphans//gen-N dir under --include-orphans: %v", err) + } +} + +// TestS3_OrphanChunksPreservedAcrossGenerations is the regression for +// Codex P2 round 13 (PR #716): two stale generations of the same +// object that share (uploadID, partNo, chunkNo, partVersion) — for +// example a delete/recreate cleanup window where the live system +// recycled the upload identifier — would overwrite one another in a +// single `_orphans//` dir, silently dropping +// forensic data the operator opted in to preserve via +// --include-orphans. The orphan path now embeds the generation so +// each gen's chunks land in its own subdir. +func TestS3_OrphanChunksPreservedAcrossGenerations(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + enc.WithIncludeOrphans(true) + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey("b"), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": "b", "generation": uint64(3), // active gen = 3 + })); err != nil { + t.Fatal(err) + } + // Two stale generations sharing the same (uploadID, partNo, + // chunkNo, partVersion). Both must survive distinctly. + if err := enc.HandleBlob(s3keys.BlobKey("b", 1, "ghost", "u", 1, 0), []byte("g1-bytes")); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey("b", 2, "ghost", "u", 1, 0), []byte("g2-bytes")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + gen1 := filepath.Join(root, "s3", "b", "_orphans", EncodeSegment([]byte("ghost")), "gen-1") + gen2 := filepath.Join(root, "s3", "b", "_orphans", EncodeSegment([]byte("ghost")), "gen-2") + if _, err := os.Stat(gen1); err != nil { + t.Fatalf("gen-1 orphan dir missing: %v", err) + } + if _, err := os.Stat(gen2); err != nil { + t.Fatalf("gen-2 orphan dir missing: %v", err) } } From 6a1bef1489e368ed4245d80e75711fafefddbcf5 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 2 May 2026 23:39:45 +0900 Subject: [PATCH 20/20] backup: skip stray chunks for zero-count manifest parts (PR #716, round 9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit Major round 13 (commit 389c4f76): `verifyChunkCompleteness` skipped parts where `want.chunkCount == 0`, but `filterChunksForManifest` still included any chunks recorded for those (partNo, partVersion) keys. The asymmetry let stray chunks for a manifest-declared zero-count part bleed into the assembled body while the completeness check passed — a corrupted body that violated the declared-part contract without raising any error. Make `filterChunksForManifest` symmetric: skip chunks whose declared part has `chunkCount == 0`. The two paths now agree on the "zero-count part contributes no bytes" invariant. Caller audit (per loop's audit-semantics rule for behavior change): single caller `assembleObjectBody`. The change only narrows what chunks reach the assembler; downstream completeness check already skips zero-count parts so the consistency is restored, not broken. No public API change. Test: - TestS3_ZeroChunkCountPartIgnoresStrayChunks drives a two-part manifest where part 1 has chunk_count=1 (real content "hello") and part 2 has chunk_count=0 with a stray "STRAY" chunk recorded. Asserts the assembled body is exactly "hello" — no stray bytes leak. --- internal/backup/s3.go | 17 ++++++++++++- internal/backup/s3_test.go | 52 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/internal/backup/s3.go b/internal/backup/s3.go index df1701ce1..84608f393 100644 --- a/internal/backup/s3.go +++ b/internal/backup/s3.go @@ -1012,7 +1012,22 @@ func filterChunksForManifest(m map[s3ChunkKey]string, manifestUploadID string, d continue } if declaredParts != nil { - if _, ok := declaredParts[s3PartKey{partNo: k.partNo, partVersion: k.partVersion}]; !ok { + declared, ok := declaredParts[s3PartKey{partNo: k.partNo, partVersion: k.partVersion}] + if !ok { + continue + } + // Symmetric with verifyChunkCompleteness's + // `want.chunkCount == 0 → skip` branch: a + // manifest part declaring zero chunks must not + // contribute any bytes to the assembled body, + // even if stray chunks for that (partNo, + // partVersion) exist on disk. Without this + // filter the assembler would silently merge those + // stray chunks while the completeness check + // passed, producing a body that violates the + // declared-part contract. CodeRabbit Major round + // 13 (PR #716). + if declared.chunkCount == 0 { continue } } diff --git a/internal/backup/s3_test.go b/internal/backup/s3_test.go index 0f80a8d85..86131a4f3 100644 --- a/internal/backup/s3_test.go +++ b/internal/backup/s3_test.go @@ -881,6 +881,58 @@ func readBytesFile(t *testing.T, path string) []byte { return b } +// TestS3_ZeroChunkCountPartIgnoresStrayChunks is the regression for +// CodeRabbit Major round 13 (PR #716): verifyChunkCompleteness skips +// parts with `want.chunkCount == 0`, but filterChunksForManifest used +// to include any chunks recorded for those parts in the assembled +// body. The asymmetry let stray chunks for a manifest-declared +// zero-count part bleed into the body while the completeness check +// passed. filterChunksForManifest now also skips chunks whose +// declared part has chunkCount == 0, so the assembled body +// faithfully reflects the manifest contract. +func TestS3_ZeroChunkCountPartIgnoresStrayChunks(t *testing.T) { + t.Parallel() + enc, root := newS3Encoder(t) + bucket := "b" + gen := uint64(1) + object := "obj" + uploadID := "u" + if err := enc.HandleBucketMeta(s3keys.BucketMetaKey(bucket), encodeS3BucketMetaValue(t, map[string]any{ + "bucket_name": bucket, "generation": gen, + })); err != nil { + t.Fatal(err) + } + // Manifest declares two parts: part 1 with one chunk (the real + // content), and part 2 with chunk_count=0 (a logically-empty + // part). A stray chunk for part 2 must not contribute to the + // body — the manifest says that part has zero chunks. + if err := enc.HandleObjectManifest(s3keys.ObjectManifestKey(bucket, gen, object), encodeS3ManifestValue(t, map[string]any{ + "upload_id": uploadID, "size_bytes": 5, "parts": []map[string]any{ + {"part_no": 1, "size_bytes": 5, "chunk_count": 1}, + {"part_no": 2, "size_bytes": 0, "chunk_count": 0}, + }, + })); err != nil { + t.Fatal(err) + } + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 1, 0), []byte("hello")); err != nil { + t.Fatal(err) + } + // Stray chunk for the zero-declared part 2 — must be filtered out. + if err := enc.HandleBlob(s3keys.BlobKey(bucket, gen, object, uploadID, 2, 0), []byte("STRAY")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + got, err := os.ReadFile(filepath.Join(root, "s3", bucket, object)) //nolint:gosec // test path + if err != nil { + t.Fatal(err) + } + if string(got) != "hello" { + t.Fatalf("body=%q want %q (stray chunk for zero-count part leaked into body)", got, "hello") + } +} + // TestS3_DuplicateChunksWithMissingIndexRejected is the regression // for Codex P1 round 12: the previous count-and-maxIndex check // admitted false positives. For declared chunk_count=3, observed