From be9328d702e7e1d8228adbc28a0439d067a497ae Mon Sep 17 00:00:00 2001 From: Cody Littley Date: Tue, 19 May 2026 13:26:58 -0500 Subject: [PATCH 1/5] Convert littdb to use otel --- .../db_engine/litt/littbuilder/build_utils.go | 59 +--- sei-db/db_engine/litt/littbuilder/db_impl.go | 37 +- sei-db/db_engine/litt/littdb_config.go | 17 +- .../db_engine/litt/metrics/littdb_metrics.go | 329 ++++++++---------- sei-db/db_engine/litt/util/cache_metrics.go | 119 ++++--- 5 files changed, 258 insertions(+), 303 deletions(-) diff --git a/sei-db/db_engine/litt/littbuilder/build_utils.go b/sei-db/db_engine/litt/littbuilder/build_utils.go index c97e9cc2f2..a9abc18309 100644 --- a/sei-db/db_engine/litt/littbuilder/build_utils.go +++ b/sei-db/db_engine/litt/littbuilder/build_utils.go @@ -1,17 +1,13 @@ package littbuilder import ( + "context" "fmt" "log/slog" - "net/http" "os" "path" - "strings" - "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/collectors" - "github.com/prometheus/client_golang/prometheus/promhttp" + commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/dbcache" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/disktable" @@ -243,45 +239,26 @@ func buildLogger(config *litt.Config) *slog.Logger { return slog.Default() } -// buildMetrics creates a new metrics object based on the configuration. If the returned server is not nil, -// then it is the responsibility of the caller to eventually call server.Shutdown(). -func buildMetrics(config *litt.Config, logger *slog.Logger) (*metrics.LittDBMetrics, *http.Server) { +// buildMetrics creates a new metrics object backed by the global OTel +// MeterProvider. When MetricsEnabled is true, this configures the global +// provider with a Prometheus exporter and starts an HTTP server on +// MetricsPort that serves /metrics. The returned shutdown function flushes +// the provider; it is the responsibility of the caller to invoke it during +// teardown. +func buildMetrics(config *litt.Config, logger *slog.Logger) (*metrics.LittDBMetrics, func(context.Context) error) { if !config.MetricsEnabled { return nil, nil } - var registry *prometheus.Registry - var server *http.Server - - if config.MetricsEnabled { - if config.MetricsRegistry == nil { - registry = prometheus.NewRegistry() - registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) - registry.MustRegister(collectors.NewGoCollector()) - - logger.Info("Starting metrics server", "port", config.MetricsPort) - addr := fmt.Sprintf(":%d", config.MetricsPort) - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.HandlerFor( - registry, - promhttp.HandlerOpts{}, - )) - server = &http.Server{ - Addr: addr, - Handler: mux, - ReadHeaderTimeout: 10 * time.Second, - } - - go func() { - err := server.ListenAndServe() - if err != nil && !strings.Contains(err.Error(), "http: Server closed") { - logger.Error("metrics server error", "error", err) - } - }() - } else { - registry = config.MetricsRegistry - } + reg, shutdown, err := commonmetrics.SetupOtelPrometheus() + if err != nil { + logger.Error("failed to set up OTel Prometheus exporter", "error", err) + return nil, nil } - return metrics.NewLittDBMetrics(registry, config.MetricsNamespace), server + addr := fmt.Sprintf(":%d", config.MetricsPort) + logger.Info("Starting metrics server", "port", config.MetricsPort) + commonmetrics.StartMetricsServer(config.CTX, reg, addr) + + return metrics.NewLittDBMetrics(), shutdown } diff --git a/sei-db/db_engine/litt/littbuilder/db_impl.go b/sei-db/db_engine/litt/littbuilder/db_impl.go index 949aa1de53..4dc2809f1a 100644 --- a/sei-db/db_engine/litt/littbuilder/db_impl.go +++ b/sei-db/db_engine/litt/littbuilder/db_impl.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log/slog" - "net/http" "sync" "sync/atomic" "time" @@ -53,8 +52,8 @@ type db struct { // Metrics for the database. metrics *metrics.LittDBMetrics - // The HTTP server for metrics. nil if metrics are disabled or if an external party is managing the server. - metricsServer *http.Server + // Shuts down the OTel MeterProvider configured by buildMetrics. nil if metrics are disabled. + metricsShutdown func(context.Context) error // A function that releases file locks. releaseLocks func() @@ -125,9 +124,9 @@ func NewDBUnsafe(config *litt.Config, tableBuilder TableBuilderFunc) (litt.DB, e } var dbMetrics *metrics.LittDBMetrics - var metricsServer *http.Server + var metricsShutdown func(context.Context) error if config.MetricsEnabled { - dbMetrics, metricsServer = buildMetrics(config, config.Logger) + dbMetrics, metricsShutdown = buildMetrics(config, config.Logger) } if config.SnapshotDirectory != "" { @@ -136,16 +135,16 @@ func NewDBUnsafe(config *litt.Config, tableBuilder TableBuilderFunc) (litt.DB, e } database := &db{ - ctx: config.CTX, - logger: config.Logger, - clock: config.Clock, - ttl: config.TTL, - gcPeriod: config.GCPeriod, - tableBuilder: tableBuilder, - tables: make(map[string]litt.ManagedTable), - metrics: dbMetrics, - metricsServer: metricsServer, - releaseLocks: releaseLocks, + ctx: config.CTX, + logger: config.Logger, + clock: config.Clock, + ttl: config.TTL, + gcPeriod: config.GCPeriod, + tableBuilder: tableBuilder, + tables: make(map[string]litt.ManagedTable), + metrics: dbMetrics, + metricsShutdown: metricsShutdown, + releaseLocks: releaseLocks, } if config.MetricsEnabled { @@ -281,11 +280,13 @@ func (d *db) Destroy() error { // gatherMetrics is a method that periodically collects metrics. func (d *db) gatherMetrics(interval time.Duration) { - if d.metricsServer != nil { + if d.metricsShutdown != nil { defer func() { - err := d.metricsServer.Close() + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + err := d.metricsShutdown(shutdownCtx) if err != nil { - d.logger.Error("error closing metrics server", "error", err) + d.logger.Error("error shutting down metrics provider", "error", err) } }() } diff --git a/sei-db/db_engine/litt/littdb_config.go b/sei-db/db_engine/litt/littdb_config.go index 4c5594c4de..ec2e29d0cd 100644 --- a/sei-db/db_engine/litt/littdb_config.go +++ b/sei-db/db_engine/litt/littdb_config.go @@ -7,7 +7,6 @@ import ( "math" "time" - "github.com/prometheus/client_golang/prometheus" "github.com/sei-protocol/sei-chain/sei-db/common/unit" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/disktable/keymap" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/util" @@ -117,17 +116,12 @@ type Config struct { // than keymap.MemKeymapType, performing this check may be very expensive. By default, this is false. DoubleWriteProtection bool - // If enabled, collect DB metrics and export them to prometheus. By default, this is false. + // If enabled, collect DB metrics and export them via the global OTel MeterProvider. By default, this is false. + // When enabled, the database configures a Prometheus exporter on the global provider and serves /metrics on + // MetricsPort. MetricsEnabled bool - // The namespace to use for metrics. If empty, the default namespace "litt" is used. - MetricsNamespace string - - // The prometheus registry to use for metrics. If nil and metrics are enabled, a new registry is created. - MetricsRegistry *prometheus.Registry - - // The port to use for the metrics server. Ignored if MetricsEnabled is false or MetricsRegistry is not nil. - // The default is 9101. + // The port to use for the metrics server. Ignored if MetricsEnabled is false. The default is 9101. MetricsPort int // The interval at which various DB metrics are updated. The default is 1 second. @@ -194,7 +188,6 @@ func DefaultConfigNoPaths() *Config { Fsync: true, DoubleWriteProtection: false, MetricsEnabled: false, - MetricsNamespace: "litt", MetricsPort: 9101, MetricsUpdateInterval: time.Second, PurgeLocks: false, @@ -258,7 +251,7 @@ func (c *Config) SanityCheck() error { if c.GCPeriod == 0 { return fmt.Errorf("gc period must be at least 1") } - if (c.MetricsEnabled || c.MetricsRegistry != nil) && c.MetricsUpdateInterval == 0 { + if c.MetricsEnabled && c.MetricsUpdateInterval == 0 { return fmt.Errorf("metrics update interval must be at least 1 if metrics are enabled") } diff --git a/sei-db/db_engine/litt/metrics/littdb_metrics.go b/sei-db/db_engine/litt/metrics/littdb_metrics.go index 6e41b6c44a..c7d1aa468c 100644 --- a/sei-db/db_engine/litt/metrics/littdb_metrics.go +++ b/sei-db/db_engine/litt/metrics/littdb_metrics.go @@ -1,10 +1,14 @@ package metrics import ( + "context" "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/util" ) @@ -22,56 +26,64 @@ import ( // - segment creation rate // - used/unused segment space (useful for detecting shard assignment issues) -// LittDBMetrics encapsulates metrics for a LittDB. +const littMeterName = "litt" + +// LittDBMetrics encapsulates metrics for a LittDB. Metrics are exported via +// whatever exporter is configured on the global OTel MeterProvider (e.g. +// Prometheus, OTLP). The caller is responsible for setting up the provider +// before calling NewLittDBMetrics (see commonmetrics.SetupOtelPrometheus). +// +// Per-table observations are tagged with a "table" attribute. A nil +// LittDBMetrics acts as a no-op for all Report* methods. type LittDBMetrics struct { // The size of individual tables in the database. - tableSizeInBytes *prometheus.GaugeVec + tableSizeInBytes metric.Int64Gauge // The number of keys in individual tables in the database. - tableKeyCount *prometheus.GaugeVec + tableKeyCount metric.Int64Gauge // The number of bytes read from disk since startup. - bytesReadCounter *prometheus.CounterVec + bytesReadCounter metric.Int64Counter // The number of keys read from disk since startup. - keysReadCounter *prometheus.CounterVec + keysReadCounter metric.Int64Counter // The number of cache hits since startup. - cacheHitCounter *prometheus.CounterVec + cacheHitCounter metric.Int64Counter // The number of cache misses since startup. - cacheMissCounter *prometheus.CounterVec + cacheMissCounter metric.Int64Counter // Reports on the read latency of the database. This metric includes both cache hits and cache misses. - readLatency *prometheus.SummaryVec + readLatency metric.Float64Histogram // Reports on the write latency of the database, but only measures the time to read a value when a // cache miss occurs. - cacheMissLatency *prometheus.SummaryVec + cacheMissLatency metric.Float64Histogram // The number of bytes written to disk since startup. Only includes values, not metadata. - bytesWrittenCounter *prometheus.CounterVec + bytesWrittenCounter metric.Int64Counter // The number of keys written to disk since startup. - keysWrittenCounter *prometheus.CounterVec + keysWrittenCounter metric.Int64Counter // Reports on the write latency of the database. - writeLatency *prometheus.SummaryVec + writeLatency metric.Float64Histogram // The number of times a flush operation has been performed. - flushCount *prometheus.CounterVec + flushCount metric.Int64Counter // Reports on the latency of a flush operation. - flushLatency *prometheus.SummaryVec + flushLatency metric.Float64Histogram // Reports on the latency of a flushing segment files. This is a subset of the time spent during a flush operation. - segmentFlushLatency *prometheus.SummaryVec + segmentFlushLatency metric.Float64Histogram // Reports on the latency of a keymap flush operation. This is a subset of the time spent during a flush operation. - keymapFlushLatency *prometheus.SummaryVec + keymapFlushLatency metric.Float64Histogram - // The latency of garbage collection operations.1 - garbageCollectionLatency *prometheus.SummaryVec + // The latency of garbage collection operations. + garbageCollectionLatency metric.Float64Histogram // Metrics for the write cache. writeCacheMetrics *util.CacheMetrics @@ -80,179 +92,124 @@ type LittDBMetrics struct { readCacheMetrics *util.CacheMetrics } -// NewLittDBMetrics creates a new LittDBMetrics instance. -func NewLittDBMetrics(registry *prometheus.Registry, namespace string) *LittDBMetrics { - if registry == nil { - return nil - } - - objectives := map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001} - - tableSizeInBytes := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "table_size_bytes", - Help: "The size of individual tables in the database in bytes.", - }, - []string{"table"}, - ) - - tableKeyCount := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "table_key_count", - Help: "The number of keys in individual tables in the database.", - }, - []string{"table"}, +// NewLittDBMetrics creates a new LittDBMetrics instance backed by the global +// OTel MeterProvider. The caller must configure a MeterProvider with a +// Prometheus or other exporter before calling this (e.g. via +// commonmetrics.SetupOtelPrometheus). +func NewLittDBMetrics() *LittDBMetrics { + meter := otel.Meter(littMeterName) + + tableSizeInBytes, _ := meter.Int64Gauge( + "litt_table_size_bytes", + metric.WithDescription("The size of individual tables in the database in bytes."), + metric.WithUnit("By"), ) - bytesReadCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "bytes_read", - Help: "The number of bytes read from disk since startup.", - }, - []string{"table"}, + tableKeyCount, _ := meter.Int64Gauge( + "litt_table_key_count", + metric.WithDescription("The number of keys in individual tables in the database."), + metric.WithUnit("{count}"), ) - keysReadCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "keys_read", - Help: "The number of keys read from disk since startup.", - }, - []string{"table"}, + bytesReadCounter, _ := meter.Int64Counter( + "litt_bytes_read", + metric.WithDescription("The number of bytes read from disk since startup."), + metric.WithUnit("By"), ) - cacheHitCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "cache_hits", - Help: "The number of cache hits since startup.", - }, - []string{"table"}, + keysReadCounter, _ := meter.Int64Counter( + "litt_keys_read", + metric.WithDescription("The number of keys read from disk since startup."), + metric.WithUnit("{count}"), ) - cacheMissCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "cache_misses", - Help: "The number of cache misses since startup.", - }, - []string{"table"}, + cacheHitCounter, _ := meter.Int64Counter( + "litt_cache_hits", + metric.WithDescription("The number of cache hits since startup."), + metric.WithUnit("{count}"), ) - readLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "read_latency_ms", - Help: "Reports on the read latency of the database. " + - "This metric includes both cache hits and cache misses.", - Objectives: objectives, - }, - []string{"table"}, + cacheMissCounter, _ := meter.Int64Counter( + "litt_cache_misses", + metric.WithDescription("The number of cache misses since startup."), + metric.WithUnit("{count}"), ) - cacheMissLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "cache_miss_latency_ms", - Help: "Reports on the write latency of the database, " + - "but only measures the time to read a value when a cache miss occurs.", - Objectives: objectives, - }, - []string{"table"}, + readLatency, _ := meter.Float64Histogram( + "litt_read_latency_seconds", + metric.WithDescription( + "Reports on the read latency of the database. "+ + "This metric includes both cache hits and cache misses."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - bytesWrittenCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "bytes_written", - Help: "The number of bytes written to disk since startup. Only includes values, not metadata.", - }, - []string{"table"}, + cacheMissLatency, _ := meter.Float64Histogram( + "litt_cache_miss_latency_seconds", + metric.WithDescription( + "Reports on the read latency of the database, "+ + "but only measures the time to read a value when a cache miss occurs."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - keysWrittenCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "keys_written", - Help: "The number of keys written to disk since startup.", - }, - []string{"table"}, + bytesWrittenCounter, _ := meter.Int64Counter( + "litt_bytes_written", + metric.WithDescription("The number of bytes written to disk since startup. Only includes values, not metadata."), + metric.WithUnit("By"), ) - writeLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "write_latency_ms", - Help: "Reports on the write latency of the database.", - Objectives: objectives, - }, - []string{"table"}, + keysWrittenCounter, _ := meter.Int64Counter( + "litt_keys_written", + metric.WithDescription("The number of keys written to disk since startup."), + metric.WithUnit("{count}"), ) - flushCount := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "flush_count", - Help: "The number of times a flush operation has been performed.", - }, - []string{"table"}, + writeLatency, _ := meter.Float64Histogram( + "litt_write_latency_seconds", + metric.WithDescription("Reports on the write latency of the database."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - flushLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "flush_latency_ms", - Help: "Reports on the latency of a flush operation.", - Objectives: objectives, - }, - []string{"table"}, + flushCount, _ := meter.Int64Counter( + "litt_flush_count", + metric.WithDescription("The number of times a flush operation has been performed."), + metric.WithUnit("{count}"), ) - segmentFlushLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "segment_flush_latency_ms", - Help: "Reports on segment flush latency. This is a subset of the time spent during a flush operation.", - Objectives: objectives, - }, - []string{"table"}, + flushLatency, _ := meter.Float64Histogram( + "litt_flush_latency_seconds", + metric.WithDescription("Reports on the latency of a flush operation."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - keymapFlushLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "keymap_flush_latency_ms", - Help: "Reports on the latency of a keymap flush operation. " + - "This is a subset of the time spent during a flush operation.", - Objectives: objectives, - }, - []string{"table"}, + segmentFlushLatency, _ := meter.Float64Histogram( + "litt_segment_flush_latency_seconds", + metric.WithDescription("Reports on segment flush latency. This is a subset of the time spent during a flush operation."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - garbageCollectionLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "garbage_collection_latency_ms", - Help: "Reports on the latency of garbage collection operations.", - Objectives: objectives, - }, - []string{"table"}, + keymapFlushLatency, _ := meter.Float64Histogram( + "litt_keymap_flush_latency_seconds", + metric.WithDescription( + "Reports on the latency of a keymap flush operation. "+ + "This is a subset of the time spent during a flush operation."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - writeCacheMetrics := util.NewCacheMetrics( - registry, - namespace, - "chunk_write", + garbageCollectionLatency, _ := meter.Float64Histogram( + "litt_garbage_collection_latency_seconds", + metric.WithDescription("Reports on the latency of garbage collection operations."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - readCacheMetrics := util.NewCacheMetrics( - registry, - namespace, - "chunk_read", - ) + writeCacheMetrics := util.NewCacheMetrics("chunk_write") + readCacheMetrics := util.NewCacheMetrics("chunk_read") return &LittDBMetrics{ tableSizeInBytes: tableSizeInBytes, @@ -276,6 +233,13 @@ func NewLittDBMetrics(registry *prometheus.Registry, namespace string) *LittDBMe } } +// tableAttr returns the OTel measurement option that tags an observation with +// the given table name. Allocated per call rather than cached because callers +// pass arbitrary table names; for hot-path call sites consider caching upstream. +func tableAttr(tableName string) metric.MeasurementOption { + return metric.WithAttributes(attribute.String("table", tableName)) +} + // CollectPeriodicMetrics is a method that is periodically called to collect metrics. Tables are not permitted to be // added or dropped while this method is running. func (m *LittDBMetrics) CollectPeriodicMetrics(tables map[string]litt.ManagedTable) { @@ -283,14 +247,16 @@ func (m *LittDBMetrics) CollectPeriodicMetrics(tables map[string]litt.ManagedTab return } + ctx := context.Background() for _, table := range tables { tableName := table.Name() + attrs := tableAttr(tableName) tableSize := table.Size() - m.tableSizeInBytes.WithLabelValues(tableName).Set(float64(tableSize)) + m.tableSizeInBytes.Record(ctx, int64(tableSize), attrs) //nolint:gosec // table size fits int64 tableKeyCount := table.KeyCount() - m.tableKeyCount.WithLabelValues(tableName).Set(float64(tableKeyCount)) + m.tableKeyCount.Record(ctx, int64(tableKeyCount), attrs) //nolint:gosec // key count fits int64 } } @@ -305,15 +271,18 @@ func (m *LittDBMetrics) ReportReadOperation( return } - m.bytesReadCounter.WithLabelValues(tableName).Add(float64(dataSize)) - m.keysReadCounter.WithLabelValues(tableName).Inc() - m.readLatency.WithLabelValues(tableName).Observe(latency.Seconds()) + ctx := context.Background() + attrs := tableAttr(tableName) + + m.bytesReadCounter.Add(ctx, int64(dataSize), attrs) //nolint:gosec // data size fits int64 + m.keysReadCounter.Add(ctx, 1, attrs) + m.readLatency.Record(ctx, latency.Seconds(), attrs) if cacheHit { - m.cacheHitCounter.WithLabelValues(tableName).Inc() + m.cacheHitCounter.Add(ctx, 1, attrs) } else { - m.cacheMissCounter.WithLabelValues(tableName).Inc() - m.cacheMissLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.cacheMissCounter.Add(ctx, 1, attrs) + m.cacheMissLatency.Record(ctx, latency.Seconds(), attrs) } } @@ -328,9 +297,12 @@ func (m *LittDBMetrics) ReportWriteOperation( return } - m.bytesWrittenCounter.WithLabelValues(tableName).Add(float64(dataSize)) - m.keysWrittenCounter.WithLabelValues(tableName).Add(float64(batchSize)) - m.writeLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + ctx := context.Background() + attrs := tableAttr(tableName) + + m.bytesWrittenCounter.Add(ctx, int64(dataSize), attrs) //nolint:gosec // data size fits int64 + m.keysWrittenCounter.Add(ctx, int64(batchSize), attrs) //nolint:gosec // batch size fits int64 + m.writeLatency.Record(ctx, latency.Seconds(), attrs) } // ReportFlushOperation reports the results of a flush operation. @@ -339,8 +311,11 @@ func (m *LittDBMetrics) ReportFlushOperation(tableName string, latency time.Dura return } - m.flushCount.WithLabelValues(tableName).Inc() - m.flushLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + ctx := context.Background() + attrs := tableAttr(tableName) + + m.flushCount.Add(ctx, 1, attrs) + m.flushLatency.Record(ctx, latency.Seconds(), attrs) } // ReportSegmentFlushLatency reports the amount of time taken to flush value files. @@ -349,7 +324,7 @@ func (m *LittDBMetrics) ReportSegmentFlushLatency(tableName string, latency time return } - m.segmentFlushLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.segmentFlushLatency.Record(context.Background(), latency.Seconds(), tableAttr(tableName)) } // ReportKeymapFlushLatency reports the amount of time taken to flush the keymap. @@ -358,7 +333,7 @@ func (m *LittDBMetrics) ReportKeymapFlushLatency(tableName string, latency time. return } - m.keymapFlushLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.keymapFlushLatency.Record(context.Background(), latency.Seconds(), tableAttr(tableName)) } // ReportGarbageCollectionLatency reports the latency of a garbage collection operation. @@ -367,7 +342,7 @@ func (m *LittDBMetrics) ReportGarbageCollectionLatency(tableName string, latency return } - m.garbageCollectionLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.garbageCollectionLatency.Record(context.Background(), latency.Seconds(), tableAttr(tableName)) } func (m *LittDBMetrics) GetWriteCacheMetrics() *util.CacheMetrics { diff --git a/sei-db/db_engine/litt/util/cache_metrics.go b/sei-db/db_engine/litt/util/cache_metrics.go index 572f8e3327..e6f92ca620 100644 --- a/sei-db/db_engine/litt/util/cache_metrics.go +++ b/sei-db/db_engine/litt/util/cache_metrics.go @@ -1,74 +1,81 @@ package util import ( + "context" "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" ) -// CacheMetrics is a struct that holds metrics for a cache. A nil CacheMetrics instance acts as a no-op. +const cacheMeterName = "litt" + +// CacheMetrics is a struct that holds OTel metrics for a cache. A nil +// CacheMetrics instance acts as a no-op for all report* methods. +// +// Multiple CacheMetrics instances may be created for the same process; each +// receives references to the same underlying instruments because OTel +// instrument registration is idempotent. The "cache" attribute (set at +// construction time) distinguishes series in the exporter +// (e.g. litt_chunk_cache_keys_added{cache="chunk_read"}). type CacheMetrics struct { - keyCount *prometheus.GaugeVec - weight *prometheus.GaugeVec - keysAdded *prometheus.CounterVec - weightAdded *prometheus.CounterVec - evictionLatency *prometheus.SummaryVec + // Pre-computed attribute option reused on every recording to avoid + // per-call allocations on the hot path. + attrs metric.MeasurementOption + + keyCount metric.Int64Gauge + weight metric.Int64Gauge + keysAdded metric.Int64Counter + weightAdded metric.Int64Counter + evictionLatency metric.Float64Histogram } -// NewCacheMetrics creates a new CacheMetrics instance. If the registry is nil, it returns nil. -// The cacheName does not need to include the suffix "_cache" as this is added automatically. -func NewCacheMetrics(registry *prometheus.Registry, namespace string, cacheName string) *CacheMetrics { - if registry == nil { - return nil - } - - evictionLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: cacheName + "_cache_eviction_latency_ms", - Help: "Reports on the eviction latency of the cache.", - }, - []string{}, +// NewCacheMetrics creates a new CacheMetrics that records via the global OTel +// MeterProvider. The cacheName is attached as the "cache" attribute on every +// observation, allowing multiple cache instances to be distinguished in the +// exporter (for example "chunk_read" vs "chunk_write"). +// +// The caller must have configured a MeterProvider before calling this (e.g. +// commonmetrics.SetupOtelPrometheus). +func NewCacheMetrics(cacheName string) *CacheMetrics { + meter := otel.Meter(cacheMeterName) + + keyCount, _ := meter.Int64Gauge( + "litt_chunk_cache_key_count", + metric.WithDescription("Reports on the number of keys in the cache."), + metric.WithUnit("{count}"), ) - keyCount := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: cacheName + "_cache_key_count", - Help: "Reports on the number of keys in the cache", - }, - []string{}, + weight, _ := meter.Int64Gauge( + "litt_chunk_cache_weight_bytes", + metric.WithDescription("Reports on the weight of the cache in bytes."), + metric.WithUnit("By"), ) - weight := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: cacheName + "_cache_weight", - Help: "Reports on the weight of the cache", - }, - []string{}, + keysAdded, _ := meter.Int64Counter( + "litt_chunk_cache_keys_added", + metric.WithDescription("Reports on the number of keys added to the cache."), + metric.WithUnit("{count}"), ) - keysAdded := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: cacheName + "_cache_keys_added", - Help: "Reports on the number of keys added to the cache", - }, - []string{}, + weightAdded, _ := meter.Int64Counter( + "litt_chunk_cache_weight_added_bytes", + metric.WithDescription("Reports on the weight of the entries added to the cache."), + metric.WithUnit("By"), ) - weightAdded := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: cacheName + "_cache_weight_added", - Help: "Reports on the weight of the entries added to the cache", - }, - []string{}, + evictionLatency, _ := meter.Float64Histogram( + "litt_chunk_cache_eviction_latency_seconds", + metric.WithDescription("Reports on the eviction latency of the cache."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) return &CacheMetrics{ + attrs: metric.WithAttributes(attribute.String("cache", cacheName)), keyCount: keyCount, weight: weight, keysAdded: keysAdded, @@ -83,8 +90,9 @@ func (m *CacheMetrics) reportInsertion(weight uint64) { return } - m.keysAdded.WithLabelValues().Inc() - m.weightAdded.WithLabelValues().Add(float64(weight)) + ctx := context.Background() + m.keysAdded.Add(ctx, 1, m.attrs) + m.weightAdded.Add(ctx, int64(weight), m.attrs) //nolint:gosec // weight fits int64 } // reportEviction is used to report an entry being evicted from the cache. @@ -93,7 +101,7 @@ func (m *CacheMetrics) reportEviction(age time.Duration) { return } - m.evictionLatency.WithLabelValues().Observe(ToMilliseconds(age)) + m.evictionLatency.Record(context.Background(), age.Seconds(), m.attrs) } // reportCurrentSize is used to report the current size/weight of the cache. @@ -102,6 +110,7 @@ func (m *CacheMetrics) reportCurrentSize(size int, weight uint64) { return } - m.keyCount.WithLabelValues().Set(float64(size)) - m.weight.WithLabelValues().Set(float64(weight)) + ctx := context.Background() + m.keyCount.Record(ctx, int64(size), m.attrs) //nolint:gosec // size fits int64 + m.weight.Record(ctx, int64(weight), m.attrs) //nolint:gosec // weight fits int64 } From 3f1f31ed1974826dd0488a25697523ecc957f796 Mon Sep 17 00:00:00 2001 From: Cody Littley Date: Tue, 19 May 2026 15:26:58 -0500 Subject: [PATCH 2/5] support secondary keys --- sei-db/db_engine/litt/README.md | 30 +- sei-db/db_engine/litt/dbcache/cached_table.go | 16 +- .../litt/disktable/control_loop_messages.go | 2 +- sei-db/db_engine/litt/disktable/disk_table.go | 87 ++- .../disk_table_secondary_keys_test.go | 543 ++++++++++++++++++ .../litt/disktable/disk_table_test.go | 60 +- .../litt/disktable/segment/key_file.go | 40 +- .../litt/disktable/segment/key_file_test.go | 17 +- .../litt/disktable/segment/metadata_file.go | 3 +- .../litt/disktable/segment/segment.go | 183 +++++- .../litt/disktable/segment/segment_test.go | 450 ++++++++++++++- .../litt/disktable/segment/segment_version.go | 21 +- .../litt/disktable/segment/value_file.go | 36 +- .../litt/disktable/segment/value_file_test.go | 75 ++- sei-db/db_engine/litt/memtable/mem_table.go | 57 +- sei-db/db_engine/litt/table.go | 31 +- sei-db/db_engine/litt/test/db_test.go | 8 +- .../litt/test/keymap_migration_test.go | 8 +- sei-db/db_engine/litt/test/migration_data.go | 68 ++- sei-db/db_engine/litt/test/migration_test.go | 16 +- sei-db/db_engine/litt/test/table_test.go | 119 +++- .../testdata/v3/test/keymap/data/000002.log | Bin 3141 -> 3387 bytes .../test/testdata/v3/test/segments/0-0.values | Bin 77 -> 98 bytes .../test/testdata/v3/test/segments/0-1.values | Bin 98 -> 96 bytes .../test/testdata/v3/test/segments/0-2.values | Bin 88 -> 109 bytes .../test/testdata/v3/test/segments/0-3.values | Bin 101 -> 89 bytes .../test/testdata/v3/test/segments/0.keys | Bin 622 -> 821 bytes .../test/testdata/v3/test/segments/0.metadata | Bin 18 -> 18 bytes .../test/testdata/v3/test/segments/1-0.values | Bin 84 -> 110 bytes .../test/testdata/v3/test/segments/1-1.values | Bin 101 -> 91 bytes .../test/testdata/v3/test/segments/1-2.values | Bin 74 -> 93 bytes .../test/testdata/v3/test/segments/1-3.values | Bin 80 -> 93 bytes .../test/testdata/v3/test/segments/1.keys | Bin 577 -> 749 bytes .../test/testdata/v3/test/segments/1.metadata | Bin 18 -> 18 bytes .../test/testdata/v3/test/segments/2-0.values | Bin 103 -> 105 bytes .../test/testdata/v3/test/segments/2-1.values | Bin 98 -> 97 bytes .../test/testdata/v3/test/segments/2-2.values | Bin 98 -> 93 bytes .../test/testdata/v3/test/segments/2-3.values | Bin 85 -> 87 bytes .../test/testdata/v3/test/segments/2.keys | Bin 655 -> 859 bytes .../test/testdata/v3/test/segments/2.metadata | Bin 18 -> 18 bytes .../test/testdata/v3/test/segments/3-0.values | Bin 112 -> 82 bytes .../test/testdata/v3/test/segments/3-1.values | Bin 85 -> 121 bytes .../test/testdata/v3/test/segments/3-2.values | Bin 98 -> 75 bytes .../test/testdata/v3/test/segments/3-3.values | Bin 96 -> 60 bytes .../test/testdata/v3/test/segments/3.keys | Bin 646 -> 855 bytes .../test/testdata/v3/test/segments/3.metadata | Bin 18 -> 18 bytes .../test/testdata/v3/test/segments/4-0.values | Bin 96 -> 0 bytes .../test/testdata/v3/test/segments/4-1.values | Bin 86 -> 0 bytes .../test/testdata/v3/test/segments/4-2.values | Bin 86 -> 0 bytes .../test/testdata/v3/test/segments/4-3.values | Bin 89 -> 0 bytes .../test/testdata/v3/test/segments/4.keys | Bin 615 -> 0 bytes .../test/testdata/v3/test/segments/4.metadata | Bin 18 -> 18 bytes sei-db/db_engine/litt/types/key_kind.go | 42 ++ sei-db/db_engine/litt/types/kv_pair.go | 9 - sei-db/db_engine/litt/types/put_request.go | 11 + sei-db/db_engine/litt/types/scoped_key.go | 5 + sei-db/db_engine/litt/types/secondary_key.go | 15 + 57 files changed, 1714 insertions(+), 238 deletions(-) create mode 100644 sei-db/db_engine/litt/disktable/disk_table_secondary_keys_test.go create mode 100644 sei-db/db_engine/litt/types/key_kind.go delete mode 100644 sei-db/db_engine/litt/types/kv_pair.go create mode 100644 sei-db/db_engine/litt/types/put_request.go create mode 100644 sei-db/db_engine/litt/types/secondary_key.go diff --git a/sei-db/db_engine/litt/README.md b/sei-db/db_engine/litt/README.md index 33a863de69..d3492e844c 100644 --- a/sei-db/db_engine/litt/README.md +++ b/sei-db/db_engine/litt/README.md @@ -119,8 +119,8 @@ Source: [table.go](table.go) ```go type Table interface { Name() string -Put(key []byte, value []byte) error -PutBatch(batch []*types.KVPair) error +Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error +PutBatch(batch []*types.PutRequest) error Get(key []byte) ([]byte, bool, error) Exists(key []byte) (bool, error) Flush() error @@ -130,15 +130,33 @@ SetCacheSize(size uint64) error } ``` -Source: [kv_pair.go](types/kv_pair.go) +Both primary keys and secondary keys must not exceed 64 KiB (2^16 - 1 bytes). Values may be up to 2^32 bytes. +Source: [put_request.go](types/put_request.go) + +```go +type PutRequest struct { + Key []byte + Value []byte + SecondaryKeys []*SecondaryKey // optional, may be nil +} ``` -type KVPair struct { - Key []byte - Value []byte + +Source: [secondary_key.go](types/secondary_key.go) + +```go +type SecondaryKey struct { + Key []byte // a globally unique key alias + Offset uint32 // byte offset into the parent value + Length uint32 // length of the byte range (Offset+Length <= len(Value)) } ``` +A secondary key is a first-class key that aliases a sub-range (or the whole) of the parent value's +bytes. `Get`, `Exists`, `KeyCount`, and TTL all treat secondary keys identically to primary keys. +Secondary keys share the value's bytes on disk, so each one costs roughly one keymap entry rather +than duplicating value bytes. + ## Getting Started Below is a functional example showing how to use LittDB. diff --git a/sei-db/db_engine/litt/dbcache/cached_table.go b/sei-db/db_engine/litt/dbcache/cached_table.go index f434195ff1..935594c02e 100644 --- a/sei-db/db_engine/litt/dbcache/cached_table.go +++ b/sei-db/db_engine/litt/dbcache/cached_table.go @@ -51,22 +51,28 @@ func (c *cachedTable) Name() string { return c.base.Name() } -func (c *cachedTable) Put(key []byte, value []byte) error { - err := c.base.Put(key, value) +func (c *cachedTable) Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error { + err := c.base.Put(key, value, secondaryKeys...) if err != nil { return fmt.Errorf("failed to put entry into base table: %w", err) } c.writeCache.Put(string(key), value) + for _, sk := range secondaryKeys { + c.writeCache.Put(string(sk.Key), value[sk.Offset:sk.Offset+sk.Length]) + } return nil } -func (c *cachedTable) PutBatch(batch []*types.KVPair) error { +func (c *cachedTable) PutBatch(batch []*types.PutRequest) error { err := c.base.PutBatch(batch) if err != nil { return err } - for _, kv := range batch { - c.writeCache.Put(util.UnsafeBytesToString(kv.Key), kv.Value) + for _, req := range batch { + c.writeCache.Put(util.UnsafeBytesToString(req.Key), req.Value) + for _, sk := range req.SecondaryKeys { + c.writeCache.Put(util.UnsafeBytesToString(sk.Key), req.Value[sk.Offset:sk.Offset+sk.Length]) + } } return nil } diff --git a/sei-db/db_engine/litt/disktable/control_loop_messages.go b/sei-db/db_engine/litt/disktable/control_loop_messages.go index f6d18a9d00..e11152ad14 100644 --- a/sei-db/db_engine/litt/disktable/control_loop_messages.go +++ b/sei-db/db_engine/litt/disktable/control_loop_messages.go @@ -24,7 +24,7 @@ type controlLoopWriteRequest struct { controlLoopMessage // values is a slice of key-value pairs to write. - values []*types.KVPair + values []*types.PutRequest } // controlLoopSetShardingFactorRequest is a request to set the sharding factor that is sent to the control loop. diff --git a/sei-db/db_engine/litt/disktable/disk_table.go b/sei-db/db_engine/litt/disktable/disk_table.go index b58f7bfd1c..e2ee3b595b 100644 --- a/sei-db/db_engine/litt/disktable/disk_table.go +++ b/sei-db/db_engine/litt/disktable/disk_table.go @@ -724,43 +724,86 @@ func (d *DiskTable) CacheAwareGet( return value, true, false, nil } -func (d *DiskTable) Put(key []byte, value []byte) error { - return d.PutBatch([]*types.KVPair{{Key: key, Value: value}}) +func (d *DiskTable) Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error { + return d.PutBatch([]*types.PutRequest{{Key: key, Value: value, SecondaryKeys: secondaryKeys}}) } -func (d *DiskTable) PutBatch(batch []*types.KVPair) error { +func (d *DiskTable) PutBatch(batch []*types.PutRequest) error { if ok, err := d.errorMonitor.IsOk(); !ok { return fmt.Errorf("cannot process PutBatch() request, DB is in panicked state due to error: %w", err) } - if d.metrics != nil { - start := d.clock() - totalSize := uint64(0) - for _, kv := range batch { - totalSize += uint64(len(kv.Value)) - } - defer func() { - end := d.clock() - delta := end.Sub(start) - d.metrics.ReportWriteOperation(d.name, delta, uint64(len(batch)), totalSize) - }() - } + // Per-request key count (primary + secondaries). Pre-computed during validation so we can use + // it both for metrics and for the keyCount.Add() at the end. + totalKeys := int64(0) + totalSize := uint64(0) for _, kv := range batch { - if len(kv.Key) > math.MaxUint32 { - return fmt.Errorf("key is too large, length must not exceed 2^32 bytes: %d bytes", len(kv.Key)) - } - if len(kv.Value) > math.MaxUint32 { - return fmt.Errorf("value is too large, length must not exceed 2^32 bytes: %d bytes", len(kv.Value)) - } if kv.Key == nil { return fmt.Errorf("nil keys are not supported") } if kv.Value == nil { return fmt.Errorf("nil values are not supported") } + if len(kv.Key) > math.MaxUint16 { + return fmt.Errorf("key is too large, length must not exceed 2^16 bytes: %d bytes", len(kv.Key)) + } + if len(kv.Value) > math.MaxUint32 { + return fmt.Errorf("value is too large, length must not exceed 2^32 bytes: %d bytes", len(kv.Value)) + } + // Validate every secondary key in this request, and detect duplicate keys (primary vs + // secondary, secondary vs secondary) within the request. Cross-request collisions remain + // the caller's responsibility, matching existing semantics for primary keys. + seen := make(map[string]struct{}, 1+len(kv.SecondaryKeys)) + seen[util.UnsafeBytesToString(kv.Key)] = struct{}{} + for _, sk := range kv.SecondaryKeys { + if sk == nil { + return fmt.Errorf("nil secondary key is not supported") + } + if sk.Key == nil { + return fmt.Errorf("nil secondary key bytes are not supported") + } + if len(sk.Key) > math.MaxUint16 { + return fmt.Errorf("secondary key is too large, length must not exceed 2^16 bytes: %d bytes", + len(sk.Key)) + } + end := uint64(sk.Offset) + uint64(sk.Length) + if end > uint64(len(kv.Value)) { + return fmt.Errorf( + "secondary key range [%d, %d) exceeds value length %d", sk.Offset, end, len(kv.Value)) + } + skKey := util.UnsafeBytesToString(sk.Key) + if _, dup := seen[skKey]; dup { + return fmt.Errorf("duplicate key %x within PutRequest", sk.Key) + } + seen[skKey] = struct{}{} + } + + totalKeys += int64(1 + len(kv.SecondaryKeys)) + totalSize += uint64(len(kv.Value)) + } + + if d.metrics != nil { + start := d.clock() + defer func() { + end := d.clock() + delta := end.Sub(start) + d.metrics.ReportWriteOperation(d.name, delta, uint64(totalKeys), totalSize) //nolint:gosec // totalKeys non-negative + }() + } + + // All requests validated. Populate the unflushed data cache: each key (primary or secondary) + // is stored under its own key, with secondaries pointing at a zero-copy sub-slice of the parent + // value. This makes Get/Exists/CacheAwareGet treat secondaries identically to primaries before + // the data is durable. + for _, kv := range batch { d.unflushedDataCache.Store(util.UnsafeBytesToString(kv.Key), kv.Value) + for _, sk := range kv.SecondaryKeys { + d.unflushedDataCache.Store( + util.UnsafeBytesToString(sk.Key), + kv.Value[sk.Offset:sk.Offset+sk.Length]) + } } request := &controlLoopWriteRequest{ @@ -771,7 +814,7 @@ func (d *DiskTable) PutBatch(batch []*types.KVPair) error { return fmt.Errorf("failed to send write request: %w", err) } - d.keyCount.Add(int64(len(batch))) + d.keyCount.Add(totalKeys) return nil } diff --git a/sei-db/db_engine/litt/disktable/disk_table_secondary_keys_test.go b/sei-db/db_engine/litt/disktable/disk_table_secondary_keys_test.go new file mode 100644 index 0000000000..614fdc8bbc --- /dev/null +++ b/sei-db/db_engine/litt/disktable/disk_table_secondary_keys_test.go @@ -0,0 +1,543 @@ +package disktable + +import ( + "fmt" + "log/slog" + "os" + "path" + "path/filepath" + "sync/atomic" + "testing" + "time" + + "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt" + "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/disktable/keymap" + "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/disktable/segment" + "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/types" + "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/util" + "github.com/stretchr/testify/require" +) + +// buildOneShardMemKeyDiskTable creates a disk table with sharding factor 1 so every value lands in +// the same value file. This makes torn-write recovery tests deterministic — we know exactly which +// file to truncate. +func buildOneShardMemKeyDiskTable( + clock func() time.Time, + name string, + paths []string, +) (litt.ManagedTable, error) { + logger := slog.Default() + keymapPath := filepath.Join(paths[0], keymap.KeymapDirectoryName) + keymapTypeFile, err := setupKeymapTypeFile(keymapPath, keymap.MemKeymapType) + if err != nil { + return nil, fmt.Errorf("failed to load keymap type file: %w", err) + } + keys, _, err := keymap.NewMemKeymap(logger, "", true) + if err != nil { + return nil, fmt.Errorf("failed to create keymap: %w", err) + } + config, err := litt.DefaultConfig(paths...) + if err != nil { + return nil, fmt.Errorf("failed to create config: %w", err) + } + config.Clock = clock + config.GCPeriod = time.Millisecond + config.Fsync = false + config.Logger = logger + config.ShardingFactor = 1 + // Pick a target file size large enough that several Puts can co-exist in one segment without + // rotation; the recovery test specifically wants the torn group to share a segment with the + // surviving groups so the all-or-nothing behavior is observable. + config.TargetSegmentFileSize = 1 << 20 + + table, err := NewDiskTable( + config, + name, + keys, + keymapPath, + keymapTypeFile, + paths, + true, + nil, + ) + if err != nil { + return nil, fmt.Errorf("failed to create disk table: %w", err) + } + return table, nil +} + +// This file collects tests specific to the secondary-key API of DiskTable: +// +// * basic reads/exists/key-count semantics for secondaries, both before and after flush, +// * input validation in PutBatch (oversized / nil / out-of-range / duplicate), +// * aliasing the whole value to another key, +// * TTL/GC reaping the primary and its secondaries together, +// * end-to-end recovery proving that a torn final Put loses every key in its group while every +// completed group survives. +// +// The validation, aliasing, KeyCount, restart and recovery tests run against every disk-table +// implementation in tableBuilders. The cached-write-cache test only makes sense for the cached +// variants, so the test bodies skip the other implementations. + +// putBatchSingle is a tiny helper to PutBatch a single PutRequest, which is otherwise verbose. +func putBatchSingle(t *testing.T, table litt.ManagedTable, req *types.PutRequest) { + t.Helper() + require.NoError(t, table.PutBatch([]*types.PutRequest{req})) +} + +// TestSecondaryKeyReadsBeforeAndAfterFlush proves that a secondary key behaves like any other key +// at every read-side boundary. The same Get/Exists call works pre-flush (served from the +// unflushed data cache) and post-flush (served from the keymap + segment Read). +func TestSecondaryKeyReadsBeforeAndAfterFlush(t *testing.T) { + t.Parallel() + for _, tb := range tableBuilders { + tb := tb + t.Run(tb.name, func(t *testing.T) { + t.Parallel() + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + table, err := tb.builder(time.Now, tableName, []string{directory}) + require.NoError(t, err) + + value := []byte("the quick brown fox jumps over the lazy dog") + primary := []byte("primary") + sk1 := &types.SecondaryKey{Key: []byte("quick"), Offset: 4, Length: 5} + sk2 := &types.SecondaryKey{Key: []byte("brown"), Offset: 10, Length: 5} + sk3 := &types.SecondaryKey{Key: []byte("alias"), Offset: 0, Length: uint32(len(value))} + + require.NoError(t, table.Put(primary, value, sk1, sk2, sk3)) + + verify := func(stage string) { + t.Helper() + got, ok, err := table.Get(primary) + require.NoError(t, err, stage) + require.True(t, ok, stage) + require.Equal(t, value, got, stage) + + for _, sk := range []*types.SecondaryKey{sk1, sk2, sk3} { + ok, err := table.Exists(sk.Key) + require.NoError(t, err, stage) + require.True(t, ok, stage) + + got, ok, err := table.Get(sk.Key) + require.NoError(t, err, stage) + require.True(t, ok, stage) + require.Equal(t, value[sk.Offset:sk.Offset+sk.Length], got, stage) + } + + require.EqualValues(t, 4, table.KeyCount(), stage) + } + + verify("before flush") + require.NoError(t, table.Flush()) + verify("after flush") + + require.NoError(t, table.Destroy()) + }) + } +} + +// TestSecondaryKeyValidationErrors verifies that every documented validation rule is enforced and +// that a rejected Put leaves no observable side-effect (KeyCount unchanged). +func TestSecondaryKeyValidationErrors(t *testing.T) { + t.Parallel() + for _, tb := range tableBuilders { + tb := tb + t.Run(tb.name, func(t *testing.T) { + t.Parallel() + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + table, err := tb.builder(time.Now, tableName, []string{directory}) + require.NoError(t, err) + + require.Zero(t, table.KeyCount()) + + value := []byte("hello world") + + // Offset+Length exceeds the value. + err = table.Put([]byte("p1"), value, &types.SecondaryKey{Key: []byte("s1"), Offset: 6, Length: 100}) + require.Error(t, err) + + // nil secondary key bytes. + err = table.Put([]byte("p2"), value, &types.SecondaryKey{Key: nil, Offset: 0, Length: 1}) + require.Error(t, err) + + // secondary key collides with the primary. + err = table.Put([]byte("p3"), value, &types.SecondaryKey{Key: []byte("p3"), Offset: 0, Length: 1}) + require.Error(t, err) + + // two secondaries collide with each other in the same Put. + err = table.Put([]byte("p4"), value, + &types.SecondaryKey{Key: []byte("dup"), Offset: 0, Length: 1}, + &types.SecondaryKey{Key: []byte("dup"), Offset: 1, Length: 1}, + ) + require.Error(t, err) + + // primary key too long. + oversized := make([]byte, 1<<16) + err = table.Put(oversized, value) + require.Error(t, err) + + // secondary key too long. + err = table.Put([]byte("p5"), value, &types.SecondaryKey{Key: oversized, Offset: 0, Length: 1}) + require.Error(t, err) + + // No successful Put happened, so the table must report zero keys. + require.Zero(t, table.KeyCount()) + + require.NoError(t, table.Destroy()) + }) + } +} + +// TestSecondaryKeyAliasing covers the alias-the-whole-value pattern: Put(K, V, {A, 0, len(V)}) → +// Get(K) and Get(A) both return V with KeyCount==2. +func TestSecondaryKeyAliasing(t *testing.T) { + t.Parallel() + for _, tb := range tableBuilders { + tb := tb + t.Run(tb.name, func(t *testing.T) { + t.Parallel() + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + table, err := tb.builder(time.Now, tableName, []string{directory}) + require.NoError(t, err) + + primary := []byte("primary") + alias := []byte("alias") + value := []byte("payload") + require.NoError(t, table.Put(primary, value, + &types.SecondaryKey{Key: alias, Offset: 0, Length: uint32(len(value))})) + require.NoError(t, table.Flush()) + + got, ok, err := table.Get(primary) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, value, got) + + got, ok, err = table.Get(alias) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, value, got) + + require.EqualValues(t, 2, table.KeyCount()) + + require.NoError(t, table.Destroy()) + }) + } +} + +// TestSecondaryKeyTTLGroupExpiration verifies that a primary and all of its secondaries expire +// together: once the TTL window passes for the primary, every secondary becomes unreachable on +// the same GC pass. The buildMemKeyDiskTableSingleShard test config uses TargetSegmentFileSize=100, +// so writing a few hundred bytes of cycler data is enough to rotate past the segment that holds +// our group; once the old segment is sealed and its lastValueTimestamp is older than the TTL, GC +// reaps it. +func TestSecondaryKeyTTLGroupExpiration(t *testing.T) { + t.Parallel() + + rand := util.NewTestRandom() + directory := t.TempDir() + + startTime := rand.Time() + var fakeTime atomic.Pointer[time.Time] + fakeTime.Store(&startTime) + clock := func() time.Time { return *fakeTime.Load() } + + tableName := rand.String(8) + table, err := buildMemKeyDiskTableSingleShard(clock, tableName, []string{directory}) + require.NoError(t, err) + + ttl := 30 * time.Second + require.NoError(t, table.SetTTL(ttl)) + + value := []byte("hello world") + require.NoError(t, table.Put([]byte("primary"), value, + &types.SecondaryKey{Key: []byte("hello"), Offset: 0, Length: 5}, + &types.SecondaryKey{Key: []byte("world"), Offset: 6, Length: 5}, + )) + require.NoError(t, table.Flush()) + require.EqualValues(t, 3, table.KeyCount()) + + // Write enough additional data to push the group's segment past TargetSegmentFileSize (100) + // and force a rotation, so the group's segment becomes sealed and therefore reapable. + for i := 0; i < 50; i++ { + key := []byte(fmt.Sprintf("filler-%03d", i)) + require.NoError(t, table.Put(key, make([]byte, 16))) + } + require.NoError(t, table.Flush()) + + // Advance the clock past the TTL and trigger GC. + advanced := startTime.Add(2 * ttl) + fakeTime.Store(&advanced) + + // One more Put + Flush after the clock advance so the GC pass sees the new lastValueTimestamp + // on any active segment. + require.NoError(t, table.Put([]byte("post-advance"), []byte("trigger"))) + require.NoError(t, table.Flush()) + + require.NoError(t, table.(*DiskTable).RunGC()) + + // Wait for the GC pass to reap the expired segment. + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + ok, err := table.Exists([]byte("primary")) + require.NoError(t, err) + if !ok { + break + } + time.Sleep(20 * time.Millisecond) + require.NoError(t, table.(*DiskTable).RunGC()) + } + + // Primary and all secondaries are reaped together. + for _, key := range [][]byte{[]byte("primary"), []byte("hello"), []byte("world")} { + ok, err := table.Exists(key) + require.NoError(t, err) + require.False(t, ok, "expected expired key %q to be gone", key) + } + + require.NoError(t, table.Destroy()) +} + +// restartWithSecondariesTest exercises the table-restart code path with a workload that mixes +// 0-3 secondaries per Put. After restart every primary AND every secondary must still be +// readable. This is the disk-table-level analogue of the existing TestRestart, and pins down the +// keymap-reload behavior for the new ScopedKey.Kind field. +func restartWithSecondariesTest(t *testing.T, tableBuilder *tableBuilder) { + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + table, err := tableBuilder.builder(time.Now, tableName, []string{directory}) + require.NoError(t, err) + + // keyToValue holds the expected bytes for each surviving key (primary OR secondary). + keyToValue := make(map[string][]byte) + + iterations := 200 + restartIteration := iterations / 2 + for i := 0; i < iterations; i++ { + if i == restartIteration { + require.NoError(t, table.Close()) + table, err = tableBuilder.builder(time.Now, tableName, []string{directory}) + require.NoError(t, err) + for k, v := range keyToValue { + got, ok, err := table.Get([]byte(k)) + require.NoError(t, err) + require.True(t, ok, "key %q lost across restart", k) + require.Equal(t, v, got) + } + } + + primary := rand.PrintableVariableBytes(16, 32) + value := rand.PrintableVariableBytes(8, 64) + + // 0-3 secondaries; offsets/lengths chosen to span both strict sub-ranges and the whole + // value. + nSecondaries := int(rand.Int32Range(0, 4)) + secondaries := make([]*types.SecondaryKey, 0, nSecondaries) + for s := 0; s < nSecondaries; s++ { + offset := uint32(rand.Int32Range(0, int32(len(value)))) + maxLen := uint32(len(value)) - offset + if maxLen == 0 { + continue + } + length := uint32(rand.Int32Range(1, int32(maxLen+1))) + skKey := rand.PrintableVariableBytes(16, 32) + if _, exists := keyToValue[string(skKey)]; exists { + continue + } + secondaries = append(secondaries, &types.SecondaryKey{Key: skKey, Offset: offset, Length: length}) + } + + require.NoError(t, table.Put(primary, value, secondaries...)) + keyToValue[string(primary)] = value + for _, sk := range secondaries { + keyToValue[string(sk.Key)] = value[sk.Offset : sk.Offset+sk.Length] + } + + if rand.BoolWithProbability(0.1) { + require.NoError(t, table.Flush()) + } + } + + require.NoError(t, table.Flush()) + for k, v := range keyToValue { + got, ok, err := table.Get([]byte(k)) + require.NoError(t, err) + require.True(t, ok, "key %q missing at end of test", k) + require.Equal(t, v, got) + } + + require.NoError(t, table.Destroy()) +} + +func TestRestartWithSecondaries(t *testing.T) { + t.Parallel() + for _, tb := range tableBuilders { + tb := tb + t.Run(tb.name, func(t *testing.T) { + t.Parallel() + restartWithSecondariesTest(t, tb) + }) + } +} + +// TestGroupAtomicRecoveryEndToEnd is the high-level analogue of the segment-level +// TestSealLoadedSegmentGroupAtomicity: a torn final Put loses every key in its group while every +// completed Put survives. We drive this through DiskTable's public API to make sure the group +// atomicity invariant survives the keymap reload that happens at startup. +// +// The test runs only against MemKeyDiskTableSingleShard, where we know the segment layout exactly +// so we can corrupt it deterministically. The recovery contract is the same for the other disk +// table flavors (they share the same segment package) but corrupting a multi-shard layout would +// require figuring out which shard hosted the torn write. +func TestGroupAtomicRecoveryEndToEnd(t *testing.T) { + t.Parallel() + + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + + table, err := buildOneShardMemKeyDiskTable(time.Now, tableName, []string{directory}) + require.NoError(t, err) + + // Two completed Puts, then a third Put whose value we will truncate. We Flush after each to + // move the writes through the value file's flushedSize boundary. + require.NoError(t, table.Put([]byte("survivor-1"), []byte("v1"))) + require.NoError(t, table.Flush()) + + require.NoError(t, table.Put([]byte("survivor-primary"), []byte("hello"), + &types.SecondaryKey{Key: []byte("survivor-secondary"), Offset: 0, Length: 5}, + )) + require.NoError(t, table.Flush()) + + require.NoError(t, table.Put([]byte("torn-primary"), []byte("worldwide"), + &types.SecondaryKey{Key: []byte("torn-secondary"), Offset: 0, Length: 5}, + )) + require.NoError(t, table.Flush()) + + require.NoError(t, table.Close()) + + // Find the segment that holds the torn write: it's the highest-indexed segment whose value + // file is non-empty. (Disk table may rotate to a fresh segment after each flush, so the very + // latest metadata may belong to an empty rollover segment.) + segmentDir := findLatestSegmentDir(t, directory, tableName) + require.NotEmpty(t, segmentDir) + segIdx := segmentIndexWithLargestValueFile(t, segmentDir) + + // Truncate the value file so the primary's tail goes missing. The secondary at the front + // would individually fit, but group-atomic recovery drops it anyway. + valPath := path.Join(segmentDir, fmt.Sprintf("%d-0%s", segIdx, segment.ValuesFileExtension)) + data, err := os.ReadFile(valPath) + require.NoError(t, err) + require.GreaterOrEqual(t, len(data), 3) + require.NoError(t, os.WriteFile(valPath, data[:len(data)-3], 0600)) + + // Flip the metadata's sealed byte from 1 back to 0 to simulate a crash before sealing. This + // is what makes LoadSegment run the recovery path on reopen. + metaPath := path.Join(segmentDir, fmt.Sprintf("%d%s", segIdx, segment.MetadataFileExtension)) + mdBytes, err := os.ReadFile(metaPath) + require.NoError(t, err) + require.Equal(t, segment.V3MetadataSize, len(mdBytes)) + mdBytes[segment.V3MetadataSize-1] = 0 + require.NoError(t, os.WriteFile(metaPath, mdBytes, 0600)) + + // Reopen. + table, err = buildOneShardMemKeyDiskTable(time.Now, tableName, []string{directory}) + require.NoError(t, err) + + // Survivors remain. + for _, key := range [][]byte{ + []byte("survivor-1"), + []byte("survivor-primary"), + []byte("survivor-secondary"), + } { + ok, err := table.Exists(key) + require.NoError(t, err) + require.True(t, ok, "expected %q to survive recovery", key) + } + + // Torn group is gone, both primary and secondary. + for _, key := range [][]byte{[]byte("torn-primary"), []byte("torn-secondary")} { + ok, err := table.Exists(key) + require.NoError(t, err) + require.False(t, ok, "expected %q to be discarded by recovery", key) + } + + require.NoError(t, table.Destroy()) +} + +// findLatestSegmentDir locates the segment directory created by the single-shard mem-keymap disk +// table at the given root. The directory layout is +// //segments/, with each segment occupying a triple of files prefixed by its +// segment index. We return the segments directory itself; the test then walks its files to find +// the highest-indexed segment. +func findLatestSegmentDir(t *testing.T, root, tableName string) string { + t.Helper() + segmentsDir := filepath.Join(root, tableName, "segments") + info, err := os.Stat(segmentsDir) + require.NoError(t, err) + require.True(t, info.IsDir()) + return segmentsDir +} + +// segmentIndexWithLargestValueFile walks the segments directory and returns the index of the +// segment whose value file is the largest. The torn write we corrupt in the recovery test always +// lives in the segment with the most value bytes (the one we wrote into most recently); ignoring +// rollover-only segments (which may exist after a Close) makes the test robust to whatever the +// disk table happens to do at shutdown. +func segmentIndexWithLargestValueFile(t *testing.T, segmentsDir string) uint32 { + t.Helper() + entries, err := os.ReadDir(segmentsDir) + require.NoError(t, err) + + var bestIdx uint32 + var bestSize int64 = -1 + for _, e := range entries { + name := e.Name() + const suffix = segment.ValuesFileExtension + if len(name) < len(suffix) || name[len(name)-len(suffix):] != suffix { + continue + } + // value files are named "-.values"; we always corrupt shard 0 so we only + // consider files whose shard portion is "0". + stripped := name[:len(name)-len(suffix)] + dash := -1 + for i := len(stripped) - 1; i >= 0; i-- { + if stripped[i] == '-' { + dash = i + break + } + } + require.GreaterOrEqual(t, dash, 0) + if stripped[dash+1:] != "0" { + continue + } + idx, err := parseUint32(stripped[:dash]) + require.NoError(t, err) + + info, err := e.Info() + require.NoError(t, err) + if info.Size() > bestSize { + bestSize = info.Size() + bestIdx = idx + } + } + require.GreaterOrEqual(t, bestSize, int64(0), "no value files found in %s", segmentsDir) + return bestIdx +} + +func parseUint32(s string) (uint32, error) { + var n uint32 + for _, r := range s { + if r < '0' || r > '9' { + return 0, fmt.Errorf("not a number: %q", s) + } + n = n*10 + uint32(r-'0') + } + return n, nil +} diff --git a/sei-db/db_engine/litt/disktable/disk_table_test.go b/sei-db/db_engine/litt/disktable/disk_table_test.go index 3ab9418ce6..360431f7a1 100644 --- a/sei-db/db_engine/litt/disktable/disk_table_test.go +++ b/sei-db/db_engine/litt/disktable/disk_table_test.go @@ -315,11 +315,11 @@ func restartTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -407,11 +407,11 @@ func middleFileMissingTest(t *testing.T, tableBuilder *tableBuilder, typeToDelet require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -529,11 +529,11 @@ func initialFileMissingTest(t *testing.T, tableBuilder *tableBuilder, typeToDele require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -625,11 +625,11 @@ func initialFileMissingTest(t *testing.T, tableBuilder *tableBuilder, typeToDele require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -721,11 +721,11 @@ func lastFileMissingTest(t *testing.T, tableBuilder *tableBuilder, typeToDelete require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -824,11 +824,11 @@ func lastFileMissingTest(t *testing.T, tableBuilder *tableBuilder, typeToDelete require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -919,11 +919,11 @@ func truncatedKeyFileTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1060,11 +1060,11 @@ func truncatedKeyFileTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1149,11 +1149,11 @@ func truncatedValueFileTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1305,11 +1305,11 @@ func truncatedValueFileTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1395,11 +1395,11 @@ func unflushedKeysTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1515,11 +1515,11 @@ func unflushedKeysTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1790,11 +1790,11 @@ func restartWithMultipleStorageDirectoriesTest(t *testing.T, tableBuilder *table require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -1976,11 +1976,11 @@ func changingShardingFactorTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -2099,11 +2099,11 @@ func tableSizeTest(t *testing.T, tableBuilder *tableBuilder) { expectedValues[string(key)] = value creationTimes[string(key)] = newTime } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value creationTimes[string(key)] = newTime } diff --git a/sei-db/db_engine/litt/disktable/segment/key_file.go b/sei-db/db_engine/litt/disktable/segment/key_file.go index 7bc4684abe..9ca967943b 100644 --- a/sei-db/db_engine/litt/disktable/segment/key_file.go +++ b/sei-db/db_engine/litt/disktable/segment/key_file.go @@ -169,14 +169,32 @@ func (k *keyFile) atomicSwap(sync bool) error { return nil } +// KeyRecordHeaderSize is the on-disk size of the fixed-width portion of a key-file record that +// precedes the variable-length key bytes: one byte of KeyKind followed by a uint16 key-length prefix. +const KeyRecordHeaderSize = 3 + +// MaxKeyLength is the maximum permitted length of a key in bytes. The key file stores the key +// length as a uint16, which is more than enough headroom for any realistic key. +const MaxKeyLength = 1<<16 - 1 + // write writes a key to the key file. func (k *keyFile) write(scopedKey *types.ScopedKey) error { if k.writer == nil { return fmt.Errorf("key file is sealed") } - // Write the length of the key. - err := binary.Write(k.writer, binary.BigEndian, uint32(len(scopedKey.Key))) //nolint:gosec // key length fits uint32 + if len(scopedKey.Key) > MaxKeyLength { + return fmt.Errorf("key length %d exceeds maximum of %d", len(scopedKey.Key), MaxKeyLength) + } + + // Write the kind byte (1 B). + err := k.writer.WriteByte(byte(scopedKey.Kind)) + if err != nil { + return fmt.Errorf("failed to write kind to key file: %w", err) + } + + // Write the length of the key (2 B, big-endian). + err = binary.Write(k.writer, binary.BigEndian, uint16(len(scopedKey.Key))) //nolint:gosec // bounded above if err != nil { return fmt.Errorf("failed to write key length to key file: %w", err) } @@ -194,7 +212,7 @@ func (k *keyFile) write(scopedKey *types.ScopedKey) error { } k.size += uint64( //nolint:gosec // sizes are non-negative - 4 /* uint32 size of key */ + + KeyRecordHeaderSize + len(scopedKey.Key) + types.AddressSerializedSize) @@ -266,14 +284,13 @@ func (k *keyFile) readKeys() ([]*types.ScopedKey, error) { keys := make([]*types.ScopedKey, 0) index := 0 - for { - // We need at least 4 bytes to read the length of the key. - if index+4 > len(keyBytes) { //nolint:staticcheck // QF1006 - // There are fewer than 4 bytes left in the file. - break - } - keyLength := int(binary.BigEndian.Uint32(keyBytes[index : index+4])) - index += 4 + // We need the fixed-width header (kind + uint16 key length) before we can decide whether the + // next record fits. + for index+KeyRecordHeaderSize <= len(keyBytes) { + kind := types.KeyKind(keyBytes[index]) + index++ + keyLength := int(binary.BigEndian.Uint16(keyBytes[index : index+2])) + index += 2 // We need to read the key, as well as the serialized address (which embeds the shard ID and value size). if index+keyLength+types.AddressSerializedSize > len(keyBytes) { @@ -293,6 +310,7 @@ func (k *keyFile) readKeys() ([]*types.ScopedKey, error) { keys = append(keys, &types.ScopedKey{ Key: key, Address: address, + Kind: kind, }) } diff --git a/sei-db/db_engine/litt/disktable/segment/key_file_test.go b/sei-db/db_engine/litt/disktable/segment/key_file_test.go index 0ed106e98e..5c2e88f519 100644 --- a/sei-db/db_engine/litt/disktable/segment/key_file_test.go +++ b/sei-db/db_engine/litt/disktable/segment/key_file_test.go @@ -19,6 +19,16 @@ func TestReadWriteKeys(t *testing.T) { index := rand.Uint32() + // Cycle through all four KeyKind values so the on-disk record layout is exercised end-to-end. + // Index 0 has the implicit zero-value (KeyKindStandalone), confirming that a ScopedKey with + // no explicit Kind round-trips as a Standalone primary. + kinds := []types.KeyKind{ + types.KeyKindStandalone, + types.KeyKindPrimary, + types.KeyKindSecondary, + types.KeyKindFinalSecondary, + } + keyCount := rand.Int32Range(100, 200) keys := make([]*types.ScopedKey, keyCount) for i := 0; i < int(keyCount); i++ { @@ -29,7 +39,12 @@ func TestReadWriteKeys(t *testing.T) { uint8(rand.Uint32Range(0, 256)), rand.Uint32(), ) - keys[i] = &types.ScopedKey{Key: key, Address: address} + // First record left implicit (Kind defaults to zero/Standalone); the rest rotate. + var kind types.KeyKind + if i != 0 { + kind = kinds[i%len(kinds)] + } + keys[i] = &types.ScopedKey{Key: key, Address: address, Kind: kind} } segmentPath, err := NewSegmentPath(directory, "", "table") diff --git a/sei-db/db_engine/litt/disktable/segment/metadata_file.go b/sei-db/db_engine/litt/disktable/segment/metadata_file.go index fe6d950e0c..1f53836ba3 100644 --- a/sei-db/db_engine/litt/disktable/segment/metadata_file.go +++ b/sei-db/db_engine/litt/disktable/segment/metadata_file.go @@ -22,7 +22,8 @@ const ( // deleted. MetadataSwapExtension = MetadataFileExtension + util.SwapFileExtension - // V3MetadataSize is the size of the metadata file at LatestSegmentVersion (ShardedAddressSegmentVersion). + // V3MetadataSize is the size of the metadata file at the current LatestSegmentVersion (the name + // is kept for backwards compatibility; the metadata layout has not actually changed since v3). // Layout: // - 4 bytes for version // - 1 byte for the sharding factor diff --git a/sei-db/db_engine/litt/disktable/segment/segment.go b/sei-db/db_engine/litt/disktable/segment/segment.go index 06a51a416e..203bd65a0b 100644 --- a/sei-db/db_engine/litt/disktable/segment/segment.go +++ b/sei-db/db_engine/litt/disktable/segment/segment.go @@ -261,40 +261,100 @@ func (s *Segment) SegmentIndex() uint32 { // sealLoadedSegment is responsible for sealing a segment loaded from disk that is not already sealed. // While doing this, it is responsible for making the key file consistent with the values present in the // value files. +// +// Recovery is "group-atomic": every Put that wrote 1+N key file records (one primary + N secondaries) +// is either kept whole or dropped whole. A group is kept iff (1) its closing record +// (KeyKindStandalone for a 0-secondary Put, or KeyKindFinalSecondary for an N>=1 Put) is present in +// the key file, and (2) every address in the group fits within the flushed bytes of its value file. +// Any other state (partial keyfile record, primary without a closing terminator, stray secondary not +// preceded by a primary, value-file truncated mid-group) results in the entire group being discarded. func (s *Segment) sealLoadedSegment(now time.Time) error { scopedKeys, err := s.keys.readKeys() if err != nil { return fmt.Errorf("failed to read keys: %w", err) } - // keys with values that are not present in the value files + // keys belonging to groups that passed both key-file and value-file completeness checks goodKeys := make([]*types.ScopedKey, 0, len(scopedKeys)) - // keys with values that weren't flushed out to the value files before the DB crashed + // keys belonging to groups that were torn (either mid-key-file or mid-value-file) badKeys := make([]*types.ScopedKey, 0, len(scopedKeys)) + // commitGroup applies the all-or-nothing value-file completeness check to a group's keys and + // routes them to goodKeys or badKeys accordingly. A group survives only if every address in it + // is fully present in its shard's value file. + commitGroup := func(group []*types.ScopedKey) { + if len(group) == 0 { + return + } + for _, sk := range group { + shard := sk.Address.ShardID() + end := uint64(sk.Address.Offset()) + uint64(sk.Address.ValueSize()) + if s.shards[shard].Size() < end { + badKeys = append(badKeys, group...) + return + } + } + goodKeys = append(goodKeys, group...) + } + + // Validate shard IDs up front: a shard ID beyond the segment's sharding factor cannot come from + // normal operation, so we treat it as disk corruption and refuse to seal the segment rather + // than risk silently dropping data. for _, scopedKey := range scopedKeys { shard := scopedKey.Address.ShardID() - if int(shard) >= len(s.shards) { - // A shard ID that exceeds the segment's sharding factor cannot be the result of normal - // operation, so treat it as disk corruption and refuse to seal the segment. Recovery here - // would risk silently dropping data; require human intervention instead. return fmt.Errorf( "segment %d has key with shard ID %d outside sharding factor %d: data corruption detected", s.index, shard, len(s.shards)) } + } - requiredValueFileLength := uint64(scopedKey.Address.Offset()) + - 4 /* value size uint32 */ + - uint64(scopedKey.Address.ValueSize()) - - if s.shards[shard].Size() < requiredValueFileLength { - badKeys = append(badKeys, scopedKey) - } else { - goodKeys = append(goodKeys, scopedKey) + // Walk records in order, accumulating a group buffer that we commit on each terminator. + var currentGroup []*types.ScopedKey + for _, scopedKey := range scopedKeys { + switch scopedKey.Kind { + case types.KeyKindStandalone: + // A standalone primary closes its group immediately. Any in-flight group (which would + // indicate a torn primary-with-secondaries write that was followed by a fresh + // standalone) is dropped. + if len(currentGroup) > 0 { + badKeys = append(badKeys, currentGroup...) + currentGroup = nil + } + commitGroup([]*types.ScopedKey{scopedKey}) + case types.KeyKindPrimary: + // Starting a new group. Any in-flight group is torn. + if len(currentGroup) > 0 { + badKeys = append(badKeys, currentGroup...) + currentGroup = nil + } + currentGroup = append(currentGroup, scopedKey) + case types.KeyKindSecondary: + // A secondary that is not preceded by a primary is a stray record (its primary was torn + // off the front of the file or never written). Drop it. Otherwise, accumulate. + if len(currentGroup) == 0 { + badKeys = append(badKeys, scopedKey) + } else { + currentGroup = append(currentGroup, scopedKey) + } + case types.KeyKindFinalSecondary: + if len(currentGroup) == 0 { + badKeys = append(badKeys, scopedKey) + } else { + currentGroup = append(currentGroup, scopedKey) + commitGroup(currentGroup) + currentGroup = nil + } + default: + return fmt.Errorf("segment %d has key file record with unknown kind %d: data corruption detected", + s.index, scopedKey.Kind) } } + // A group that was never closed (the file ended before its FinalSecondary was written) is torn. + if len(currentGroup) > 0 { + badKeys = append(badKeys, currentGroup...) + } if len(badKeys) > 0 { // We have at least one bad key. Rewrite the keyfile with only the good keys. @@ -395,18 +455,25 @@ func (s *Segment) SetNextSegment(nextSegment *Segment) { s.nextSegment = nextSegment } -// Write records a key-value pair in the data segment, returning the maximum size of all shards within this segment. +// Write records a key-value pair (with optional secondary keys) in the data segment, returning the +// running key count and key-file size of the segment. // -// This method does not ensure that the key-value pair is actually written to disk, only that it will eventually be -// written to disk. Flush must be called to ensure that all data previously passed to Write is written to disk. -func (s *Segment) Write(data *types.KVPair) (keyCount uint32, keyFileSize uint64, err error) { +// This method does not ensure that the key-value pair is actually written to disk, only that it will +// eventually be written to disk. Flush must be called to ensure that all data previously passed to +// Write is written to disk. +// +// The primary key and all of its secondary keys are written contiguously to the key file in a single +// "group": the primary first, followed by each secondary in order. The kind tag on the primary +// (KeyKindStandalone vs. KeyKindPrimary) and on the last secondary (KeyKindFinalSecondary) is what +// lets recovery distinguish a fully-written group from a torn write. +func (s *Segment) Write(data *types.PutRequest) (keyCount uint32, keyFileSize uint64, err error) { if s.metadata.sealed { return 0, 0, fmt.Errorf("segment is sealed, cannot write data") } - // Shard assignment is round-robin: each successive call deposits the value into the next shard, wrapping around - // after metadata.shardingFactor calls. This is safe to do without locking because Write is invoked exclusively - // from the disk_table control loop goroutine. + // Shard assignment is round-robin: each successive call deposits the value into the next shard, + // wrapping around after metadata.shardingFactor calls. This is safe to do without locking + // because Write is invoked exclusively from the disk_table control loop goroutine. shard := s.nextShard s.nextShard++ if s.nextShard == s.metadata.shardingFactor { @@ -421,15 +488,45 @@ func (s *Segment) Write(data *types.KVPair) (keyCount uint32, keyFileSize uint64 return 0, 0, fmt.Errorf("value file already contains %d bytes, cannot add a new value", currentSize) } - s.unflushedKeyCount.Add(1) firstByteIndex := uint32(currentSize) + valueLen := uint64(len(data.Value)) + if uint64(firstByteIndex)+valueLen > math.MaxUint32 { + return 0, 0, + fmt.Errorf("value of length %d would push value file past 2^32 bytes (current size %d)", + valueLen, currentSize) + } + + // Validate every secondary key's address fits in uint32 *before* sending anything, so we never + // produce a partial write. + for _, sk := range data.SecondaryKeys { + end := uint64(firstByteIndex) + uint64(sk.Offset) + uint64(sk.Length) + if end > math.MaxUint32 { + return 0, 0, + fmt.Errorf("secondary key range [%d, %d) would exceed 2^32 byte addressable range", + uint64(firstByteIndex)+uint64(sk.Offset), end) + } + } + + n := len(data.SecondaryKeys) + totalKeys := uint32(1 + n) //nolint:gosec // n bounded by caller validation - s.shardSizes[shard] += uint64(len(data.Value)) + 4 /* uint32 length */ + // Determine kind of the primary key based on whether secondaries follow it. + primaryKind := types.KeyKindStandalone + if n > 0 { + primaryKind = types.KeyKindPrimary + } + + // Update accounting before sending so that callers observe consistent state. + s.unflushedKeyCount.Add(int64(totalKeys)) + s.shardSizes[shard] += valueLen if s.shardSizes[shard] > s.maxShardSize { s.maxShardSize = s.shardSizes[shard] } - s.keyCount++ - s.keyFileSize += uint64(len(data.Key)) + 4 /* uint32 length */ + types.AddressSerializedSize + s.keyCount += totalKeys + s.keyFileSize += keyRecordSize(data.Key) + for _, sk := range data.SecondaryKeys { + s.keyFileSize += keyRecordSize(sk.Key) + } // Forward the value to the shard control loop, which asynchronously writes it to the value file. shardRequest := &valueToWrite{ @@ -442,21 +539,45 @@ func (s *Segment) Write(data *types.KVPair) (keyCount uint32, keyFileSize uint64 fmt.Errorf("failed to send value to shard control loop: %v", err) } - // Forward the value to the key and its address file control loop, which asynchronously writes it to the key file. - keyRequest := &types.ScopedKey{ + // Forward the primary key to the key file control loop, which asynchronously writes it to the + // key file. Primary always goes first; recovery relies on this ordering. + primaryRequest := &types.ScopedKey{ Key: data.Key, - Address: types.NewAddress(s.index, firstByteIndex, shard, uint32(len(data.Value))), //nolint:gosec // value len fits uint32 + Address: types.NewAddress(s.index, firstByteIndex, shard, uint32(valueLen)), //nolint:gosec // bounded above + Kind: primaryKind, } - - err = util.Send(s.errorMonitor, s.keyFileChannel, keyRequest) + err = util.Send(s.errorMonitor, s.keyFileChannel, primaryRequest) if err != nil { return 0, 0, fmt.Errorf("failed to send key to key file control loop: %v", err) } + for i, sk := range data.SecondaryKeys { + kind := types.KeyKindSecondary + if i == n-1 { + kind = types.KeyKindFinalSecondary + } + secondaryRequest := &types.ScopedKey{ + Key: sk.Key, + Address: types.NewAddress(s.index, firstByteIndex+sk.Offset, shard, sk.Length), + Kind: kind, + } + err = util.Send(s.errorMonitor, s.keyFileChannel, secondaryRequest) + if err != nil { + return 0, 0, fmt.Errorf("failed to send secondary key to key file control loop: %v", err) + } + } + return s.keyCount, s.keyFileSize, nil } +// keyRecordSize returns the number of bytes a key file record consumes given a key of the supplied +// length. Includes the kind byte (1), the uint16 key-length prefix (2), the key bytes, and the +// fixed-width serialized address. +func keyRecordSize(key []byte) uint64 { + return uint64(KeyRecordHeaderSize) + uint64(len(key)) + uint64(types.AddressSerializedSize) //nolint:gosec // sizes non-negative +} + // GetMaxShardSize returns the maximum size of all shards in this segment. func (s *Segment) GetMaxShardSize() uint64 { return s.maxShardSize @@ -482,7 +603,7 @@ func (s *Segment) Read(key []byte, dataAddress types.Address) ([]byte, error) { return nil, fmt.Errorf("failed to resolve shard for read: %w", err) } - value, err := values.read(dataAddress.Offset()) + value, err := values.read(dataAddress.Offset(), dataAddress.ValueSize()) if err != nil { return nil, fmt.Errorf("failed to read value: %w", err) } diff --git a/sei-db/db_engine/litt/disktable/segment/segment_test.go b/sei-db/db_engine/litt/disktable/segment/segment_test.go index cec1f494e6..8fc86c1e67 100644 --- a/sei-db/db_engine/litt/disktable/segment/segment_test.go +++ b/sei-db/db_engine/litt/disktable/segment/segment_test.go @@ -2,8 +2,10 @@ package segment import ( "bytes" + "fmt" "log/slog" "os" + "path" "sort" "testing" "time" @@ -67,9 +69,9 @@ func TestWriteAndReadSegmentSingleShard(t *testing.T) { value := values[i] expectedValues[string(key)] = value - expectedLargestShardSize += uint64(len(value)) + 4 /* uint32 length */ + expectedLargestShardSize += uint64(len(value)) - _, _, err := seg.Write(&types.KVPair{Key: key, Value: value}) + _, _, err := seg.Write(&types.PutRequest{Key: key, Value: value}) largestShardSize := seg.GetMaxShardSize() require.NoError(t, err) require.Equal(t, expectedLargestShardSize, largestShardSize) @@ -216,10 +218,10 @@ func TestWriteAndReadSegmentMultiShard(t *testing.T) { value := values[i] expectedValues[string(key)] = value - _, _, err := seg.Write(&types.KVPair{Key: key, Value: value}) + _, _, err := seg.Write(&types.PutRequest{Key: key, Value: value}) require.NoError(t, err) largestShardSize := seg.GetMaxShardSize() - require.True(t, largestShardSize >= uint64(len(value)+4)) + require.True(t, largestShardSize >= uint64(len(value))) // Occasionally flush the segment to disk. if rand.BoolWithProbability(0.25) { @@ -374,10 +376,10 @@ func TestWriteAndReadColdShard(t *testing.T) { value := values[i] expectedValues[string(key)] = value - _, _, err := seg.Write(&types.KVPair{Key: key, Value: value}) + _, _, err := seg.Write(&types.PutRequest{Key: key, Value: value}) require.NoError(t, err) largestShardSize := seg.GetMaxShardSize() - require.True(t, largestShardSize >= uint64(len(value)+4)) + require.True(t, largestShardSize >= uint64(len(value))) } // Seal the segment and read all keys and values. @@ -549,7 +551,7 @@ func TestRoundRobinShardAssignment(t *testing.T) { for i := 0; i < valueCount; i++ { key := rand.PrintableVariableBytes(8, 32) value := rand.PrintableVariableBytes(8, 32) - _, _, err := seg.Write(&types.KVPair{Key: key, Value: value}) + _, _, err := seg.Write(&types.PutRequest{Key: key, Value: value}) require.NoError(t, err) flushFn, err := seg.Flush() @@ -580,3 +582,437 @@ func TestRoundRobinShardAssignment(t *testing.T) { "shard %d received %d values, expected %d", s, perShardCounts[s], valuesPerShard) } } + +// writeNoErr is a tiny wrapper that asserts seg.Write succeeded. seg.Write returns three values, so +// we cannot pass its result directly to require.NoError. +func writeNoErr(t *testing.T, seg *Segment, req *types.PutRequest) { + t.Helper() + _, _, err := seg.Write(req) + require.NoError(t, err) +} + +// newSingleShardSegment is a small test helper that creates a fresh single-shard segment for tests +// that need to control on-disk layout exactly. It returns the segment and the segment path so the +// caller can locate the on-disk files after the segment is sealed. +func newSingleShardSegment(t *testing.T) (*Segment, *SegmentPath, uint32) { + t.Helper() + rand := util.NewTestRandom() + logger := slog.Default() + directory := t.TempDir() + index := rand.Uint32() + + segmentPath, err := NewSegmentPath(directory, "", "table") + require.NoError(t, err) + require.NoError(t, segmentPath.MakeDirectories(false)) + + seg, err := CreateSegment( + logger, + util.NewErrorMonitor(t.Context(), logger, nil), + index, + []*SegmentPath{segmentPath}, + false, + 1, + false, + ) + require.NoError(t, err) + return seg, segmentPath, index +} + +// keysByKey indexes a slice of ScopedKey by key bytes for easier lookup. +func keysByKey(keys []*types.ScopedKey) map[string]*types.ScopedKey { + out := make(map[string]*types.ScopedKey, len(keys)) + for _, k := range keys { + out[string(k.Key)] = k + } + return out +} + +// TestSegmentSecondaryKeyAddresses verifies that a Put with a primary plus several secondaries +// produces one ScopedKey per key, that each Address reads back the correct (sub-)range of the +// stored value, that the per-record Kind tags match the group structure, and that a Put with no +// secondaries emits a single Standalone record. +func TestSegmentSecondaryKeyAddresses(t *testing.T) { + t.Parallel() + + value := []byte("the quick brown fox jumps over the lazy dog") + primaryKey := []byte("primary") + // Mix of strict sub-range secondaries and one alias-the-whole-value secondary. + sk1 := &types.SecondaryKey{Key: []byte("quick"), Offset: 4, Length: 5} // "quick" + sk2 := &types.SecondaryKey{Key: []byte("brown"), Offset: 10, Length: 5} // "brown" + sk3 := &types.SecondaryKey{Key: []byte("whole"), Offset: 0, Length: uint32(len(value))} + standaloneKey := []byte("standalone") + standaloneValue := []byte("no-secondaries-here") + + seg, _, _ := newSingleShardSegment(t) + + _, _, err := seg.Write(&types.PutRequest{ + Key: primaryKey, + Value: value, + SecondaryKeys: []*types.SecondaryKey{sk1, sk2, sk3}, + }) + require.NoError(t, err) + + _, _, err = seg.Write(&types.PutRequest{Key: standaloneKey, Value: standaloneValue}) + require.NoError(t, err) + + flushedKeys, err := seg.Seal(time.Now()) + require.NoError(t, err) + require.Len(t, flushedKeys, 5) + + byKey := keysByKey(flushedKeys) + + // Primary readback. + primary := byKey[string(primaryKey)] + require.NotNil(t, primary) + require.Equal(t, types.KeyKindPrimary, primary.Kind) + got, err := seg.Read(primary.Key, primary.Address) + require.NoError(t, err) + require.Equal(t, value, got) + + // Secondary readback. + for i, sk := range []*types.SecondaryKey{sk1, sk2, sk3} { + entry := byKey[string(sk.Key)] + require.NotNil(t, entry, "secondary %d missing from flushed keys", i) + require.Equal(t, sk.Length, entry.Address.ValueSize()) + got, err := seg.Read(entry.Key, entry.Address) + require.NoError(t, err) + require.Equal(t, value[sk.Offset:sk.Offset+sk.Length], got) + } + + // Kind tagging on the group: middle secondaries are KeyKindSecondary, last is FinalSecondary. + require.Equal(t, types.KeyKindSecondary, byKey["quick"].Kind) + require.Equal(t, types.KeyKindSecondary, byKey["brown"].Kind) + require.Equal(t, types.KeyKindFinalSecondary, byKey["whole"].Kind) + + // Standalone Put: single record tagged KeyKindStandalone. + standalone := byKey[string(standaloneKey)] + require.NotNil(t, standalone) + require.Equal(t, types.KeyKindStandalone, standalone.Kind) + got, err = seg.Read(standalone.Key, standalone.Address) + require.NoError(t, err) + require.Equal(t, standaloneValue, got) +} + +// TestKeyFileKindRoundTrip writes one of each KeyKind through Segment.Write, seals, reloads via +// LoadSegment, and verifies via GetKeys that the on-disk record kinds round-trip exactly. This +// locks in the on-disk byte ordering for the future "last-durable-primary" iteration PR. +func TestKeyFileKindRoundTrip(t *testing.T) { + t.Parallel() + + logger := slog.Default() + seg, segmentPath, index := newSingleShardSegment(t) + + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("standalone"), + Value: []byte("v0"), + }) + + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("p1"), + Value: []byte("hello world"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("hello"), Offset: 0, Length: 5}, + }, + }) + + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("p2"), + Value: []byte("alphabet"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("alpha"), Offset: 0, Length: 5}, + {Key: []byte("bet"), Offset: 5, Length: 3}, + }, + }) + + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + + // Reload from disk and verify the on-disk record kinds. + seg2, err := LoadSegment( + logger, + util.NewErrorMonitor(t.Context(), logger, nil), + index, + []*SegmentPath{segmentPath}, + false, + time.Now(), + false, + ) + require.NoError(t, err) + + keys, err := seg2.GetKeys() + require.NoError(t, err) + require.Len(t, keys, 6) + + // Record order is insertion order within the single key file goroutine. + expected := []struct { + key string + kind types.KeyKind + }{ + {"standalone", types.KeyKindStandalone}, + {"p1", types.KeyKindPrimary}, + {"hello", types.KeyKindFinalSecondary}, + {"p2", types.KeyKindPrimary}, + {"alpha", types.KeyKindSecondary}, + {"bet", types.KeyKindFinalSecondary}, + } + for i, exp := range expected { + require.Equal(t, exp.key, string(keys[i].Key), "record %d key mismatch", i) + require.Equal(t, exp.kind, keys[i].Kind, "record %d kind mismatch (key=%s)", i, exp.key) + } +} + +// markSegmentUnsealed flips the sealed byte on the segment's metadata file from 1 back to 0, +// simulating a segment that crashed before it could write the sealed metadata. We can't use a +// running segment for this because the Seal call is what shuts down the segment's goroutines; the +// pattern is to fully seal, then reach into the file system and corrupt the metadata. +func markSegmentUnsealed(t *testing.T, segmentPath *SegmentPath, index uint32) { + t.Helper() + metaPath := path.Join(segmentPath.SegmentDirectory(), fmt.Sprintf("%d%s", index, MetadataFileExtension)) + data, err := os.ReadFile(metaPath) + require.NoError(t, err) + require.Equal(t, V3MetadataSize, len(data)) + data[V3MetadataSize-1] = 0 + require.NoError(t, os.WriteFile(metaPath, data, 0600)) +} + +// truncateKeyFileBy truncates the segment's key file by `bytes` bytes from the end. +func truncateKeyFileBy(t *testing.T, segmentPath *SegmentPath, index uint32, bytes int) { + t.Helper() + keyPath := path.Join(segmentPath.SegmentDirectory(), fmt.Sprintf("%d%s", index, KeyFileExtension)) + data, err := os.ReadFile(keyPath) + require.NoError(t, err) + require.GreaterOrEqual(t, len(data), bytes) + require.NoError(t, os.WriteFile(keyPath, data[:len(data)-bytes], 0600)) +} + +// truncateValueFileBy truncates the segment's value file for the given shard by `bytes` bytes +// from the end. +func truncateValueFileBy(t *testing.T, segmentPath *SegmentPath, index uint32, shard uint8, bytes int) { + t.Helper() + valPath := path.Join(segmentPath.SegmentDirectory(), fmt.Sprintf("%d-%d%s", index, shard, ValuesFileExtension)) + data, err := os.ReadFile(valPath) + require.NoError(t, err) + require.GreaterOrEqual(t, len(data), bytes) + require.NoError(t, os.WriteFile(valPath, data[:len(data)-bytes], 0600)) +} + +// reloadSegmentExpectingRecovery reloads a segment after corrupting it. Returns the post-recovery +// key list (sorted by insertion order from the key file). +func reloadSegmentExpectingRecovery(t *testing.T, segmentPath *SegmentPath, index uint32) ([]*types.ScopedKey, *Segment) { + t.Helper() + logger := slog.Default() + seg, err := LoadSegment( + logger, + util.NewErrorMonitor(t.Context(), logger, nil), + index, + []*SegmentPath{segmentPath}, + false, + time.Now(), + false, + ) + require.NoError(t, err) + keys, err := seg.GetKeys() + require.NoError(t, err) + return keys, seg +} + +// TestSealLoadedSegmentGroupAtomicity covers all of the torn-write scenarios that +// sealLoadedSegment must handle. Each subtest builds a sealed segment, manually corrupts it on +// disk to simulate a crash mid-write, flips the metadata's sealed bit back to false, then reloads +// and asserts which keys are kept and which are dropped. The "all-or-nothing per group" invariant +// is the property under test. +func TestSealLoadedSegmentGroupAtomicity(t *testing.T) { + t.Parallel() + + // Each test case writes a sequence of PutRequests, then describes how to corrupt the on-disk + // files before recovery. expectedKeys lists the keys (in key-file order) that should survive. + t.Run("clean_standalone_survives", func(t *testing.T) { + t.Parallel() + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{Key: []byte("k1"), Value: []byte("v1")}) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Len(t, keys, 1) + require.Equal(t, "k1", string(keys[0].Key)) + require.Equal(t, types.KeyKindStandalone, keys[0].Kind) + }) + + t.Run("clean_group_survives", func(t *testing.T) { + t.Parallel() + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("p"), + Value: []byte("hello"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("he"), Offset: 0, Length: 2}, + {Key: []byte("llo"), Offset: 2, Length: 3}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Len(t, keys, 3) + require.Equal(t, types.KeyKindPrimary, keys[0].Kind) + require.Equal(t, types.KeyKindSecondary, keys[1].Kind) + require.Equal(t, types.KeyKindFinalSecondary, keys[2].Kind) + }) + + t.Run("primary_without_terminator_discarded", func(t *testing.T) { + t.Parallel() + // A Put of primary + 2 secondaries with the key file truncated such that only the primary + // record remains. The primary has Kind=KeyKindPrimary but no FinalSecondary closes it, so + // the whole group must be discarded. + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("p"), + Value: []byte("hello"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("he"), Offset: 0, Length: 2}, + {Key: []byte("llo"), Offset: 2, Length: 3}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + + secondaryRecBytes := int(keyRecordSize([]byte("he")) + keyRecordSize([]byte("llo"))) + truncateKeyFileBy(t, segmentPath, index, secondaryRecBytes) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Empty(t, keys) + }) + + t.Run("primary_plus_partial_secondaries_discarded", func(t *testing.T) { + t.Parallel() + // Primary + 2 secondaries, key file truncated to drop the FinalSecondary record. Group is + // torn (no closing terminator), discard. + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("p"), + Value: []byte("hello"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("he"), Offset: 0, Length: 2}, + {Key: []byte("llo"), Offset: 2, Length: 3}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + + truncateKeyFileBy(t, segmentPath, index, int(keyRecordSize([]byte("llo")))) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Empty(t, keys) + }) + + t.Run("partial_key_record_discarded", func(t *testing.T) { + t.Parallel() + // Truncate the file mid-record (cut into the middle of a key's bytes). readKeys will stop + // at that point and recovery should not commit the in-flight group. + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("standalone-kept"), + Value: []byte("v0"), + }) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("torn-primary"), + Value: []byte("hello"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("torn-secondary"), Offset: 0, Length: 5}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + + truncateKeyFileBy(t, segmentPath, index, 5) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Len(t, keys, 1) + require.Equal(t, "standalone-kept", string(keys[0].Key)) + }) + + t.Run("group_discarded_when_value_file_torn", func(t *testing.T) { + t.Parallel() + // Primary + secondaries written; we truncate the value file so the primary's address (the + // one with the largest [offset, offset+len) span) no longer fits. The whole group must drop — + // even though a short secondary at the front of the value would individually fit. + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("standalone-kept"), + Value: []byte("survivor"), + }) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("torn-primary"), + Value: []byte("hellooooo"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("he"), Offset: 0, Length: 2}, + {Key: []byte("oo"), Offset: 7, Length: 2}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + + truncateValueFileBy(t, segmentPath, index, 0, 3) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Len(t, keys, 1) + require.Equal(t, "standalone-kept", string(keys[0].Key)) + }) + + t.Run("group_survives_when_value_file_complete", func(t *testing.T) { + t.Parallel() + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("p"), + Value: []byte("hellooooo"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("he"), Offset: 0, Length: 2}, + {Key: []byte("oo"), Offset: 7, Length: 2}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Len(t, keys, 3) + }) + + t.Run("two_clean_groups_plus_torn_third", func(t *testing.T) { + t.Parallel() + seg, segmentPath, index := newSingleShardSegment(t) + writeNoErr(t, seg, &types.PutRequest{Key: []byte("first"), Value: []byte("v1")}) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("second-primary"), + Value: []byte("hi"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("second-secondary"), Offset: 0, Length: 2}, + }, + }) + writeNoErr(t, seg, &types.PutRequest{ + Key: []byte("third-primary"), + Value: []byte("hi"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("third-secondary"), Offset: 0, Length: 2}, + }, + }) + _, err := seg.Seal(time.Now()) + require.NoError(t, err) + + truncateKeyFileBy(t, segmentPath, index, int(keyRecordSize([]byte("third-secondary")))) + markSegmentUnsealed(t, segmentPath, index) + + keys, _ := reloadSegmentExpectingRecovery(t, segmentPath, index) + require.Len(t, keys, 3) + require.Equal(t, "first", string(keys[0].Key)) + require.Equal(t, "second-primary", string(keys[1].Key)) + require.Equal(t, "second-secondary", string(keys[2].Key)) + }) +} diff --git a/sei-db/db_engine/litt/disktable/segment/segment_version.go b/sei-db/db_engine/litt/disktable/segment/segment_version.go index 0ad9652fb6..a2c406e9bf 100644 --- a/sei-db/db_engine/litt/disktable/segment/segment_version.go +++ b/sei-db/db_engine/litt/disktable/segment/segment_version.go @@ -9,12 +9,21 @@ package segment type SegmentVersion uint32 const ( - // ShardedAddressSegmentVersion is the on-disk format that: - // - Replaces the legacy 8-byte address + separate value size in the key file with the 13-byte sharded - // Address layout (index, offset, shardID, valueSize). The keymap stores the same layout. - // - Drops the per-segment hashing salt from the metadata file. Shards are assigned to values in - // round-robin order at write time, which makes the key->shard mapping unpredictable to outside - // callers without needing a hash function or any randomness in the metadata. + // ShardedAddressSegmentVersion is the current on-disk format. It defines: + // - The 13-byte sharded Address layout in the key file (index, offset, shardID, valueSize). The + // keymap stores the same layout. + // - No per-segment hashing salt in the metadata file; shards are assigned to values in round-robin + // order at write time, which makes the key->shard mapping unpredictable to outside callers + // without needing a hash function or any randomness in the metadata. + // - No per-value length prefix in value files. The length lives only in the Address that points + // at the value, which lets secondary keys alias sub-ranges of a value without duplicating data. + // - Per-record `| kind(u8) | keyLen(u16) | key | address(13) |` layout in the key file. The kind + // byte distinguishes primary keys from secondary keys and marks group boundaries used at + // recovery time to discard torn writes atomically. Key length is capped at 64 KiB. + // + // The constant name predates the value-file and key-file changes; it is retained because no + // instance of this codebase has been deployed to production, so there is no compatibility cost to + // folding the new format into the same version number rather than bumping it. ShardedAddressSegmentVersion SegmentVersion = 3 ) diff --git a/sei-db/db_engine/litt/disktable/segment/value_file.go b/sei-db/db_engine/litt/disktable/segment/value_file.go index a23123193d..5cfd031a76 100644 --- a/sei-db/db_engine/litt/disktable/segment/value_file.go +++ b/sei-db/db_engine/litt/disktable/segment/value_file.go @@ -2,7 +2,6 @@ package segment import ( "bufio" - "encoding/binary" "fmt" "io" "log/slog" @@ -192,12 +191,13 @@ func (v *valueFile) path() string { return path.Join(v.segmentPath.SegmentDirectory(), v.name()) } -// read reads a value from the value file. -func (v *valueFile) read(firstByteIndex uint32) ([]byte, error) { +// read reads a length-byte range from the value file. The length is supplied by the caller (it lives in the +// Address that points at this value) so the value file itself stores no length prefix. +func (v *valueFile) read(firstByteIndex uint32, length uint32) ([]byte, error) { flushedSize := v.flushedSize.Load() - if uint64(firstByteIndex) >= flushedSize { - return nil, fmt.Errorf("index %d is out of bounds (current flushed size is %d)", - firstByteIndex, flushedSize) + if uint64(firstByteIndex)+uint64(length) > flushedSize { + return nil, fmt.Errorf("range [%d, %d) is out of bounds (current flushed size is %d)", + firstByteIndex, uint64(firstByteIndex)+uint64(length), flushedSize) } file, err := os.OpenFile(v.path(), os.O_RDONLY, 0600) //nolint:gosec // path validated by segment manager @@ -212,18 +212,12 @@ func (v *valueFile) read(firstByteIndex uint32) ([]byte, error) { }() _, err = file.Seek(int64(firstByteIndex), 0) - reader := bufio.NewReader(file) - - // Read the length of the value. - var length uint32 - err = binary.Read(reader, binary.BigEndian, &length) if err != nil { - return nil, fmt.Errorf("failed to read value length from value file: %v", err) + return nil, fmt.Errorf("failed to seek value file: %v", err) } - // Read the value itself. value := make([]byte, length) - bytesRead, err := io.ReadFull(reader, value) + bytesRead, err := io.ReadFull(file, value) if err != nil { return nil, fmt.Errorf("failed to read value from value file: %v", err) } @@ -236,6 +230,9 @@ func (v *valueFile) read(firstByteIndex uint32) ([]byte, error) { } // write writes a value to the value file, returning the index of the first byte written. +// +// Values are written without a length prefix; the length is recorded in the Address returned by the +// owning segment, which lets secondary keys point at sub-ranges of a value without duplicating data. func (v *valueFile) write(value []byte) (uint32, error) { if v.writer == nil { return 0, fmt.Errorf("value file is sealed") @@ -249,19 +246,12 @@ func (v *valueFile) write(value []byte) (uint32, error) { firstByteIndex := uint32(v.size) - // First, write the length of the value. - err := binary.Write(v.writer, binary.BigEndian, uint32(len(value))) //nolint:gosec // value length fits uint32 - if err != nil { - return 0, fmt.Errorf("failed to write value length to value file: %v", err) - } - - // Then, write the value itself. - _, err = v.writer.Write(value) + _, err := v.writer.Write(value) if err != nil { return 0, fmt.Errorf("failed to write value to value file: %v", err) } - v.size += uint64(len(value) + 4) //nolint:gosec // value length non-negative + v.size += uint64(len(value)) //nolint:gosec // value length non-negative return firstByteIndex, nil } diff --git a/sei-db/db_engine/litt/disktable/segment/value_file_test.go b/sei-db/db_engine/litt/disktable/segment/value_file_test.go index a2668f9697..ff4872cc63 100644 --- a/sei-db/db_engine/litt/disktable/segment/value_file_test.go +++ b/sei-db/db_engine/litt/disktable/segment/value_file_test.go @@ -9,6 +9,14 @@ import ( "github.com/stretchr/testify/require" ) +// valueLocation pairs the (offset, length) of a written value so callers can later read it back. +// The value file no longer stores a length prefix, so callers must remember the length themselves +// (in production code, the length lives in the key file's Address record). +type valueLocation struct { + offset uint32 + length uint32 +} + func TestWriteThenReadValues(t *testing.T) { t.Parallel() rand := util.NewTestRandom() @@ -22,11 +30,11 @@ func TestWriteThenReadValues(t *testing.T) { expectedFileSize := uint64(0) for i := 0; i < int(valueCount); i++ { values[i] = rand.VariableBytes(1, 100) - expectedFileSize += uint64(len(values[i])) + 4 /* length uint32 */ + expectedFileSize += uint64(len(values[i])) } - // A map from the first byte index of the value to the value itself. - addressMap := make(map[uint32][]byte) + // A map from the location of the value to the value itself. + addressMap := make(map[valueLocation][]byte) segmentPath, err := NewSegmentPath(directory, "", "table") require.NoError(t, err) @@ -38,7 +46,8 @@ func TestWriteThenReadValues(t *testing.T) { for _, value := range values { address, err := file.write(value) require.NoError(t, err) - addressMap[address] = value + loc := valueLocation{offset: address, length: uint32(len(value))} //nolint:gosec // bounded + addressMap[loc] = value // Occasionally flush the file to disk. if rand.BoolWithProbability(0.25) { @@ -50,8 +59,8 @@ func TestWriteThenReadValues(t *testing.T) { if rand.BoolWithProbability(0.1) { err = file.flush() require.NoError(t, err) - for key, val := range addressMap { - readValue, err := file.read(key) + for loc, val := range addressMap { + readValue, err := file.read(loc.offset, loc.length) require.NoError(t, err) require.Equal(t, val, readValue) } @@ -61,8 +70,8 @@ func TestWriteThenReadValues(t *testing.T) { // Seal the file and read all values. err = file.seal() require.NoError(t, err) - for key, val := range addressMap { - readValue, err := file.read(key) + for loc, val := range addressMap { + readValue, err := file.read(loc.offset, loc.length) require.NoError(t, err) require.Equal(t, val, readValue) } @@ -72,13 +81,14 @@ func TestWriteThenReadValues(t *testing.T) { require.NoError(t, err) actualFileSize := uint64(stat.Size()) require.Equal(t, actualFileSize, reportedFileSize) + require.Equal(t, expectedFileSize, reportedFileSize) // Create a new in-memory instance from the on-disk file and verify that it behaves the same. file2, err := loadValueFile(logger, index, shard, []*SegmentPath{segmentPath}) require.NoError(t, err) require.Equal(t, file.size, file2.size) - for key, val := range addressMap { - readValue, err := file2.read(key) + for loc, val := range addressMap { + readValue, err := file2.read(loc.offset, loc.length) require.NoError(t, err) require.Equal(t, val, readValue) } @@ -109,8 +119,8 @@ func TestReadingTruncatedValueFile(t *testing.T) { values[i] = rand.VariableBytes(1, 100) } - // A map from the first byte index of the value to the value itself. - addressMap := make(map[uint32][]byte) + // A map from the location of the value to the value itself. + addressMap := make(map[valueLocation][]byte) segmentPath, err := NewSegmentPath(directory, "", "table") require.NoError(t, err) @@ -119,18 +129,21 @@ func TestReadingTruncatedValueFile(t *testing.T) { file, err := createValueFile(logger, index, shard, segmentPath, false) require.NoError(t, err) - var lastAddress uint32 + var lastLoc valueLocation for _, value := range values { address, err := file.write(value) require.NoError(t, err) - addressMap[address] = value - lastAddress = address + loc := valueLocation{offset: address, length: uint32(len(value))} //nolint:gosec // bounded + addressMap[loc] = value + lastLoc = loc } err = file.seal() require.NoError(t, err) - // Truncate the file. Chop off some bytes from the last value, but do not corrupt the length prefix. + // Truncate the file by chopping off some bytes from the end of the last value. Without the + // length prefix in the file, every byte we cut off is value data, so reads of the last value + // must fail and every other value must still read back correctly. lastValueLength := len(values[valueCount-1]) filePath := file.path() @@ -148,34 +161,12 @@ func TestReadingTruncatedValueFile(t *testing.T) { require.NoError(t, err) // We should be able to read all values except for the last one. - for key, val := range addressMap { - if key == lastAddress { - _, err := file.read(key) - require.Error(t, err) - } else { - readValue, err := file.read(key) - require.NoError(t, err) - require.Equal(t, val, readValue) - } - } - - // Truncate the file. Corrupt the length prefix of the last value. - prefixBytesToRemove := rand.Int32Range(1, 4) - bytes = originalBytes[:len(originalBytes)-int(prefixBytesToRemove)] - - err = os.WriteFile(filePath, bytes, 0644) - require.NoError(t, err) - - file, err = loadValueFile(logger, index, shard, []*SegmentPath{segmentPath}) - require.NoError(t, err) - - // We should be able to read all values except for the last one. - for key, val := range addressMap { - if key == lastAddress { - _, err := file.read(key) + for loc, val := range addressMap { + if loc == lastLoc { + _, err := file.read(loc.offset, loc.length) require.Error(t, err) } else { - readValue, err := file.read(key) + readValue, err := file.read(loc.offset, loc.length) require.NoError(t, err) require.Equal(t, val, readValue) } diff --git a/sei-db/db_engine/litt/memtable/mem_table.go b/sei-db/db_engine/litt/memtable/mem_table.go index ff322d55a4..2b16d8facc 100644 --- a/sei-db/db_engine/litt/memtable/mem_table.go +++ b/sei-db/db_engine/litt/memtable/mem_table.go @@ -92,29 +92,64 @@ func (m *memTable) KeyCount() uint64 { return uint64(len(m.data)) } -func (m *memTable) Put(key []byte, value []byte) error { - stringKey := string(key) - expiration := &expirationRecord{ - creationTime: m.clock(), - key: stringKey, +func (m *memTable) Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error { + // Validate first so a failed validation never leaves a partial insert behind. + if key == nil { + return fmt.Errorf("nil keys are not supported") + } + if value == nil { + return fmt.Errorf("nil values are not supported") + } + seen := make(map[string]struct{}, 1+len(secondaryKeys)) + seen[string(key)] = struct{}{} + for _, sk := range secondaryKeys { + if sk == nil { + return fmt.Errorf("nil secondary key is not supported") + } + if sk.Key == nil { + return fmt.Errorf("nil secondary key bytes are not supported") + } + end := uint64(sk.Offset) + uint64(sk.Length) + if end > uint64(len(value)) { + return fmt.Errorf( + "secondary key range [%d, %d) exceeds value length %d", sk.Offset, end, len(value)) + } + skKey := string(sk.Key) + if _, dup := seen[skKey]; dup { + return fmt.Errorf("duplicate key %x within Put", sk.Key) + } + seen[skKey] = struct{}{} } + stringKey := string(key) + now := m.clock() + m.lock.Lock() defer m.lock.Unlock() - _, ok := m.data[stringKey] - if ok { + if _, ok := m.data[stringKey]; ok { return fmt.Errorf("key %x already exists", key) } + for _, sk := range secondaryKeys { + if _, ok := m.data[string(sk.Key)]; ok { + return fmt.Errorf("secondary key %x already exists", sk.Key) + } + } + m.data[stringKey] = value - m.expirationQueue.Push(expiration) + m.expirationQueue.Push(&expirationRecord{creationTime: now, key: stringKey}) + for _, sk := range secondaryKeys { + skString := string(sk.Key) + m.data[skString] = value[sk.Offset : sk.Offset+sk.Length] + m.expirationQueue.Push(&expirationRecord{creationTime: now, key: skString}) + } return nil } -func (m *memTable) PutBatch(batch []*types.KVPair) error { - for _, kv := range batch { - err := m.Put(kv.Key, kv.Value) +func (m *memTable) PutBatch(batch []*types.PutRequest) error { + for _, req := range batch { + err := m.Put(req.Key, req.Value, req.SecondaryKeys...) if err != nil { return err } diff --git a/sei-db/db_engine/litt/table.go b/sei-db/db_engine/litt/table.go index 58c189b77b..9394831c74 100644 --- a/sei-db/db_engine/litt/table.go +++ b/sei-db/db_engine/litt/table.go @@ -22,25 +22,36 @@ type Table interface { // Note that when this method returns, data written may not be crash durable on disk // (although the write does have atomicity). In order to ensure crash durability, call Flush(). // - // The maximum size of the key is 2^32 bytes. The maximum size of the value is 2^32 bytes. - // This database has been optimized under the assumption that values are generally much larger than keys. - // This affects performance, but not correctness. + // Optional secondary keys may be supplied; each secondary key acts as an additional alias for a + // sub-range of the value (or the whole value, when Offset=0 and Length=len(value)). Secondary + // keys are first-class keys: they appear in KeyCount(), Get(), Exists(), and are subject to the + // same TTL as the primary. They share the value's bytes on disk, so they cost one keymap entry + // each and do not duplicate value bytes. Secondary keys must be globally unique just like + // primary keys, and must not collide with the primary key or other secondaries. + // + // The maximum size of a key (primary or secondary) is 64 KiB (2^16 - 1 bytes). The maximum size + // of the value is 2^32 bytes. This database has been optimized under the assumption that values + // are generally much larger than keys. This affects performance, but not correctness. // // It is not safe to modify the byte slices passed to this function after the call - // (both the key and the value). - Put(key []byte, value []byte) error + // (the key bytes, the value bytes, and every secondary key's bytes). + Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error // PutBatch stores multiple values in the database. Similar to Put, but allows for multiple values to be written // at once. This may improve performance, but it otherwise has identical properties to a sequence of Put calls // (i.e. this method does not atomically write the entire batch). // - // The maximum size of a key is 2^32 bytes. The maximum size of a value is 2^32 bytes. - // This database has been optimized under the assumption that values are generally much larger than keys. - // This affects performance, but not correctness. + // Each PutRequest may include zero or more secondary keys (see Put for semantics). Validation + // is per-request: a request with any invalid secondary keys is rejected without applying any + // part of that request, but other requests in the batch may still be applied. + // + // The maximum size of a key (primary or secondary) is 64 KiB (2^16 - 1 bytes). The maximum size + // of a value is 2^32 bytes. This database has been optimized under the assumption that values + // are generally much larger than keys. This affects performance, but not correctness. // // It is not safe to modify the byte slices passed to this function after the call - // (including the key byte slices and the value byte slices). - PutBatch(batch []*types.KVPair) error + // (including the key byte slices, the value byte slices, and every secondary key's bytes). + PutBatch(batch []*types.PutRequest) error // Get retrieves a value from the database. The returned boolean indicates whether the key exists in the database // (returns false if the key does not exist). If an error is returned, the value of the other returned values are diff --git a/sei-db/db_engine/litt/test/db_test.go b/sei-db/db_engine/litt/test/db_test.go index 70be8a5089..f48652b9c0 100644 --- a/sei-db/db_engine/litt/test/db_test.go +++ b/sei-db/db_engine/litt/test/db_test.go @@ -154,11 +154,11 @@ func randomDBOperationsTest(t *testing.T, builder *dbBuilder) { require.NoError(t, err) expectedValues[tableName][string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[tableName][string(key)] = value } err = table.PutBatch(batch) @@ -284,11 +284,11 @@ func dbRestartTest(t *testing.T, builder *dbBuilder) { require.NoError(t, err) expectedValues[tableName][string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[tableName][string(key)] = value } err = table.PutBatch(batch) diff --git a/sei-db/db_engine/litt/test/keymap_migration_test.go b/sei-db/db_engine/litt/test/keymap_migration_test.go index 4829b3ea96..afd839ed7f 100644 --- a/sei-db/db_engine/litt/test/keymap_migration_test.go +++ b/sei-db/db_engine/litt/test/keymap_migration_test.go @@ -57,11 +57,11 @@ func TestKeymapMigration(t *testing.T) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -206,11 +206,11 @@ func TestFailedKeymapMigration(t *testing.T) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) diff --git a/sei-db/db_engine/litt/test/migration_data.go b/sei-db/db_engine/litt/test/migration_data.go index 321f3f9664..2d10745e0b 100644 --- a/sei-db/db_engine/litt/test/migration_data.go +++ b/sei-db/db_engine/litt/test/migration_data.go @@ -1,7 +1,71 @@ package test -// This map is used for migration tests. This data is written to a table at the old version, and used to verify that -// the data after migration is the same as the data before migration. +import "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/types" + +// migrationPuts is the canonical input written to the migration-test fixture. It mirrors what real +// callers do: a sequence of Puts, some of which include secondary keys. Three primaries near the +// end carry secondaries that exercise every KeyKind path: +// +// - "kindStandalone-primary" is a 0-secondary Put (covered by every other entry too, but called +// out by name for readability). +// - "kindPrimary-with-one-secondary" carries exactly one secondary, exercising +// KeyKindPrimary + KeyKindFinalSecondary. +// - "kindPrimary-with-three-secondaries" carries three secondaries (a mix of strict sub-range +// and alias-the-whole-value), exercising KeyKindPrimary + 2× KeyKindSecondary + +// KeyKindFinalSecondary. +// +// Cross-version migration verifies that every primary AND every secondary survives the round +// trip through whatever the current on-disk format happens to be. +var migrationPuts = func() []*types.PutRequest { + out := make([]*types.PutRequest, 0, len(migrationData)+3) + for key, value := range migrationData { + out = append(out, &types.PutRequest{Key: []byte(key), Value: []byte(value)}) + } + + out = append(out, + &types.PutRequest{ + Key: []byte("kindStandalone-primary"), + Value: []byte("standalone"), + }, + &types.PutRequest{ + Key: []byte("kindPrimary-with-one-secondary"), + Value: []byte("hello world"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("kindFinal-only-secondary"), Offset: 0, Length: 5}, // "hello" + }, + }, + &types.PutRequest{ + Key: []byte("kindPrimary-with-three-secondaries"), + Value: []byte("the quick brown fox jumps over the lazy dog"), + SecondaryKeys: []*types.SecondaryKey{ + {Key: []byte("kindMid-quick"), Offset: 4, Length: 5}, // "quick" + {Key: []byte("kindMid-brown"), Offset: 10, Length: 5}, // "brown" + {Key: []byte("kindFinal-alias-whole"), Offset: 0, Length: 43 /* len of the value */}, // alias the whole value + }, + }, + ) + + return out +}() + +// expectedMigrationKVs flattens migrationPuts into a single map from key bytes -> expected bytes. +// Every primary key maps to its full value; every secondary key maps to the sub-range of its +// parent's value bytes that it points at. +var expectedMigrationKVs = func() map[string]string { + out := make(map[string]string, len(migrationPuts)) + for _, p := range migrationPuts { + out[string(p.Key)] = string(p.Value) + for _, sk := range p.SecondaryKeys { + out[string(sk.Key)] = string(p.Value[sk.Offset : sk.Offset+sk.Length]) + } + } + return out +}() + +// migrationData is the original key->value fixture from v3 and earlier. Newer fixtures extend it +// via migrationPuts above; keeping migrationData as a standalone map preserves the historical +// payload (so generated v3 data remains byte-for-byte identical were one to regenerate it under +// the old code). var migrationData = map[string]string{ "S7MOxfceWW": "oSNhtpEtRb48ntgPkhL", "uQxQ25apaahwztuOzNi": "Tn2MgaTP5B", diff --git a/sei-db/db_engine/litt/test/migration_test.go b/sei-db/db_engine/litt/test/migration_test.go index 3648155996..0199533589 100644 --- a/sei-db/db_engine/litt/test/migration_test.go +++ b/sei-db/db_engine/litt/test/migration_test.go @@ -49,13 +49,13 @@ func TestGenerateData(t *testing.T) { table, err := db.GetTable("test") require.NoError(t, err) - for key, value := range migrationData { - err = table.Put([]byte(key), []byte(value)) + for _, p := range migrationPuts { + err = table.Put(p.Key, p.Value, p.SecondaryKeys...) require.NoError(t, err) } // verify the data in the table - for key, value := range migrationData { + for key, value := range expectedMigrationKVs { v, exists, err := table.Get([]byte(key)) require.NoError(t, err) require.True(t, exists) @@ -129,11 +129,11 @@ func testMigration(t *testing.T, migrationPath string) { table, err := db.GetTable("test") require.NoError(t, err) - // Verify the data in the table matches our expected data - for key, value := range migrationData { + // Verify the data in the table matches our expected data (including secondary keys). + for key, value := range expectedMigrationKVs { v, exists, err := table.Get([]byte(key)) require.NoError(t, err) - require.True(t, exists) + require.True(t, exists, "key %q missing after migration", key) require.Equal(t, value, string(v)) } @@ -158,7 +158,7 @@ func testMigration(t *testing.T, migrationPath string) { } // Verify the original data. - for key, value := range migrationData { + for key, value := range expectedMigrationKVs { v, exists, err := table.Get([]byte(key)) require.NoError(t, err, "Error reading migration data") require.True(t, exists, "Migration data doesn't exist") @@ -177,7 +177,7 @@ func testMigration(t *testing.T, migrationPath string) { require.NoError(t, err, "Failed to get table after reopening") // Verify original migration data is still intact - for key, value := range migrationData { + for key, value := range expectedMigrationKVs { v, exists, err := table.Get([]byte(key)) require.NoError(t, err, "Error reading migration data after reopen") require.True(t, exists, "Migration data doesn't exist after reopen") diff --git a/sei-db/db_engine/litt/test/table_test.go b/sei-db/db_engine/litt/test/table_test.go index de4e8ed0c9..0fc208159e 100644 --- a/sei-db/db_engine/litt/test/table_test.go +++ b/sei-db/db_engine/litt/test/table_test.go @@ -290,11 +290,11 @@ func randomTableOperationsTest(t *testing.T, tableBuilder *tableBuilder) { require.NoError(t, err) expectedValues[string(key)] = value } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value } err = table.PutBatch(batch) @@ -407,11 +407,11 @@ func garbageCollectionTest(t *testing.T, tableBuilder *tableBuilder) { expectedValues[string(key)] = value creationTimes[string(key)] = newTime } else { - batch := make([]*types.KVPair, 0, batchSize) + batch := make([]*types.PutRequest, 0, batchSize) for j := int32(0); j < batchSize; j++ { key := rand.PrintableVariableBytes(32, 64) value := rand.PrintableVariableBytes(1, 128) - batch = append(batch, &types.KVPair{Key: key, Value: value}) + batch = append(batch, &types.PutRequest{Key: key, Value: value}) expectedValues[string(key)] = value creationTimes[string(key)] = newTime } @@ -549,3 +549,114 @@ func TestInvalidTableName(t *testing.T) { require.Error(t, err) require.Nil(t, table) } + +// secondaryKeyBasicsTest runs against every table implementation registered in tableBuilders. It +// verifies that secondary keys behave like first-class keys at the Table interface: Put accepts +// them, Get returns the correct sub-range bytes both before and after Flush, Exists reports them +// as present, and KeyCount counts them. +func secondaryKeyBasicsTest(t *testing.T, tb *tableBuilder) { + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + table, err := tb.builder(time.Now, tableName, directory) + require.NoError(t, err) + + value := []byte("the quick brown fox") + primary := []byte("primary") + sk1 := &types.SecondaryKey{Key: []byte("quick"), Offset: 4, Length: 5} + sk2 := &types.SecondaryKey{Key: []byte("alias"), Offset: 0, Length: uint32(len(value))} + + require.NoError(t, table.Put(primary, value, sk1, sk2)) + + verify := func(stage string) { + t.Helper() + got, ok, err := table.Get(primary) + require.NoError(t, err, stage) + require.True(t, ok, stage) + require.Equal(t, value, got, stage) + + ok, err = table.Exists(sk1.Key) + require.NoError(t, err, stage) + require.True(t, ok, stage) + got, ok, err = table.Get(sk1.Key) + require.NoError(t, err, stage) + require.True(t, ok, stage) + require.Equal(t, value[sk1.Offset:sk1.Offset+sk1.Length], got, stage) + + got, ok, err = table.Get(sk2.Key) + require.NoError(t, err, stage) + require.True(t, ok, stage) + require.Equal(t, value, got, stage) + + require.EqualValues(t, 3, table.KeyCount(), stage) + } + + verify("before flush") + require.NoError(t, table.Flush()) + verify("after flush") + + require.NoError(t, table.Destroy()) +} + +func TestSecondaryKeyBasics(t *testing.T) { + t.Parallel() + for _, tb := range tableBuilders { + tb := tb + t.Run(tb.name, func(t *testing.T) { + t.Parallel() + secondaryKeyBasicsTest(t, tb) + }) + } +} + +// secondaryKeyCachedWriteHotTest verifies that immediately after Put, both the primary and every +// secondary key are hot in the cached table's write cache (CacheAwareGet with +// onlyReadFromCache=true returns the bytes without touching disk). Skips non-cached +// implementations since CacheAwareGet on those is functionally identical to Get. +func secondaryKeyCachedWriteHotTest(t *testing.T, tb *tableBuilder) { + rand := util.NewTestRandom() + directory := t.TempDir() + tableName := rand.String(8) + table, err := tb.builder(time.Now, tableName, directory) + require.NoError(t, err) + + value := []byte("hello world") + require.NoError(t, table.Put([]byte("primary"), value, + &types.SecondaryKey{Key: []byte("hello"), Offset: 0, Length: 5}, + &types.SecondaryKey{Key: []byte("world"), Offset: 6, Length: 5}, + )) + + for _, kv := range []struct { + key []byte + expected []byte + }{ + {[]byte("primary"), value}, + {[]byte("hello"), []byte("hello")}, + {[]byte("world"), []byte("world")}, + } { + got, ok, hot, err := table.CacheAwareGet(kv.key, true) + require.NoError(t, err, "key=%s", kv.key) + require.True(t, ok, "key=%s", kv.key) + require.True(t, hot, "key=%s expected to be in write cache", kv.key) + require.Equal(t, kv.expected, got, "key=%s", kv.key) + } + + require.NoError(t, table.Destroy()) +} + +func TestSecondaryKeyCachedWriteHot(t *testing.T) { + t.Parallel() + // Cached variants only: the non-cached builders treat CacheAwareGet(_, true) as "miss". + cachedBuilders := []*tableBuilder{ + {"cached memtable", buildCachedMemTable}, + {"cached mem keymap disk table", buildCachedMemKeyDiskTable}, + {"cached pebbledb keymap disk table", buildCachedPebbleDBKeyDiskTable}, + } + for _, tb := range cachedBuilders { + tb := tb + t.Run(tb.name, func(t *testing.T) { + t.Parallel() + secondaryKeyCachedWriteHotTest(t, tb) + }) + } +} diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/keymap/data/000002.log b/sei-db/db_engine/litt/test/testdata/v3/test/keymap/data/000002.log index fc9ba6bff3c1636605b23898d9bbfa92f343445c..f51d65d12dc55c51bd29edafaf1b401593c3a463 100644 GIT binary patch literal 3387 zcmZ`*PqV7n5r1c`L=jN- z+2-5i>*O0uR@o%0oIYNBi&Re4ISYQ>{p-`GyXhbQ{*V8(KY#z(yLaz$c*FXDH!98r z*S;lGjpA$lPLTj_M*%%w>o@T@51WaS>>bC^j}HNag^K7U^fzg>HWQA%7996AZ7hx- zBEVHar8Pa^&qHZ_o?a}MVjuA57%rjwp00M6!A9+~qYNs510_`03>#BhrA4PDJg_1V zW6yM-U#pw`MA~hJO?}hcXUY|zEue}&^>nSxvgZbO+IX~di!+H5Fc(pNdo!l7uvnVX z!zl>KyJz&Q^tOJ79Zwi*tm~I$qGq~Z0DT^nz15|C;OmN2O;(mPJ7@S;_e($P4{!Ij*R7>8Vy=wl#HvVVa2I%=P+m_t!J>Py?N2g@1J)cWw_#<&&-2UN~y!?q?SP=;H#j* zu-o!n!}bEP*Uc~;xC%f1^KZ3(e@1HQ*I);)FYrb=x;`0hnb2(Mpa6Er!p0(t+OZoW zIn}PjJA%PcM1{_^+MXzkz;vd&haM1N|Kx4E<~3^zzfY2YsKKRhJSZ*pI4$nngtpu7 zDddd-uiy=Jx1wre#)~@CYSOywAL^ulo=}d_`CDL%k?~M`z6Y`pF6o6#2c{OyNZEnW z+&z*7YDzDgAtxo=h*TV4-vD75y)c1yG+AwINVni>i4@Zz;RcZpa1_zAVR}L`rkzx6 z>`Zf}J?{H8a6>m(buWwB%uHM*3&jVZE}_z#;#-oe*ikiF)i^1G8H{4nx#hJz*OW$j z6EozuKw3gC?wOmYVl7&#k|kb~4QB?K$oF^JT)1PSn$F$4G(*09W9@n@;~FzoHjXm)eL8W8{>Y;awA&mFSO9*)diQ~e*j|KgF5s6X%aS; zBmWW37BJ%)9c#UaYWrKm*{=Amhi~)-#Wm~4Q`eaaG0iYBX%V1o;QcQD zuRs3vC%gOqz8<=%8-^+i{-=BAw8_O2XRVzBM_36N4Y60E7z@IX{0F<1nj<` z;&v^?N7m)0Q%@r;mw<2-P;t#$3voS58#GIq;4J~c+Z@U@I<+Lh^*<9bB##07`cIU< zXC;nr4;;0|IvM%`jL*2KD1-6dxbbUdQMGsCEW-<6{0SA9p`op#H1?)4d3*T$Rtrx~ zzpI;0xTZ`-vPqK!^cQ$zkBWw!9#TJID1OE~Xh{QqL3zKyq+X}B8(zq>6FoS}sNlr? zbGLJ`&&^Ymc|%Gdj|%iWaTD*lIc|@noG)z1EpIv-WUJekK5sB*XzC8=ev~qG8rZ5msa{U!YPfCrnYs+rIov5$t<_dz>(i7wb=6x2 z_rOL0l~Y+%w|%QRGnqqOlt@hy*MPl*CnDMF@|4o=uO_^gDGxA)xnH93bxhUuK-cXg zy+lendOWUnr>{tIHx(BAqAxXX$8?d6{&@=U9mS!s~bFddoV;+0TslmGduWg$~CH_ z*%AyKE#oJeFX9u`Y1O;?fzl^oBrte`v+WkhL7T2K+t!u1LNGXRUC_aPKT6gIVW*I; z0-wp)_JYcdD^m{^R!24Vd~32HeF^X~(BiS~DonfGCEpou7vv`nSROd>;C1RW1(_WV z7jmsdZ8X+^_!CZY;AUPG;`xf&n}%T7$yAuBWcUE|YrJ7JbmRqlsdgm64w@Qwp-E3C z0^mReZ`RS+rhLi#t79pAv*NaSW3G{0XHUZT@!V~= zb`e!DrNs3dC0th1RJ!g?*G&B*a2L_f_Kt5(FS>8)-qE*SgR^tc&!m394GNAN`U<4f zJM@}37yA`tL(4c~y-(6TER3`0=O8UOzV2Z$@AiM>-lJdrk5$|Ptxt9x>yg@IK>H52 z5Nug?%-2)o7^IDYe-pUu`%mEJd`uphy}W}z{p6GGIePe-*hkOGf^jzw-eW^x^1qM= BOCJCL literal 3141 zcmX|ENzb#$5&n9Txp;ZKytAAVO_EWP(L@;=u)+E!N^Aqh7z|!8;7erVh0QBA*ho3# zl1r4s{F3~X{DWLZ`5)WBDg&t*ZQ9Jzh6X@xOz352Dps3@Fw8^@Mc_rWlSfTM)U(?*_vjCd=13!sKzP@eQ9w~JolKH&=r$R;8!p1au6)+Xsz|GL-%^fz&`?U0lj%r zf3@+VCADD`H;=}5z*<0MY9a1y_Gy|eM~E`jkFOn9Q&rY?(zY?*Fo%|`@EJ}4Zv|DB z^ia5tl;w4H+xirn!QTUO1(kP8B|fn}KbwW7o`HV?Vthu9n;z2RouPy|<7MFJwc~30 z!L-hcZBKUsp)*~tGhBEPS1B_Nc{A=k#jY<6v{r^|z+C_M_dopb@s)5SxbO{6eDrGA z7bs1V?5Plf4$zVDhO_Y;1mk3R5cgU$$6o#v1G>?-&TCy_$2s(J<=wyXQ(bP}EnV5l zk7Uhn&bi1d4s}KCr{Y3bNDB8nrHh2NG6`@CpF*{x4JU!~6qd}Q;qK*mhJOkCZ>Y99 z3QK)0O_lw6#I4p`kO{nl7bWx_9oK22ITuey^Lz@L=0u+4A|xiL7})*WqB-9rzLr5> zVP$+!b>ngC-&d3u39DuI-WSw70N=a<@XnmW(}p!Cj`BP|wlc-^n9Rs;z=blZ&$s?) z%B@bfSvc>RET`TwiEmyWl-8s6%++bx>xg+6X>b9j!{!-zoXbZepA*GQ@h`w$L2PH|Mx>+{o|EI4Uom-b39Q|-F2HG*IXPW(geU0G76}~Pb_|8 z$g%4ABXERZ#2P9wBgb6c(>R=|q+t;ZmKv(K%Z{aunFnKfu`biOD(nFtzo#zPTcf9l z?wK?B8=YnXq;1tv>At1*?Al2#&BrXmz5rg_SGd%V26RFnNC^Vr=dje#`+f45P_5N@ zwQ`g6cGuD|DMW$-Gv1IFOV@ty=3cqWJv00@hVZGpQOAH)c%eJ=^d_5Y{tjqss3630 zXx;aW;%9o_f?=E}gKeudW;uU2NJA!gnq88r0Dc9mRaA2*o@<>heQzelG{eNC;ei0j zmLhuh8)9tORIqWxt((lmxlXPJ5cwYXaYCt5GU%;rb$Zhmq}~V!OBoYvLNS@pUaGhE zo6X$ZXFf9EK?N0ihIE-mZEJiar3Pv}JaN>jp?&KfgqF5#B=%OBUo%zZbwv*NBW+fyKhyZw8M74=I zxKrKZ&hd7((BJw(ZUNlGSM9OQq+zeKAKgeah%PKuRI#b%gbD9G%AR&^0VxQA0QnoL z2peUp>Au(quajxZ&J0(8xqvEOb9&JOiLyphpGio(K>IV`eM80F)0xHM!rCb0ZXpmX zHB?-sytt(Yv3s0mMt_YF(oADIu*dGxyqwq8$R~F^X#E06u~o$}R6nNp@R$?+76@_D zMtk<&T=CD4R2V3c-c><`@$~u-uEVKH`W_&pDb!G@_h@vd8Y42j8A&6+z5}Wnsz&l{ z)N_N2C36#BUy*wZ@Q=V(N9C>L?zP2{wU)h@TpIwA>$8YTkwq_5`n)mkbJ8vWBL|Ng zhCvA(Mb+GU<8C*3B?|!$yk&gza!=-QS6l2woa9bTfV7O>M@MPt@;W1I)5F-2+R2JE zTrfX^1)J}OEaN+KR&`E$?M~ViAWJ+y)W7=gkIr9Tc}PCsZ{$5!>(k52MPr=pF`l@F zbtTvG&p`(UZc}o@n;(u{%6A&XD}sT`RaCI{og2GrJUYY~f`Gs8s7hUiuI^47x68E= z?K(%;EWyF00@i$V4O45UPdP)rzyPk|Lc7;Y5`5V6+$MR;RZ#8Gddepw;u0+hJkML5 zGSx3%L8^{yrRA}Q;iEn5v^H8J!{2}uZ-tuC78mhK!jDN!fJ5H^Z55TCXZ75rTgi{XdJSGjhb6qzLFS3mWEt;8S4dak%OJTc^m-Dnuv(zR7C7{Kxc${DgKk&53 oYNV<1*k2x*1bNAoQN`y5Lv3mF&(7%C@OUJ3dlDw%XZFBvG6d>%W=#xFL4Ym%y2CUttcE za106d3k;01OfC#?Ei?)X3koR>OHT@RaVyD7G)*b&FO zPxEuG2+oPBFfj=Y$ac4IHZVzxEH}3-4e-eIGYcqi3(fZ_bS+6Y4fF{}GYay1F|!GK+4K0%9D-rOiGFhja@*>l0rR_!u<+@^1R$~gG{SZ(tRTV4l*0K diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/0-2.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/0-2.values index dfaf4da22736f13a01e4a41318a8a57b81cf4fda..f7c862584d1590cb449c5e0cb0f19ad65ae3b5b1 100644 GIT binary patch literal 109 zcmV~$(GG$j6adg)a&Wb0lNqJBbb-vbfGI<#1~n&N-#KQk=c-&(6FNH;!!7KupXhbV z?!3SU$`q_La&C_2?Pd4Stb_-xE30Pg#`fx l_)Q{0Lkd%h^8y1(JdK>PK|;ZPam@Q!1&Pa!FMVvM5+NDT@k>C`edt;HYdOi_?gIeY&Tbs!&vQ zSAD<7_xd#e^ej0Jo0_I*iCQhde@_>H;eFC5SF7S{y-uPi@~9$*HLo;p$C&0?`~_|L0CW;&EI4_vU2z+NkoW_219fVK zTSkdya?5&Hj63r)$=e1iLpuj5?+1KBibq(B0;9}KeS6Mv5Pp9{ZwIcK< znflxF+`S^{^g)?kT}ERqk`QnM19SojDyj2+ zX0@1*Kkd0gZ|3M5_RiW7+$v04h}UN{TK^%+M`j7b%pgd_Ry2~q3^Iwk2^+h}14oS) zgqzrjOSOab8bex4Gn1edmZIx?+8E_sD%Zv2xRmKKnb{9LEhrug%-y93GN`!M%wK(qENL7pyGopvbyA1fnf@O!1C10?CiKO5!N6 zwx^lN*n83pNb-*()YRG^JZTI!W%L7SJ_GgiJndRELU?3MDIG$Ey@RSDr=@v z_ytS>4%P8KGtGscBq!NpV+-KRlN5SKeA2vmTxAvHD;b9-}iUv@cCDE|rW!1-ijHwPuxT(|I-l>UPH|Hq=VFZ<_{gz!qr sD4nyL3wLKpuGYKhGTP(GV;E8aX66F&n?m!OX{SHr#>|nazybHGfAceBn*aa+ diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/0.metadata b/sei-db/db_engine/litt/test/testdata/v3/test/segments/0.metadata index f6b48c068d987f1a788c8a6a6c80094cc3e8e759..62e1a1ad6648e3de4a6e763c76de3cde76e75abc 100644 GIT binary patch literal 18 ZcmZQzU|?pE*eG!5V~zL*1_lOcMgSv_1M>g? literal 18 ZcmZQzU|?pE*s%NHTC-Ig3=9k+i~uG^1NHy_ diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/1-0.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/1-0.values index d980957c5c8a6f376b77a43ed0de12283646a82c..c33a5234f85456af6b01012eff5904dd47ba0b75 100644 GIT binary patch literal 110 zcmV~$(GG$j006*W3PZt1mni0#v|)v>Or|&xM6LDp-PLqT3_Ss#h2Jq%b{48W(j=_TX$EaeGRnl^^$e02*Ve&JpRsCc!!3dAa#OKCekhUP^vZm|%R8apNWXe$lksneTTMR`h0(wc@91Ud0BiXGS4{K{2h tSme-cl$lZJnd3c}KBZiJPehJIjw*JhdExLb`&0}VGACMTJ=m|w!4IdrAeaCE literal 101 zcmZQzU=S`aa!t$*a5gM-t2FeQ08M>G_ zRT>9mrzMw!Ia(CBIs%2c%Z=Q!3(9gV0~|epTp_c70=Lk7pF-D?bkjhefHd?=S literal 74 zcmZQzVBmN53dlDw%XZFBvG6d>%VA((;0Y-W3r`O8^$c}3E(h{?({jqqJUuc(GcEjb aqksZJ?jD(WQ4wjr;qGZsg=s~up;-W{SrhmG diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/1-3.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/1-3.values index 1ccc5fd0d245fcd11e7fe3328e614f6a82d58ac7..be04be4fbe797100796dd99770b3dc39278c88f9 100644 GIT binary patch literal 93 zcmXSPb3h!r=F_EWfy1DMW>67KoDJY0R)e75%I#MI|N0LDi$N?uWxcDFayct z`Cgvn0{~irxH1l~j3*%g;1jt&u2%p$c+V%&E~>6vy(Wtvj6PlnQ|h6-vf&ge>!e|y z$)pcJ+d9ZxKIYu>Y}TMJWYPkl$9wl2d3WV8m9yEqEiz7Q0FXyF;5i58SmA@!_d8j~E@{Jv}AI15!a)k7`93^qQ@tX6#QFoV|_Y|l%m)pcuW;a=? zCaAmRwTWrjM?AyuQJMf|dMY z+E$rJmra!plE-eig3DTypaSY#E{8;DVX018Io*Jp+rsCv^-?;HAc zV;DAA>M1W$hAgjbF~uHYRTib^!7ld4bh!ITE2O5=G*i1-Q$}_*n6VkU7t7Bji*b7hA zeC3gDbd3$(Ms7ehyI?kgj|ojgqH`{$x5I(w%w1T%tp Hd;R_a6e}W< literal 103 zcmZQzVBjirb;( A(*OVf literal 98 zcmZQzU=Z^5vdr-H8aEU0o%buu%LbS%oQwD8SObI&ew&2aNeHpzA|G7aHPfl_wGL0xtbV;vFDXlWkamzGxigb4iPV+SmNDe71^!H0ED-Cum qDG7}zNea%*iVO%ci3klTOexL_3@Gt5a>_Oiw{$NpcQ=mkFaQ8S`5qJi literal 85 zcmZQzVBiZW4NFf7c5y4oOEgU>V_;z53du9_O-~F7Fm(d*1Uy6X-NJke%M*>t^8?e0 jGk{|JIYmxJ{s9#c1!azw?yf~3O$Od3L7oMHNlu{vv^5rp diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/2.keys b/sei-db/db_engine/litt/test/testdata/v3/test/segments/2.keys index 4afd055d868951117f148720ff02e8cac63c4a5d..40c97fd82b0e8884d387afc31fe9e12f163b7dd7 100644 GIT binary patch literal 859 zcmXw&OS76V6oAuq=exRVw|BbeqN9K!)47X+2=cg~2r4^#AWua=(fs-*cLDD;v!$ zE7_Pe#4}n5y5h-E+%=rgEOo5mH}uj#_k>*?LkaK}AgM50s6ZR7uQ9>=+ zuA(Y=?yvq;ChQH3*w>s&T+ZZdpt=P?EVki-Z_ixG`okwz7w0#Y;ezXOCGp-)UzMh_ zC1Jm?t&LLLQ!i;dPcm7#FRM)OJTl=A8u5ho@l}M=U{H0fqH9Ib9sXD}?%F$U=Wp1t1!p(Gd=V^RHQGo$s4J!~pN8urla3W_M)*6nqaD+>^+XUU zLzZ_dF(D8)AG-(l<#S1mz0C813YK>Re$>s~5Pe_M1vqy^N7kmVD|9NluB=rjf0Bc< d;au3Q%Re$=&`6UTQy3KzUD0WM0`M^b{r?A3kL&;d literal 655 zcmXw0$#R=O6eRIY*s>*8a?T-DKI9@cL#lEC140NSFd#rT1z}bxT38gmKJ&;hRXs~} z&vd^x1VNDIhe(wIwK-}UK@gu2`Y}TDy=({a-qRa1%a`cflb_IL3M9LX{B7MerDHAz z5)>1J$8`HMSg+U{>t~InGXsBzHe=9a=7z6klA)$2!Ivwh!wMk(KVQdm!vBuiGj($? z`4zk$5Jp0mU*}BA94t(I0cVKyT-!IB=60vuE^WK2UU#buYo_4d*OfKefI8odnagf((>0@oifL8|FI=C~CvKj*j1Q5%M3n-(jeL%scTK2sYnq>Fu|#%b%QuH|9PFUW9@@L+1~ zT-t@^du45cfgNH@qJ62hHj{PMYqH!*1msFWmt&^ zg1bM?se2+2sz$;nqJ&o9bJ0RUZl8y5fo literal 112 zcmZQzVBqnsvMe+W39Sf7%g$h6VBjW~zsLHh=RRr{sjNpttuV;!=t`zH0WJ9H_3Yp=Q0$?m UFSL{Uj6ky;;NNe(E?}tn0qEr_fdBvi literal 85 zcmZQzVBjt>u5vYSH;V`hb7x>+;0i0q^-niVF^fzD^7zss!d$Y_OgxNz90Q{Afg*x_ kCC+|nEy^zTDmC#ostC#i3Yq%)C7GM17(1C3x};f# dq#IcHdzAZE7A1uhd8Q^Bl=&0~=J^!(007dd8G!%* literal 98 zcmZQzU=T1ebBoLk$jXi^HZ{!4u(V)cVBiZ)Ow4yKGstr^b197o1d0fnl|^NkIa!z( x`1)CdTezlF00o7-{WAQ@{VJ1!(+dnTvWgQcGk~JJ8HphoC7y2Xxq(KBh5*TU8a)62 diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/3-3.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/3-3.values index d41f6cfb174ad9d222251b87b8b6d7d50c6d1cc6..3764b786bf60453728ea74d946eb1f469c7a179d 100644 GIT binary patch literal 60 zcmZ=REzZtQ4)n-zOO7mY53+D{jWCZ24of!m&Mplr$n{S*O)-m1jHq%-4)6_)sw}E> PDlSRPOG(Vh&r1aWf1wqW literal 96 zcmZQzU=VOD%C5BV%};aBE_2Or^Gjx6U=Vc7F)wipF3fN(39TqEv+%b73i6kmM3njz u7#g{!n-zrTw(kw&L4J`aU%Ka;glEMJoB^n0+ diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/3.keys b/sei-db/db_engine/litt/test/testdata/v3/test/segments/3.keys index 8f85e07aa314335733a4a5aa9ec4814c98491720..fdaba553fa0fb8f9b9af9e9d10cab056f61856cb 100644 GIT binary patch literal 855 zcmZva-H)0;6vf9i`%>3+*UdJ4XxgL?ec6y8%7QOVL>WI(R0Q=60c8Xc6&LWY?~FTi zo3sQn6MpC1b1w`4MNWt7g^(HsZ^kfds9*eYH(BQTfyTLE7EQdD_0F>;R z{&wJ`I^XoVqfU_F8@wrCYZ@wHqsrIT^+e4W@!w-s4LxrdFL!V=s!)d00C>WcUy~Xi z>W-vZo?M@-R+K-G*b??8aN2CnWof+DCCUmU^AX<%NrZ~f@9RpX&aDi=qJn!G{Iq4^ z+ibykwgnwGo}g~M zw(0K!hY62pXC70k;;tVd2fh)N`i__wMJ_|MC&$`j-yd}wyFeiiz7RD;f$3wJpZ;HW zn*%@osT#RQ=c6uivZFmD^Ss=EB5^3lW)5d$d9=w1fwzCb)o~wZCwqQ{dGP$vlj$S# O#Ki92cW8z8^1lJGu%h_@ literal 646 zcmXw$O>?3^5Qf>^TVjIA?lqSjQkByxjv>_M5(Fdx1jeY~jUXQaA)XTIZF>YK z3uuP-XLWj2U`{?>AJ9ZMFHgXU}It4}9SA;)0WxP literal 18 ZcmZQzU|?pE*s%NHIw6M#3=9mSi~uKF1VR7+ diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/4-0.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/4-0.values index d322dab4cba3eafc69905fdc6a43ca1c6882e88a..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 GIT binary patch literal 0 HcmV?d00001 literal 96 zcmZQzU=S+GH}}m64RP@*F|KrWH}SNrU|?Y2Ps&M-C@%@m4Rf-H$gU~_3JGMpgqE3E vc;s3bn7g^UIA;OH_<}4PT_en+g2R$cy|YV!BE0!|C1u`buC6(L6`>^nz|k4f diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/4-1.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/4-1.values index d8978de1d685987fa9ab38150b0e6febe0aa1087..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 GIT binary patch literal 0 HcmV?d00001 literal 86 zcmZQzU=Yan%=LFnt+Grp@-_C!Pw{79VBj|~vM9B5b~USt$_dO5cLxgbIhPh^=O+hx k6!T+CPl@m=^)ibo~eljWj@7$c|HX`0ImWT6951J diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/4-3.values b/sei-db/db_engine/litt/test/testdata/v3/test/segments/4-3.values index e87dd639fe933d5098486150d4c0572a2bfc8e9b..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 GIT binary patch literal 0 HcmV?d00001 literal 89 zcmZQzVBj({&&V=zN%1SPU|?VnGB?RfNsr35tjzVT$jCD&E+_+v3T9OVIr#K*01Pn~ng9R* diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/4.keys b/sei-db/db_engine/litt/test/testdata/v3/test/segments/4.keys index 6b58409a6c6a54bf704d903b8b8ab2d66c108aaf..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 GIT binary patch literal 0 HcmV?d00001 literal 615 zcmXw$+j62n5Qf=guZ)Q?+1h=9R3&>if*j^zF9gv5BD!%9xifeGEtP{2<7FBWFBLhC`$<(jPxRe1Q8YUod;BgyTl!tWA2fNSYIs|G zmN%%tgBsn?!H(VGWEzuwKy=36z#2ws7K0B>^K^B( zaa9Iiz_txaRApexmgU+V%zw`1fK~jXvAX_F8RE2?%Aw_FjA5|8qiU^)dr5c|i`w3B z-s^2jU=6&{UFz33G{1uS4`MKeuG`BgTHy4vL!xX!kT96jLs*$ZRjusa*VnG;&q_&< x`4yw@U=33}m51@DKa8BrLuENCwn@QC^m^y5oia2WpJ;J#I$y?h_ZGC!_y)udVJiRt diff --git a/sei-db/db_engine/litt/test/testdata/v3/test/segments/4.metadata b/sei-db/db_engine/litt/test/testdata/v3/test/segments/4.metadata index 125f4bb0080090253ea8173f828797ad42a056c0..b0edd792d056a4509450644acf4d8081f83ef1e5 100644 GIT binary patch literal 18 YcmZQzU|?pE*eG!5)0zDq3_!pL04NItX#fBK literal 18 ZcmZQzU|?pE*s%NHy10893=9k+i~uRS1aklY diff --git a/sei-db/db_engine/litt/types/key_kind.go b/sei-db/db_engine/litt/types/key_kind.go new file mode 100644 index 0000000000..18551ed8ef --- /dev/null +++ b/sei-db/db_engine/litt/types/key_kind.go @@ -0,0 +1,42 @@ +package types + +// KeyKind tags each record in the per-segment key file. It distinguishes primary keys from secondary +// keys (which alias sub-ranges of another key's value bytes) and also delimits "groups" written by a +// single Put, used at recovery time to discard torn writes atomically. +// +// Layout on disk: each key-file record begins with a single KeyKind byte. Values 4-255 are reserved +// for future record kinds. +type KeyKind uint8 + +const ( + // KeyKindStandalone is a primary key whose Put did not include any secondary keys. The zero + // value is the default so any ScopedKey constructed without an explicit Kind is treated as an + // ordinary primary key. + KeyKindStandalone KeyKind = 0 + // KeyKindPrimary is a primary key whose Put included at least one secondary; the secondaries + // appear contiguously in the key file immediately after this record and terminate with a + // KeyKindFinalSecondary record. + KeyKindPrimary KeyKind = 1 + // KeyKindSecondary is a secondary key that is not the last secondary in its group. + KeyKindSecondary KeyKind = 2 + // KeyKindFinalSecondary is the last secondary in a group; it terminates the group and signals + // that the group is fully written. + KeyKindFinalSecondary KeyKind = 3 +) + +// IsPrimary reports whether this kind tags a primary key (KeyKindStandalone or KeyKindPrimary). +func (k KeyKind) IsPrimary() bool { + return k == KeyKindStandalone || k == KeyKindPrimary +} + +// IsSecondary reports whether this kind tags a secondary key (KeyKindSecondary or KeyKindFinalSecondary). +func (k KeyKind) IsSecondary() bool { + return k == KeyKindSecondary || k == KeyKindFinalSecondary +} + +// TerminatesGroup reports whether this kind closes its group. Recovery commits a buffered group +// when it encounters a record whose kind terminates the group (KeyKindStandalone or +// KeyKindFinalSecondary). +func (k KeyKind) TerminatesGroup() bool { + return k == KeyKindStandalone || k == KeyKindFinalSecondary +} diff --git a/sei-db/db_engine/litt/types/kv_pair.go b/sei-db/db_engine/litt/types/kv_pair.go deleted file mode 100644 index 7fbc8bf74e..0000000000 --- a/sei-db/db_engine/litt/types/kv_pair.go +++ /dev/null @@ -1,9 +0,0 @@ -package types - -// KVPair represents a key-value pair. -type KVPair struct { - // Key is the key. - Key []byte - // Value is the value. - Value []byte -} diff --git a/sei-db/db_engine/litt/types/put_request.go b/sei-db/db_engine/litt/types/put_request.go new file mode 100644 index 0000000000..40d2166a8c --- /dev/null +++ b/sei-db/db_engine/litt/types/put_request.go @@ -0,0 +1,11 @@ +package types + +// A request to put a key-value pair with optional secondary keys into the database. +type PutRequest struct { + // Key is the primary key. + Key []byte + // Value is the value to put. Only written once, even if secondary keys are provided. + Value []byte + // Secondary keys pointing to sub-ranges of the value. May be nil. + SecondaryKeys []*SecondaryKey +} diff --git a/sei-db/db_engine/litt/types/scoped_key.go b/sei-db/db_engine/litt/types/scoped_key.go index b785248f40..7dbb6253f1 100644 --- a/sei-db/db_engine/litt/types/scoped_key.go +++ b/sei-db/db_engine/litt/types/scoped_key.go @@ -7,4 +7,9 @@ type ScopedKey struct { Key []byte // The location where the value associated with the key is stored. Address Address + // Kind tags the record's role in the key file: ordinary primary, primary with secondaries to + // follow, or one of the secondaries that follow such a primary. The zero value + // (KeyKindStandalone) means an ordinary primary key, so call sites that do not care about + // secondary keys can construct ScopedKey literals as before. + Kind KeyKind } diff --git a/sei-db/db_engine/litt/types/secondary_key.go b/sei-db/db_engine/litt/types/secondary_key.go new file mode 100644 index 0000000000..474cf99dcd --- /dev/null +++ b/sei-db/db_engine/litt/types/secondary_key.go @@ -0,0 +1,15 @@ +package types + +// A SecondaryKey is used to access specific parts of a value with direct lookups (i.e. without needing to read the +// entire value into memory). It can also be used to alias the entire value to a different key. +type SecondaryKey struct { + // A key in the DB. Similar to primary keys, secondary keys must be globally unique and cannot be modified after + // creation (other than being deleted when the TTL expires). + Key []byte + // The offset of the start of the byte range described by the secondary key. Must be less than or equal to the + // length of the full value associated with the key. + Offset uint32 + // The length of the byte range described by the secondary key. Offset+Length must be less than or equal to the + // length of the full value associated with the key. + Length uint32 +} From 6724f2861ad55fcb538c7c894517d69704222b01 Mon Sep 17 00:00:00 2001 From: Cody Littley Date: Fri, 22 May 2026 14:33:54 -0500 Subject: [PATCH 3/5] fix test flake --- .../db_engine/litt/benchmark/data_tracker.go | 79 ++++++++++++++++--- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/sei-db/db_engine/litt/benchmark/data_tracker.go b/sei-db/db_engine/litt/benchmark/data_tracker.go index e72e8e7478..fa9c965e61 100644 --- a/sei-db/db_engine/litt/benchmark/data_tracker.go +++ b/sei-db/db_engine/litt/benchmark/data_tracker.go @@ -8,6 +8,7 @@ import ( "os" "path" "strings" + "sync" "time" "github.com/sei-protocol/sei-chain/sei-db/common/unit" @@ -94,9 +95,19 @@ type DataTracker struct { // The size of the values in bytes for new cohorts. valueSize uint64 - // This channel has capacity one and initially has one value in it. This value is drained when the DataTracker is - // fully stopped. Other threads can use this to block until the DataTracker is fully stopped. - closedChan chan struct{} + // Closed by Close() to signal the data generator goroutine to drain any pending + // reports from writtenKeyIndicesChan and then exit. This is what makes Close() + // deterministic with respect to ReportWrite: any ReportWrite call that has + // returned before Close() is invoked is guaranteed to be processed by the time + // Close() returns. + shutdownChan chan struct{} + + // Closed by the data generator goroutine when it has fully exited. Used by + // Close() to wait for the goroutine to finish. + doneChan chan struct{} + + // Ensures Close() is idempotent. + closeOnce sync.Once // Used to handle fatal errors in the DataTracker. errorMonitor *util.ErrorMonitor @@ -165,9 +176,6 @@ func NewDataTracker( safetyMargin := time.Duration(config.ReadSafetyMarginMinutes * float64(time.Minute)) safeTTL := ttl - safetyMargin - closedChan := make(chan struct{}, 1) - closedChan <- struct{}{} // Will be drained when the DataTracker is closed. - ctx, cancel := context.WithCancel(ctx) tracker := &DataTracker{ @@ -190,7 +198,8 @@ func NewDataTracker( safeTTL: safeTTL, valueSize: valueSize, generator: NewDataGenerator(config.Seed, config.RandomPoolSize), - closedChan: closedChan, + shutdownChan: make(chan struct{}), + doneChan: make(chan struct{}), errorMonitor: errorMonitor, } @@ -298,12 +307,17 @@ func (t *DataTracker) GetWriteInfo() *WriteInfo { } // ReportWrite is called when a key has been written to the database. This means that the key is now safe to be read. +// Reports submitted before Close() is invoked are guaranteed to be processed by the time Close() returns. +// Once shutdown has been initiated (via Close() or context cancellation), subsequent ReportWrite calls return +// immediately without queuing the report. func (t *DataTracker) ReportWrite(index uint64) { select { case t.writtenKeyIndicesChan <- index: return case <-t.ctx.Done(): return + case <-t.shutdownChan: + return } } @@ -331,11 +345,24 @@ func (t *DataTracker) GetReadInfoWithTimeout(timeout time.Duration) *ReadInfo { } } -// Close stops the key manager's background tasks. +// Close stops the key manager's background tasks. Close is idempotent. +// +// Close performs a graceful shutdown: any ReportWrite call that returned before Close was invoked +// is guaranteed to be processed (and any cohort completion side effects persisted) by the time +// Close returns. After Close starts, additional ReportWrite calls return immediately without +// queuing the report. func (t *DataTracker) Close() { - t.cancel() - t.closedChan <- struct{}{} - <-t.closedChan + t.closeOnce.Do(func() { + // Signal the data generator goroutine to drain pending reports and exit. Without this + // step, cancelling the context first could cause the goroutine to exit before + // processing reports that ReportWrite already enqueued, leaving on-disk cohort + // completion state inconsistent with what the caller has reported. + close(t.shutdownChan) + <-t.doneChan + // Cancel the context to unblock any callers still waiting in + // GetWriteInfo/GetReadInfo/GetReadInfoWithTimeout. + t.cancel() + }) } // dataGenerator is responsible for generating data in the background. @@ -343,7 +370,7 @@ func (t *DataTracker) dataGenerator() { ticker := time.NewTicker(time.Duration(t.config.CohortGCPeriodSeconds * float64(time.Second))) defer func() { ticker.Stop() - <-t.closedChan + close(t.doneChan) }() nextWriteInfo := t.generateNextWriteInfo() @@ -360,6 +387,10 @@ func (t *DataTracker) dataGenerator() { return case <-t.ctx.Done(): return + case <-t.shutdownChan: + // graceful shutdown initiated by Close() + t.drainPendingReports() + return case keyIndex := <-t.writtenKeyIndicesChan: // track keys that have been written so that we can read them in the future t.handleWrittenKey(keyIndex) @@ -381,6 +412,10 @@ func (t *DataTracker) dataGenerator() { return case <-t.ctx.Done(): return + case <-t.shutdownChan: + // graceful shutdown initiated by Close() + t.drainPendingReports() + return case keyIndex := <-t.writtenKeyIndicesChan: // track keys that have been written so that we can read them in the future t.handleWrittenKey(keyIndex) @@ -398,6 +433,26 @@ func (t *DataTracker) dataGenerator() { } } +// drainPendingReports processes any reports currently buffered in writtenKeyIndicesChan. +// Called by the data generator goroutine during graceful shutdown so that any ReportWrite +// call that returned before Close() is fully processed (i.e. cohort completion state +// persisted to disk reflects those reports). +// +// This is a best-effort drain of the channel buffer at the moment the goroutine sees the +// shutdown signal. ReportWrite stops accepting new submissions once shutdownChan is closed +// (and Close itself runs after all caller-side ReportWrite calls have returned), so by the +// time we get here the buffer is bounded and no new items can be enqueued. +func (t *DataTracker) drainPendingReports() { + for { + select { + case keyIndex := <-t.writtenKeyIndicesChan: + t.handleWrittenKey(keyIndex) + default: + return + } + } +} + // handleWrittenKey handles a key that has been written to the database. func (t *DataTracker) handleWrittenKey(keyIndex uint64) { // Add key index to the set of written keys we are tracking. From ebcdaf04863c60db1a0ee97bac09a7df9c38ba65 Mon Sep 17 00:00:00 2001 From: Cody Littley Date: Fri, 22 May 2026 15:12:25 -0500 Subject: [PATCH 4/5] made suggested changes --- sei-db/db_engine/litt/README.md | 42 ++++++++++++------------- sei-db/db_engine/litt/table.go | 7 ++--- sei-db/db_engine/litt/types/key_kind.go | 17 ---------- 3 files changed, 23 insertions(+), 43 deletions(-) diff --git a/sei-db/db_engine/litt/README.md b/sei-db/db_engine/litt/README.md index d3492e844c..061203e615 100644 --- a/sei-db/db_engine/litt/README.md +++ b/sei-db/db_engine/litt/README.md @@ -14,8 +14,8 @@ - [Configuration Options](#configuration-options) - [CLI](#littdb-cli) - [Definitions](#definitions) -- [Architecture](docsrchitecture.md) -- [Filesystem Layout](docsilesystem_layout.md) +- [Architecture](docs/architecture.md) +- [Filesystem Layout](docs/filesystem_layout.md) # What is LittDB? @@ -107,10 +107,10 @@ Source: [db.go](db.go) ```go type DB interface { -GetTable(name string) (Table, error) -DropTable(name string) error -Stop() error -Destroy() error + GetTable(name string) (Table, error) + DropTable(name string) error + Stop() error + Destroy() error } ``` @@ -118,15 +118,15 @@ Source: [table.go](table.go) ```go type Table interface { -Name() string -Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error -PutBatch(batch []*types.PutRequest) error -Get(key []byte) ([]byte, bool, error) -Exists(key []byte) (bool, error) -Flush() error -Size() uint64 -SetTTL(ttl time.Duration) error -SetCacheSize(size uint64) error + Name() string + Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error + PutBatch(batch []*types.PutRequest) error + Get(key []byte) ([]byte, bool, error) + Exists(key []byte) (bool, error) + Flush() error + Size() uint64 + SetTTL(ttl time.Duration) error + SetCacheSize(size uint64) error } ``` @@ -165,17 +165,17 @@ Below is a functional example showing how to use LittDB. // Configure and build the database. config, err := littbuilder.DefaultConfig("path/to/where/data/is/stored") if err != nil { -return err + return err } db, err := config.Build(context.Background()) if err != nil { -return err + return err } myTable, err := db.GetTable("my-table") // this code works if the table is new or if the table already exists if err != nil { -return err + return err } // Write a key-value pair to the table. @@ -184,13 +184,13 @@ value := []byte("this is a value") err = myTable.Put(key, value) if err != nil { -return err + return err } // Flush the data to disk. err = myTable.Flush() if err != nil { -return err + return err } // Congratulations! Your data is now durable on disk. @@ -198,7 +198,7 @@ return err // Read the value back. This works before or after a flush. val, ok, err := myTable.Get(key) if err != nil { -return err + return err } ``` diff --git a/sei-db/db_engine/litt/table.go b/sei-db/db_engine/litt/table.go index 9394831c74..4bfa2dc210 100644 --- a/sei-db/db_engine/litt/table.go +++ b/sei-db/db_engine/litt/table.go @@ -38,12 +38,9 @@ type Table interface { Put(key []byte, value []byte, secondaryKeys ...*types.SecondaryKey) error // PutBatch stores multiple values in the database. Similar to Put, but allows for multiple values to be written - // at once. This may improve performance, but it otherwise has identical properties to a sequence of Put calls - // (i.e. this method does not atomically write the entire batch). + // at once, which may improve performance. // - // Each PutRequest may include zero or more secondary keys (see Put for semantics). Validation - // is per-request: a request with any invalid secondary keys is rejected without applying any - // part of that request, but other requests in the batch may still be applied. + // Each PutRequest may include zero or more secondary keys (see Put for semantics). // // The maximum size of a key (primary or secondary) is 64 KiB (2^16 - 1 bytes). The maximum size // of a value is 2^32 bytes. This database has been optimized under the assumption that values diff --git a/sei-db/db_engine/litt/types/key_kind.go b/sei-db/db_engine/litt/types/key_kind.go index 18551ed8ef..2eba5957f9 100644 --- a/sei-db/db_engine/litt/types/key_kind.go +++ b/sei-db/db_engine/litt/types/key_kind.go @@ -23,20 +23,3 @@ const ( // that the group is fully written. KeyKindFinalSecondary KeyKind = 3 ) - -// IsPrimary reports whether this kind tags a primary key (KeyKindStandalone or KeyKindPrimary). -func (k KeyKind) IsPrimary() bool { - return k == KeyKindStandalone || k == KeyKindPrimary -} - -// IsSecondary reports whether this kind tags a secondary key (KeyKindSecondary or KeyKindFinalSecondary). -func (k KeyKind) IsSecondary() bool { - return k == KeyKindSecondary || k == KeyKindFinalSecondary -} - -// TerminatesGroup reports whether this kind closes its group. Recovery commits a buffered group -// when it encounters a record whose kind terminates the group (KeyKindStandalone or -// KeyKindFinalSecondary). -func (k KeyKind) TerminatesGroup() bool { - return k == KeyKindStandalone || k == KeyKindFinalSecondary -} From fa21576f83342b70d8b46ebada94ef269209d47c Mon Sep 17 00:00:00 2001 From: Cody Littley Date: Fri, 22 May 2026 15:17:59 -0500 Subject: [PATCH 5/5] fix memtable semantics --- sei-db/db_engine/litt/memtable/mem_table.go | 61 ++++++++++++++++++++- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/sei-db/db_engine/litt/memtable/mem_table.go b/sei-db/db_engine/litt/memtable/mem_table.go index 2b16d8facc..2fbb5f875b 100644 --- a/sei-db/db_engine/litt/memtable/mem_table.go +++ b/sei-db/db_engine/litt/memtable/mem_table.go @@ -148,12 +148,67 @@ func (m *memTable) Put(key []byte, value []byte, secondaryKeys ...*types.Seconda } func (m *memTable) PutBatch(batch []*types.PutRequest) error { + // Stateless validation pass: matches single-Put validation rules. If any request is + // invalid, the entire batch is rejected before any writes are applied. This mirrors the + // validation-atomic behavior of DiskTable.PutBatch. for _, req := range batch { - err := m.Put(req.Key, req.Value, req.SecondaryKeys...) - if err != nil { - return err + if req.Key == nil { + return fmt.Errorf("nil keys are not supported") + } + if req.Value == nil { + return fmt.Errorf("nil values are not supported") + } + seen := make(map[string]struct{}, 1+len(req.SecondaryKeys)) + seen[string(req.Key)] = struct{}{} + for _, sk := range req.SecondaryKeys { + if sk == nil { + return fmt.Errorf("nil secondary key is not supported") + } + if sk.Key == nil { + return fmt.Errorf("nil secondary key bytes are not supported") + } + end := uint64(sk.Offset) + uint64(sk.Length) + if end > uint64(len(req.Value)) { + return fmt.Errorf( + "secondary key range [%d, %d) exceeds value length %d", sk.Offset, end, len(req.Value)) + } + skKey := string(sk.Key) + if _, dup := seen[skKey]; dup { + return fmt.Errorf("duplicate key %x within PutRequest", sk.Key) + } + seen[skKey] = struct{}{} + } + } + + now := m.clock() + + m.lock.Lock() + defer m.lock.Unlock() + + // Collision pass: ensure no key in any request already exists in the table. Held under the + // same lock as the apply pass, so the batch as a whole succeeds or fails atomically. + for _, req := range batch { + if _, ok := m.data[string(req.Key)]; ok { + return fmt.Errorf("key %x already exists", req.Key) + } + for _, sk := range req.SecondaryKeys { + if _, ok := m.data[string(sk.Key)]; ok { + return fmt.Errorf("secondary key %x already exists", sk.Key) + } } } + + for _, req := range batch { + stringKey := string(req.Key) + m.data[stringKey] = req.Value + m.expirationQueue.Push(&expirationRecord{creationTime: now, key: stringKey}) + for _, sk := range req.SecondaryKeys { + skString := string(sk.Key) + m.data[skString] = req.Value[sk.Offset : sk.Offset+sk.Length] + m.expirationQueue.Push(&expirationRecord{creationTime: now, key: skString}) + } + } + return nil }