From e04092ddbb58440d659421eb8506d61be6ca4b6a Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 11 May 2026 14:05:47 -0400 Subject: [PATCH 01/16] Add Scylla historical state offload prototype --- app/seidb.go | 16 + app/seidb_test.go | 38 +++ go.mod | 3 + go.sum | 8 + sei-db/config/ss_config.go | 21 ++ sei-db/config/toml.go | 11 + sei-db/config/toml_test.go | 4 + sei-db/state_db/ss/offload/consumer/README.md | 73 +++++ .../cmd/historical-scylla-consumer/main.go | 51 +++ sei-db/state_db/ss/offload/consumer/config.go | 54 ++++ .../consumer/config/example-scylla.json | 21 ++ .../state_db/ss/offload/consumer/consumer.go | 297 ++++++++++++++++++ sei-db/state_db/ss/offload/consumer/kafka.go | 115 +++++++ .../ss/offload/consumer/kafka_test.go | 46 +++ .../ss/offload/consumer/schema/scylla.cql | 46 +++ sei-db/state_db/ss/offload/consumer/scylla.go | 201 ++++++++++++ .../ss/offload/consumer/scylla_test.go | 68 ++++ sei-db/state_db/ss/offload/consumer/sink.go | 26 ++ .../state_db/ss/offload/historical/reader.go | 37 +++ .../state_db/ss/offload/historical/scylla.go | 239 ++++++++++++++ .../ss/offload/historical/scylla_test.go | 95 ++++++ .../state_db/ss/offload/historical/store.go | 96 ++++++ .../ss/offload/historical/store_test.go | 103 ++++++ sei-db/state_db/ss/offload/kafka.go | 5 +- sei-db/state_db/ss/store.go | 47 ++- 25 files changed, 1718 insertions(+), 3 deletions(-) create mode 100644 sei-db/state_db/ss/offload/consumer/README.md create mode 100644 sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go create mode 100644 sei-db/state_db/ss/offload/consumer/config.go create mode 100644 sei-db/state_db/ss/offload/consumer/config/example-scylla.json create mode 100644 sei-db/state_db/ss/offload/consumer/consumer.go create mode 100644 sei-db/state_db/ss/offload/consumer/kafka.go create mode 100644 sei-db/state_db/ss/offload/consumer/kafka_test.go create mode 100644 sei-db/state_db/ss/offload/consumer/schema/scylla.cql create mode 100644 
sei-db/state_db/ss/offload/consumer/scylla.go create mode 100644 sei-db/state_db/ss/offload/consumer/scylla_test.go create mode 100644 sei-db/state_db/ss/offload/consumer/sink.go create mode 100644 sei-db/state_db/ss/offload/historical/reader.go create mode 100644 sei-db/state_db/ss/offload/historical/scylla.go create mode 100644 sei-db/state_db/ss/offload/historical/scylla_test.go create mode 100644 sei-db/state_db/ss/offload/historical/store.go create mode 100644 sei-db/state_db/ss/offload/historical/store_test.go diff --git a/app/seidb.go b/app/seidb.go index 4ed31b2d7e..6be013875d 100644 --- a/app/seidb.go +++ b/app/seidb.go @@ -45,6 +45,15 @@ const ( FlagEVMSSSplit = "state-store.evm-ss-split" FlagEVMSSSeparateDBs = "state-store.evm-ss-separate-dbs" + // Historical SS offload fallback. + FlagHistoricalOffloadScyllaHosts = "state-store.historical-offload-scylla-hosts" + FlagHistoricalOffloadScyllaKeyspace = "state-store.historical-offload-scylla-keyspace" + FlagHistoricalOffloadScyllaUsername = "state-store.historical-offload-scylla-username" + FlagHistoricalOffloadScyllaPassword = "state-store.historical-offload-scylla-password" + FlagHistoricalOffloadScyllaDatacenter = "state-store.historical-offload-scylla-datacenter" + FlagHistoricalOffloadScyllaConsistency = "state-store.historical-offload-scylla-consistency" + FlagHistoricalOffloadScyllaTimeoutMS = "state-store.historical-offload-scylla-timeout-ms" + // Other configs FlagSnapshotInterval = "state-sync.snapshot-interval" ) @@ -148,6 +157,13 @@ func parseSSConfigs(appOpts servertypes.AppOptions) config.StateStoreConfig { ssConfig.EVMDBDirectory = cast.ToString(appOpts.Get(FlagEVMSSDirectory)) ssConfig.SeparateEVMSubDBs = cast.ToBool(appOpts.Get(FlagEVMSSSeparateDBs)) ssConfig.EVMSplit = cast.ToBool(appOpts.Get(FlagEVMSSSplit)) + ssConfig.HistoricalOffloadScyllaHosts = cast.ToString(appOpts.Get(FlagHistoricalOffloadScyllaHosts)) + ssConfig.HistoricalOffloadScyllaKeyspace = 
cast.ToString(appOpts.Get(FlagHistoricalOffloadScyllaKeyspace)) + ssConfig.HistoricalOffloadScyllaUsername = cast.ToString(appOpts.Get(FlagHistoricalOffloadScyllaUsername)) + ssConfig.HistoricalOffloadScyllaPassword = cast.ToString(appOpts.Get(FlagHistoricalOffloadScyllaPassword)) + ssConfig.HistoricalOffloadScyllaDatacenter = cast.ToString(appOpts.Get(FlagHistoricalOffloadScyllaDatacenter)) + ssConfig.HistoricalOffloadScyllaConsistency = cast.ToString(appOpts.Get(FlagHistoricalOffloadScyllaConsistency)) + ssConfig.HistoricalOffloadScyllaTimeoutMS = cast.ToInt(appOpts.Get(FlagHistoricalOffloadScyllaTimeoutMS)) return ssConfig } diff --git a/app/seidb_test.go b/app/seidb_test.go index 54b3eb6027..d0a0c88dc1 100644 --- a/app/seidb_test.go +++ b/app/seidb_test.go @@ -61,6 +61,20 @@ func (t TestSeiDBAppOpts) Get(s string) interface{} { return defaultSSConfig.EVMSplit case FlagEVMSSSeparateDBs: return defaultSSConfig.SeparateEVMSubDBs + case FlagHistoricalOffloadScyllaHosts: + return defaultSSConfig.HistoricalOffloadScyllaHosts + case FlagHistoricalOffloadScyllaKeyspace: + return defaultSSConfig.HistoricalOffloadScyllaKeyspace + case FlagHistoricalOffloadScyllaUsername: + return defaultSSConfig.HistoricalOffloadScyllaUsername + case FlagHistoricalOffloadScyllaPassword: + return defaultSSConfig.HistoricalOffloadScyllaPassword + case FlagHistoricalOffloadScyllaDatacenter: + return defaultSSConfig.HistoricalOffloadScyllaDatacenter + case FlagHistoricalOffloadScyllaConsistency: + return defaultSSConfig.HistoricalOffloadScyllaConsistency + case FlagHistoricalOffloadScyllaTimeoutMS: + return defaultSSConfig.HistoricalOffloadScyllaTimeoutMS } return nil } @@ -114,6 +128,30 @@ func TestParseSSConfigs_EVMFlags(t *testing.T) { assert.True(t, ssConfig.SeparateEVMSubDBs) } +func TestParseSSConfigs_HistoricalScyllaFlags(t *testing.T) { + appOpts := mapAppOpts{ + FlagSSEnable: true, + FlagHistoricalOffloadScyllaHosts: "10.0.0.1:9042,10.0.0.2:9042", + 
FlagHistoricalOffloadScyllaKeyspace: "sei_history", + FlagHistoricalOffloadScyllaUsername: "sei", + FlagHistoricalOffloadScyllaPassword: "secret", + FlagHistoricalOffloadScyllaDatacenter: "use1", + FlagHistoricalOffloadScyllaConsistency: "local_quorum", + FlagHistoricalOffloadScyllaTimeoutMS: 1500, + FlagSSAsyncWriterBuffer: 0, + } + + ssConfig := parseSSConfigs(appOpts) + assert.True(t, ssConfig.Enable) + assert.Equal(t, "10.0.0.1:9042,10.0.0.2:9042", ssConfig.HistoricalOffloadScyllaHosts) + assert.Equal(t, "sei_history", ssConfig.HistoricalOffloadScyllaKeyspace) + assert.Equal(t, "sei", ssConfig.HistoricalOffloadScyllaUsername) + assert.Equal(t, "secret", ssConfig.HistoricalOffloadScyllaPassword) + assert.Equal(t, "use1", ssConfig.HistoricalOffloadScyllaDatacenter) + assert.Equal(t, "local_quorum", ssConfig.HistoricalOffloadScyllaConsistency) + assert.Equal(t, 1500, ssConfig.HistoricalOffloadScyllaTimeoutMS) +} + func TestParseReceiptConfigs_DefaultsToPebbleWhenUnset(t *testing.T) { receiptConfig, err := config.ReadReceiptConfig(mapAppOpts{}) assert.NoError(t, err) diff --git a/go.mod b/go.mod index db681f7bf4..dae2d00b1a 100644 --- a/go.mod +++ b/go.mod @@ -35,6 +35,7 @@ require ( github.com/golang-jwt/jwt/v4 v4.5.1 github.com/golang/mock v1.7.0-rc.1 github.com/golang/protobuf v1.5.4 + github.com/gocql/gocql v1.7.0 github.com/google/btree v1.1.3 github.com/google/go-cmp v0.7.0 github.com/google/gofuzz v1.2.0 @@ -119,6 +120,7 @@ require ( github.com/emirpasic/gods v1.18.1 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/go-git/go-billy/v5 v5.8.0 // indirect + github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect @@ -128,6 +130,7 @@ require ( github.com/sergi/go-diff 
v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/skeema/knownhosts v1.3.1 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 8ab76b9ab4..b941802165 100644 --- a/go.sum +++ b/go.sum @@ -736,6 +736,7 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bgentry/speakeasy v0.2.0 h1:tgObeVOf8WAvtuAX6DhJ4xks4CFNwPDZiqzGqIHE51E= github.com/bgentry/speakeasy v0.2.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k= github.com/bits-and-blooms/bitset v1.7.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= github.com/bits-and-blooms/bitset v1.14.2/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.17.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= @@ -743,6 +744,7 @@ github.com/bits-and-blooms/bitset v1.24.3 h1:Bte86SlO3lwPQqww+7BE9ZuUCKIjfqnG5jt github.com/bits-and-blooms/bitset v1.24.3/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bketelsen/crypt v0.0.4/go.mod h1:aI6NrJ0pMGgvZKL1iVgXLnfIFJtfV+bKCoqOes/6LfM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/btcsuite/btcd v0.23.2 h1:/YOgUp25sdCnP5ho6Hl3s0E438zlX+Kak7E6TgBgoT0= @@ -1122,6 +1124,8 @@ github.com/goccy/go-json v0.9.11/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MG 
github.com/goccy/go-json v0.10.4/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/gocql/gocql v1.7.0 h1:O+7U7/1gSN7QTEAaMEsJc1Oq2QHXvCWoF3DFK9HDHus= +github.com/gocql/gocql v1.7.0/go.mod h1:vnlvXyFZeLBF0Wy+RS8hrOdbn0UWsWtdg07XJnFxZ+4= github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2 h1:ZpnhV/YsD2/4cESfV5+Hoeu/iUR3ruzNvZ+yQfO03a0= github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -1328,6 +1332,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c h1:6rhixN/i8ZofjG1Y75iExal34USq5p+wiN1tpie8IrU= github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c/go.mod h1:NMPJylDgVpX0MLRlPy15sqSwOFv/U1GZ2m21JhFfek0= github.com/guptarohit/asciigraph v0.5.5/go.mod h1:dYl5wwK4gNsnFf9Zp+l06rFiDZ5YtXM6x7SRWZ3KGag= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= github.com/hashicorp/consul/api v1.3.0/go.mod h1:MmDNSzIMUjNpY/mQ398R4bk2FnqQLoPndWW5VkKPlCE= github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= @@ -2911,6 +2917,8 @@ gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qS gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/gcfg.v1 v1.2.3/go.mod 
h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.62.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= diff --git a/sei-db/config/ss_config.go b/sei-db/config/ss_config.go index 3fe94e750d..09fd69117f 100644 --- a/sei-db/config/ss_config.go +++ b/sei-db/config/ss_config.go @@ -76,6 +76,27 @@ type StateStoreConfig struct { // When true, data is routed to separate DBs by EVM key family while // preserving the same logical store key and full key encoding inside each DB. SeparateEVMSubDBs bool `mapstructure:"evm-separate-dbs"` + + // HistoricalOffloadScyllaHosts enables ScyllaDB/Cassandra fallback reads + // for versions pruned from local SS when non-empty. Hosts are comma-separated + // host[:port] values. + HistoricalOffloadScyllaHosts string `mapstructure:"historical-offload-scylla-hosts"` + + // HistoricalOffloadScyllaKeyspace is the keyspace containing state_mutations. + HistoricalOffloadScyllaKeyspace string `mapstructure:"historical-offload-scylla-keyspace"` + + // HistoricalOffloadScyllaUsername and Password are optional. + HistoricalOffloadScyllaUsername string `mapstructure:"historical-offload-scylla-username"` + HistoricalOffloadScyllaPassword string `mapstructure:"historical-offload-scylla-password"` + + // HistoricalOffloadScyllaDatacenter enables DC-aware routing when set. + HistoricalOffloadScyllaDatacenter string `mapstructure:"historical-offload-scylla-datacenter"` + + // HistoricalOffloadScyllaConsistency defaults to local_quorum when empty. 
+ HistoricalOffloadScyllaConsistency string `mapstructure:"historical-offload-scylla-consistency"` + + // HistoricalOffloadScyllaTimeoutMS defaults in the Scylla reader when zero. + HistoricalOffloadScyllaTimeoutMS int `mapstructure:"historical-offload-scylla-timeout-ms"` } // DefaultStateStoreConfig returns the default StateStoreConfig diff --git a/sei-db/config/toml.go b/sei-db/config/toml.go index eb387cb1d5..68e14fce7a 100644 --- a/sei-db/config/toml.go +++ b/sei-db/config/toml.go @@ -139,6 +139,17 @@ evm-ss-split = {{ .StateStore.EVMSplit }} # When false, all EVM data stays in one DB using the current unified layout. # When true, data is routed to separate DBs while preserving the same evm key prefix format. evm-ss-separate-dbs = {{ .StateStore.SeparateEVMSubDBs }} + +# Optional ScyllaDB/Cassandra historical-state fallback. When hosts are set, +# point reads for versions pruned from local SS fall back to state_mutations in +# the configured keyspace. Iterators still use local SS. +historical-offload-scylla-hosts = "{{ .StateStore.HistoricalOffloadScyllaHosts }}" +historical-offload-scylla-keyspace = "{{ .StateStore.HistoricalOffloadScyllaKeyspace }}" +historical-offload-scylla-username = "{{ .StateStore.HistoricalOffloadScyllaUsername }}" +historical-offload-scylla-password = "{{ .StateStore.HistoricalOffloadScyllaPassword }}" +historical-offload-scylla-datacenter = "{{ .StateStore.HistoricalOffloadScyllaDatacenter }}" +historical-offload-scylla-consistency = "{{ .StateStore.HistoricalOffloadScyllaConsistency }}" +historical-offload-scylla-timeout-ms = {{ .StateStore.HistoricalOffloadScyllaTimeoutMS }} ` // ReceiptStoreConfigTemplate defines the configuration template for receipt-store diff --git a/sei-db/config/toml_test.go b/sei-db/config/toml_test.go index fd0a51f932..b6bd267faf 100644 --- a/sei-db/config/toml_test.go +++ b/sei-db/config/toml_test.go @@ -88,6 +88,10 @@ func TestStateStoreConfigTemplate(t *testing.T) { require.Contains(t, output, 
`evm-ss-db-directory = ""`, "Missing evm-ss-db-directory") require.Contains(t, output, `evm-ss-split = false`, "Missing or incorrect evm-ss-split") require.Contains(t, output, "evm-ss-separate-dbs = false", "Missing or incorrect evm-ss-separate-dbs") + require.Contains(t, output, `historical-offload-scylla-hosts = ""`, "Missing historical Scylla hosts") + require.Contains(t, output, `historical-offload-scylla-keyspace = ""`, "Missing historical Scylla keyspace") + require.Contains(t, output, `historical-offload-scylla-consistency = ""`, "Missing historical Scylla consistency") + require.Contains(t, output, "historical-offload-scylla-timeout-ms = 0", "Missing historical Scylla timeout") } // TestReceiptStoreConfigTemplate verifies that all field paths in the receipt-store TOML template diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md new file mode 100644 index 0000000000..9ccc28b29d --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/README.md @@ -0,0 +1,73 @@ +# Historical Scylla/Cassandra Offload + +This is a prototype historical-state backend for ScyllaDB or Cassandra. + +The intended shape is narrow: + +- local SS remains the hot store for recent state, writes, imports, pruning, and iterators +- Scylla/Cassandra stores immutable MVCC mutation rows for older history +- reads below local SS retention can fall back to Scylla/Cassandra for `Get` and `Has` + +The table layout is built for point reads by `(store_name, state_key, target_version)`: + +```sql +SELECT version, value, deleted +FROM state_mutations +WHERE store_name = ? AND state_key = ? AND version <= ? +ORDER BY version DESC +LIMIT 1; +``` + +Ordered prefix iteration is intentionally not served from Scylla/Cassandra in this prototype. 
+ +## Schema + +Apply the schema once: + +```bash +cqlsh 127.0.0.1 9042 -f sei-db/state_db/ss/offload/consumer/schema/scylla.cql +``` + +For production, edit the keyspace replication in `schema/scylla.cql` to use +`NetworkTopologyStrategy` with the actual datacenter names and replication +factors before applying it. + +## Consumer + +The consumer reads historical offload changelog messages from Kafka and writes +them into Scylla/Cassandra. Kafka offsets are committed only after the sink +write succeeds. + +```bash +go run ./sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer \ + ./sei-db/state_db/ss/offload/consumer/config/example-scylla.json +``` + +The example config is local-dev only. Set real Kafka brokers, Scylla hosts, +keyspace, datacenter, and credentials in your own config. + +## Node Read Fallback + +Enable fallback reads in the node config: + +```toml +[state-store] +historical-offload-scylla-hosts = "10.0.0.1:9042,10.0.0.2:9042" +historical-offload-scylla-keyspace = "sei_history" +historical-offload-scylla-username = "" +historical-offload-scylla-password = "" +historical-offload-scylla-datacenter = "datacenter1" +historical-offload-scylla-consistency = "local_quorum" +historical-offload-scylla-timeout-ms = 2000 +``` + +Fallback activates only for point reads where the requested version is below the +local SS earliest version. Missing rows and tombstones return empty state, same +as local SS. + +## Current Limits + +- No Scylla/Cassandra iterator path. +- No cross-row transaction on ingest; mutation rows are written first and the + version marker is written last, so replay is idempotent after partial failure. +- No automatic schema creation from the binary. 
diff --git a/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go b/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go new file mode 100644 index 0000000000..65020ca354 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go @@ -0,0 +1,51 @@ +package main + +import ( + "context" + "fmt" + "log" + "os" + "os/signal" + "syscall" + "time" + + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/consumer" +) + +func main() { + if len(os.Args) != 2 { + fmt.Fprintf(os.Stderr, "usage: %s \n", os.Args[0]) + os.Exit(2) + } + + cfg, err := consumer.LoadConfig(os.Args[1]) + if err != nil { + log.Fatalf("load config: %v", err) + } + + sink, err := consumer.NewScyllaSink(cfg.Scylla) + if err != nil { + log.Fatalf("open scylla/cassandra sink: %v", err) + } + defer func() { _ = sink.Close() }() + + reader, err := consumer.NewKafkaReader(cfg.Kafka) + if err != nil { + log.Fatalf("open kafka reader: %v", err) + } + defer func() { _ = reader.Close() }() + + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + c := consumer.New(reader, sink, consumer.Options{ + Logf: func(format string, args ...interface{}) { log.Printf(format, args...) 
}, + Workers: cfg.Workers, + ShardBufferSize: cfg.ShardBufferSize, + MaxBatchRecords: cfg.MaxBatchRecords, + BatchMaxWait: time.Duration(cfg.BatchMaxWaitMS) * time.Millisecond, + }) + if err := c.Run(ctx); err != nil { + log.Fatalf("consumer: %v", err) + } +} diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go new file mode 100644 index 0000000000..f381b2c11f --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -0,0 +1,54 @@ +package consumer + +import ( + "encoding/json" + "fmt" + "os" +) + +type Config struct { + Kafka KafkaReaderConfig + Scylla ScyllaConfig + Workers int + ShardBufferSize int + MaxBatchRecords int + BatchMaxWaitMS int +} + +func (c *Config) Validate() error { + if err := c.Kafka.Validate(); err != nil { + return fmt.Errorf("kafka: %w", err) + } + if err := c.Scylla.Validate(); err != nil { + return fmt.Errorf("scylla: %w", err) + } + if c.Workers < 0 { + return fmt.Errorf("workers must be non-negative") + } + if c.ShardBufferSize < 0 { + return fmt.Errorf("shard buffer size must be non-negative") + } + if c.MaxBatchRecords < 0 { + return fmt.Errorf("max batch records must be non-negative") + } + if c.BatchMaxWaitMS < 0 { + return fmt.Errorf("batch max wait ms must be non-negative") + } + return nil +} + +func LoadConfig(path string) (*Config, error) { + // #nosec G304 -- config path is supplied by the operator on the command line. 
+ raw, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read config: %w", err) + } + cfg := &Config{} + if err := json.Unmarshal(raw, cfg); err != nil { + return nil, fmt.Errorf("parse config: %w", err) + } + if err := cfg.Validate(); err != nil { + return nil, err + } + return cfg, nil +} diff --git a/sei-db/state_db/ss/offload/consumer/config/example-scylla.json b/sei-db/state_db/ss/offload/consumer/config/example-scylla.json new file mode 100644 index 0000000000..94779c83e9 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config/example-scylla.json @@ -0,0 +1,21 @@ +{ + "Kafka": { + "Brokers": ["localhost:9092"], + "Topic": "historical-offload", + "GroupID": "historical-scylla", + "StartOffset": "first" + }, + "Scylla": { + "Hosts": ["127.0.0.1:9042"], + "Keyspace": "sei_history", + "Datacenter": "datacenter1", + "Consistency": "local_quorum", + "TimeoutMS": 2000, + "ConnectTimeoutMS": 2000, + "NumConns": 4 + }, + "Workers": 16, + "ShardBufferSize": 128, + "MaxBatchRecords": 16, + "BatchMaxWaitMS": 10 +} diff --git a/sei-db/state_db/ss/offload/consumer/consumer.go b/sei-db/state_db/ss/offload/consumer/consumer.go new file mode 100644 index 0000000000..a0adaaca46 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/consumer.go @@ -0,0 +1,297 @@ +package consumer + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/segmentio/kafka-go" + "golang.org/x/sync/errgroup" +) + +// MessageSource is the subset of *kafka.Reader used by the loop. +type MessageSource interface { + FetchMessage(ctx context.Context) (kafka.Message, error) + CommitMessages(ctx context.Context, msgs ...kafka.Message) error +} + +// Messages are sharded by partition: cross-partition writes parallelize while +// ordering within a partition is preserved. 
+type Consumer struct { + reader MessageSource + sink Sink + logf func(format string, args ...interface{}) + workers int + shardBuf int + batchSize int + batchWait time.Duration + maxAttempts int + baseBackoff time.Duration + maxBackoff time.Duration +} + +const ( + defaultSinkMaxAttempts = 5 + defaultSinkBaseBackoff = 1 * time.Second + defaultSinkMaxBackoff = 30 * time.Second + defaultWorkers = 16 + defaultShardBuffer = 128 + defaultBatchSize = 16 + defaultBatchMaxWait = 10 * time.Millisecond +) + +// Backpressure: when the sink falls behind, ShardBufferSize fills, the fetcher +// blocks, and Kafka stops being polled. Zero values pick defaults. +type Options struct { + Logf func(format string, args ...interface{}) + SinkMaxAttempts int + SinkBaseBackoff time.Duration + SinkMaxBackoff time.Duration + Workers int + ShardBufferSize int + MaxBatchRecords int + BatchMaxWait time.Duration +} + +func New(reader MessageSource, sink Sink, opts Options) *Consumer { + logf := opts.Logf + if logf == nil { + logf = func(string, ...interface{}) {} + } + maxAttempts := opts.SinkMaxAttempts + if maxAttempts <= 0 { + maxAttempts = defaultSinkMaxAttempts + } + base := opts.SinkBaseBackoff + if base <= 0 { + base = defaultSinkBaseBackoff + } + maxBackoff := opts.SinkMaxBackoff + if maxBackoff <= 0 { + maxBackoff = defaultSinkMaxBackoff + } + workers := opts.Workers + if workers <= 0 { + workers = defaultWorkers + } + shardBuf := opts.ShardBufferSize + if shardBuf <= 0 { + shardBuf = defaultShardBuffer + } + batchSize := opts.MaxBatchRecords + if batchSize <= 0 { + batchSize = defaultBatchSize + } + batchWait := opts.BatchMaxWait + if batchWait <= 0 { + batchWait = defaultBatchMaxWait + } + return &Consumer{ + reader: reader, + sink: sink, + logf: logf, + workers: workers, + shardBuf: shardBuf, + batchSize: batchSize, + batchWait: batchWait, + maxAttempts: maxAttempts, + baseBackoff: base, + maxBackoff: maxBackoff, + } +} + +// Run commits offsets only after the sink persists each 
// runParallel fans Kafka messages out to c.workers worker goroutines and
// blocks until all of them, plus the fetcher goroutine, have returned.
//
// Each worker owns one buffered channel (capacity c.shardBuf) and runs
// workerLoop on it. A single fetcher goroutine reads from c.reader and routes
// every message to the channel selected by shardFor(msg.Partition, c.workers),
// so all messages of one partition go to the same worker.
//
// Cancellation errors (see isCancellation) are treated as a clean shutdown and
// yield a nil result; any other error from the fetcher or a worker is returned.
func (c *Consumer) runParallel(ctx context.Context) error {
	// gctx is the errgroup-derived context shared by the fetcher and workers.
	g, gctx := errgroup.WithContext(ctx)
	shards := make([]chan kafka.Message, c.workers)
	for i := range shards {
		shards[i] = make(chan kafka.Message, c.shardBuf)
		// Bind this iteration's channel to a local so the closure below uses it.
		ch := shards[i]
		g.Go(func() error { return c.workerLoop(gctx, ch) })
	}
	g.Go(func() error {
		// The fetcher is the only sender, so it closes every shard channel on
		// exit; workers see the close and return.
		defer func() {
			for _, ch := range shards {
				close(ch)
			}
		}()
		for {
			msg, err := c.reader.FetchMessage(gctx)
			if err != nil {
				// A cancelled fetch is a normal shutdown, not a failure.
				if isCancellation(err) {
					return nil
				}
				return fmt.Errorf("fetch kafka message: %w", err)
			}
			shard := shardFor(msg.Partition, c.workers)
			// Blocks while the shard buffer is full (backpressure), but gives
			// up as soon as the group context is done.
			select {
			case shards[shard] <- msg:
			case <-gctx.Done():
				return nil
			}
		}
	})
	if err := g.Wait(); err != nil && !isCancellation(err) {
		return err
	}
	return nil
}
} + msgs = append(msgs, msg) + case <-timer.C: + return msgs, true + } + } + return msgs, true +} + +func (c *Consumer) processBatch(ctx context.Context, msgs []kafka.Message) error { + records := make([]Record, 0, len(msgs)) + var firstVersion, lastVersion int64 + for i, msg := range msgs { + entry, err := DecodeEntry(msg.Value) + if err != nil { + return fmt.Errorf("decode message at offset %d: %w", msg.Offset, err) + } + if i == 0 { + firstVersion = entry.Version + } + lastVersion = entry.Version + records = append(records, Record{ + Topic: msg.Topic, + Partition: msg.Partition, + Offset: msg.Offset, + Entry: entry, + }) + } + start := time.Now() + if err := c.writeBatchWithRetry(ctx, records); err != nil { + return fmt.Errorf("sink write batch first_version=%d last_version=%d: %w", + firstVersion, lastVersion, err) + } + c.logf("wrote records=%d first_version=%d last_version=%d in %s", + len(records), firstVersion, lastVersion, time.Since(start)) + if err := c.reader.CommitMessages(ctx, msgs...); err != nil { + return fmt.Errorf("commit kafka offsets: %w", err) + } + return nil +} + +func (c *Consumer) writeBatchWithRetry(ctx context.Context, records []Record) error { + backoff := c.baseBackoff + var lastErr error + for attempt := 1; attempt <= c.maxAttempts; attempt++ { + err := c.writeRecords(ctx, records) + if err == nil { + return nil + } + lastErr = err + if isCancellation(err) { + return err + } + if attempt == c.maxAttempts { + break + } + c.logf("sink write attempt %d/%d failed: %v; retrying in %s", + attempt, c.maxAttempts, err, backoff) + select { + case <-time.After(backoff): + case <-ctx.Done(): + return ctx.Err() + } + backoff *= 2 + if backoff > c.maxBackoff { + backoff = c.maxBackoff + } + } + return fmt.Errorf("sink write failed after %d attempts: %w", c.maxAttempts, lastErr) +} + +func (c *Consumer) writeRecords(ctx context.Context, records []Record) error { + if len(records) == 0 { + return nil + } + if sink, ok := c.sink.(BatchSink); ok { + 
// shardFor maps a Kafka partition number to a worker index in [0, workers).
//
// A negative partition is mapped by its magnitude. workers must be positive;
// the function does not guard against zero or negative values.
func shardFor(partition, workers int) int {
	if partition < 0 {
		partition = -partition
	}
	// Negating the minimum int overflows and leaves it negative, which would
	// make a signed remainder negative and produce an out-of-range index.
	// Converting to uint yields the true magnitude in that case and is a no-op
	// for every non-negative value, so the remainder is always in range.
	return int(uint(partition) % uint(workers))
}
type KafkaReaderConfig struct {
	Brokers       []string
	Topic         string
	GroupID       string
	ClientID      string
	Region        string
	StartOffset   string // "first" or "last"; defaults to "first"
	MinBytes      int
	MaxBytes      int
	MaxWait       time.Duration
	TLSEnabled    bool
	SASLMechanism string
}

// ApplyDefaults fills every unset (zero-valued) optional field with its
// default: client ID, start offset "first", a 1-byte minimum fetch, a 10 MiB
// maximum fetch and a 500ms maximum wait. Fields that are already set are left
// untouched.
func (c *KafkaReaderConfig) ApplyDefaults() {
	const (
		fallbackClientID    = "cryptosim-historical-scylla-consumer"
		fallbackStartOffset = "first"
		fallbackMinBytes    = 1
		fallbackMaxBytes    = 10 << 20
		fallbackMaxWait     = 500 * time.Millisecond
	)
	if c.MaxWait == 0 {
		c.MaxWait = fallbackMaxWait
	}
	if c.MaxBytes == 0 {
		c.MaxBytes = fallbackMaxBytes
	}
	if c.MinBytes == 0 {
		c.MinBytes = fallbackMinBytes
	}
	if c.StartOffset == "" {
		c.StartOffset = fallbackStartOffset
	}
	if c.ClientID == "" {
		c.ClientID = fallbackClientID
	}
}

// Validate reports the first missing required field (brokers, topic, group
// ID, checked in that order) or an unsupported start offset. The start offset
// is compared case-insensitively and may be empty, "first" or "last".
func (c *KafkaReaderConfig) Validate() error {
	switch {
	case len(c.Brokers) == 0:
		return fmt.Errorf("kafka brokers are required")
	case c.Topic == "":
		return fmt.Errorf("kafka topic is required")
	case c.GroupID == "":
		return fmt.Errorf("kafka group id is required")
	}

	offset := strings.ToLower(c.StartOffset)
	if offset != "" && offset != "first" && offset != "last" {
		return fmt.Errorf("unsupported kafka start offset %q", c.StartOffset)
	}
	return nil
}
DecodeEntry(payload []byte) (*dbproto.ChangelogEntry, error) { + entry := &dbproto.ChangelogEntry{} + if err := gogoproto.Unmarshal(payload, entry); err != nil { + return nil, fmt.Errorf("decode changelog entry: %w", err) + } + return entry, nil +} diff --git a/sei-db/state_db/ss/offload/consumer/kafka_test.go b/sei-db/state_db/ss/offload/consumer/kafka_test.go new file mode 100644 index 0000000000..7fa6b320e4 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/kafka_test.go @@ -0,0 +1,46 @@ +package consumer + +import ( + "testing" + + "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/stretchr/testify/require" +) + +func TestKafkaReaderConfigApplyDefaults(t *testing.T) { + cfg := KafkaReaderConfig{ + Brokers: []string{"localhost:9092"}, + Topic: "historical-offload", + GroupID: "scylla", + } + cfg.ApplyDefaults() + require.Equal(t, "cryptosim-historical-scylla-consumer", cfg.ClientID) + require.Equal(t, "first", cfg.StartOffset) + require.Equal(t, 1, cfg.MinBytes) + require.Equal(t, 10<<20, cfg.MaxBytes) +} + +func TestKafkaReaderConfigValidate(t *testing.T) { + cfg := KafkaReaderConfig{} + require.ErrorContains(t, cfg.Validate(), "brokers") + cfg = KafkaReaderConfig{Brokers: []string{"x"}} + require.ErrorContains(t, cfg.Validate(), "topic") + cfg = KafkaReaderConfig{Brokers: []string{"x"}, Topic: "t"} + require.ErrorContains(t, cfg.Validate(), "group id") + cfg = KafkaReaderConfig{ + Brokers: []string{"x"}, + Topic: "t", + GroupID: "g", + StartOffset: "middle", + } + require.ErrorContains(t, cfg.Validate(), "start offset") +} + +func TestDecodeEntry(t *testing.T) { + entry := &proto.ChangelogEntry{Version: 7} + payload, err := entry.Marshal() + require.NoError(t, err) + got, err := DecodeEntry(payload) + require.NoError(t, err) + require.Equal(t, int64(7), got.Version) +} diff --git a/sei-db/state_db/ss/offload/consumer/schema/scylla.cql b/sei-db/state_db/ss/offload/consumer/schema/scylla.cql new file mode 100644 index 0000000000..ceb7d3cfc2 --- 
/dev/null +++ b/sei-db/state_db/ss/offload/consumer/schema/scylla.cql @@ -0,0 +1,46 @@ +-- ScyllaDB/Cassandra schema for Sei historical state offload. +-- Apply once before running historical-scylla-consumer. + +CREATE KEYSPACE IF NOT EXISTS sei_history +WITH replication = { + 'class': 'SimpleStrategy', + 'replication_factor': '1' +}; + +-- For production, replace the keyspace replication with +-- NetworkTopologyStrategy and the real datacenter replication factors. + +USE sei_history; + +-- Version markers are written after all mutation rows for a block version. +-- Buckets avoid a single hot partition while keeping LastVersion bounded to +-- 64 small point reads. +CREATE TABLE IF NOT EXISTS state_versions ( + bucket int, + version bigint, + kafka_topic text, + kafka_partition int, + kafka_offset bigint, + ingested_at timestamp, + PRIMARY KEY ((bucket), version) +) WITH CLUSTERING ORDER BY (version DESC); + +-- Historical point lookup: +-- WHERE store_name = ? AND state_key = ? AND version <= ? 
+-- ORDER BY version DESC LIMIT 1 +CREATE TABLE IF NOT EXISTS state_mutations ( + store_name text, + state_key blob, + version bigint, + value blob, + deleted boolean, + PRIMARY KEY ((store_name, state_key), version) +) WITH CLUSTERING ORDER BY (version DESC); + +CREATE TABLE IF NOT EXISTS state_tree_upgrades ( + version bigint, + name text, + rename_from text, + deleted boolean, + PRIMARY KEY ((version), name) +); diff --git a/sei-db/state_db/ss/offload/consumer/scylla.go b/sei-db/state_db/ss/offload/consumer/scylla.go new file mode 100644 index 0000000000..c16744662c --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/scylla.go @@ -0,0 +1,201 @@ +package consumer + +import ( + "context" + "fmt" + "time" + + "github.com/gocql/gocql" + + "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" +) + +type ScyllaConfig struct { + Hosts []string + Keyspace string + Username string + Password string + Datacenter string + Consistency string + TimeoutMS int + ConnectTimeoutMS int + NumConns int +} + +func (c *ScyllaConfig) ApplyDefaults() { + cfg := c.toHistorical() + cfg.ApplyDefaults() + c.Consistency = cfg.Consistency + c.TimeoutMS = int(cfg.Timeout / time.Millisecond) + c.ConnectTimeoutMS = int(cfg.ConnectTimeout / time.Millisecond) + c.NumConns = cfg.NumConns +} + +func (c *ScyllaConfig) Validate() error { + cfg := c.toHistorical() + cfg.ApplyDefaults() + return cfg.Validate() +} + +func (c ScyllaConfig) toHistorical() historical.ScyllaConfig { + return historical.ScyllaConfig{ + Hosts: c.Hosts, + Keyspace: c.Keyspace, + Username: c.Username, + Password: c.Password, + Datacenter: c.Datacenter, + Consistency: c.Consistency, + Timeout: time.Duration(c.TimeoutMS) * time.Millisecond, + ConnectTimeout: time.Duration(c.ConnectTimeoutMS) * time.Millisecond, + NumConns: c.NumConns, + } +} + +type scyllaSink struct { + session *gocql.Session +} + +var _ Sink = (*scyllaSink)(nil) +var _ BatchSink = 
(*scyllaSink)(nil) + +func NewScyllaSink(cfg ScyllaConfig) (Sink, error) { + session, err := historical.OpenScyllaSession(cfg.toHistorical()) + if err != nil { + return nil, err + } + return &scyllaSink{session: session}, nil +} + +func (s *scyllaSink) Close() error { + s.session.Close() + return nil +} + +func (s *scyllaSink) LastVersion(ctx context.Context) (int64, error) { + var maxVersion int64 + for bucket := 0; bucket < historical.VersionBucketCount; bucket++ { + var version int64 + err := s.session.Query(selectLatestVersionCQL, bucket).WithContext(ctx).Scan(&version) + if err != nil { + if err == gocql.ErrNotFound { + continue + } + return 0, fmt.Errorf("read latest scylla/cassandra version bucket %d: %w", bucket, err) + } + if version > maxVersion { + maxVersion = version + } + } + return maxVersion, nil +} + +func (s *scyllaSink) Write(ctx context.Context, rec Record) error { + return s.WriteBatch(ctx, []Record{rec}) +} + +func (s *scyllaSink) WriteBatch(ctx context.Context, records []Record) error { + for _, rec := range compactRecords(records) { + if err := s.writeRecord(ctx, rec); err != nil { + return err + } + } + return nil +} + +func compactRecords(records []Record) []Record { + for _, rec := range records { + if rec.Entry == nil { + out := make([]Record, 0, len(records)) + for _, rec := range records { + if rec.Entry != nil { + out = append(out, rec) + } + } + return out + } + } + return records +} + +func (s *scyllaSink) writeRecord(ctx context.Context, rec Record) error { + entry := rec.Entry + if entry == nil { + return nil + } + version := entry.Version + for _, ncs := range entry.Changesets { + for _, pair := range ncs.Changeset.Pairs { + if err := s.writeMutation(ctx, version, ncs.Name, pair); err != nil { + return err + } + } + } + for _, up := range entry.Upgrades { + if err := s.writeUpgrade(ctx, version, up); err != nil { + return err + } + } + if err := s.session.Query(insertVersionCQL, + historical.VersionBucket(version), + version, + 
rec.Topic, + rec.Partition, + rec.Offset, + time.Now(), + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("insert scylla/cassandra version %d: %w", version, err) + } + return nil +} + +func (s *scyllaSink) writeMutation(ctx context.Context, version int64, storeName string, pair *proto.KVPair) error { + deleted := pair.Delete || pair.Value == nil + value := pair.Value + if deleted { + value = nil + } + if err := s.session.Query(insertMutationCQL, + storeName, + pair.Key, + version, + value, + deleted, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("insert scylla/cassandra mutation store=%s version=%d: %w", storeName, version, err) + } + return nil +} + +func (s *scyllaSink) writeUpgrade(ctx context.Context, version int64, up *proto.TreeNameUpgrade) error { + if err := s.session.Query(insertUpgradeCQL, + version, + up.Name, + up.RenameFrom, + up.Delete, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("insert scylla/cassandra tree upgrade version=%d name=%s: %w", version, up.Name, err) + } + return nil +} + +const selectLatestVersionCQL = ` +SELECT version +FROM state_versions +WHERE bucket = ? 
+LIMIT 1` + +const insertVersionCQL = ` +INSERT INTO state_versions ( + bucket, version, kafka_topic, kafka_partition, kafka_offset, ingested_at +) VALUES (?, ?, ?, ?, ?, ?)` + +const insertMutationCQL = ` +INSERT INTO state_mutations ( + store_name, state_key, version, value, deleted +) VALUES (?, ?, ?, ?, ?)` + +const insertUpgradeCQL = ` +INSERT INTO state_tree_upgrades ( + version, name, rename_from, deleted +) VALUES (?, ?, ?, ?)` diff --git a/sei-db/state_db/ss/offload/consumer/scylla_test.go b/sei-db/state_db/ss/offload/consumer/scylla_test.go new file mode 100644 index 0000000000..b80b409aa8 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/scylla_test.go @@ -0,0 +1,68 @@ +package consumer + +import ( + "strings" + "testing" + + "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/stretchr/testify/require" +) + +func TestScyllaConfigValidate(t *testing.T) { + cfg := ScyllaConfig{ + Hosts: []string{"127.0.0.1"}, + Keyspace: "sei_history", + } + require.NoError(t, cfg.Validate()) + + cfg.TimeoutMS = -1 + require.ErrorContains(t, cfg.Validate(), "timeout") +} + +func TestScyllaConfigApplyDefaults(t *testing.T) { + cfg := ScyllaConfig{ + Hosts: []string{"127.0.0.1"}, + Keyspace: "sei_history", + } + cfg.ApplyDefaults() + require.Equal(t, "local_quorum", cfg.Consistency) + require.Equal(t, 2000, cfg.TimeoutMS) + require.Equal(t, 2000, cfg.ConnectTimeoutMS) + require.Equal(t, 4, cfg.NumConns) +} + +func TestCompactRecordsDropsNilEntries(t *testing.T) { + records := compactRecords([]Record{ + {Entry: &proto.ChangelogEntry{Version: 1}}, + {}, + {Entry: &proto.ChangelogEntry{Version: 2}}, + }) + require.Len(t, records, 2) + require.Equal(t, int64(1), records[0].Entry.Version) + require.Equal(t, int64(2), records[1].Entry.Version) +} + +func TestScyllaCQLShape(t *testing.T) { + for _, frag := range []string{ + "INSERT INTO state_mutations", + "store_name", + "state_key", + "version", + "value", + "deleted", + } { + require.Contains(t, 
insertMutationCQL, frag) + } + for _, frag := range []string{ + "INSERT INTO state_versions", + "bucket", + "version", + "kafka_topic", + "kafka_partition", + "kafka_offset", + "ingested_at", + } { + require.Contains(t, insertVersionCQL, frag) + } + require.True(t, strings.Contains(selectLatestVersionCQL, "LIMIT 1")) +} diff --git a/sei-db/state_db/ss/offload/consumer/sink.go b/sei-db/state_db/ss/offload/consumer/sink.go new file mode 100644 index 0000000000..79da24d0cd --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/sink.go @@ -0,0 +1,26 @@ +package consumer + +import ( + "context" + + dbproto "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +// Topic/Partition/Offset are kept alongside Entry so sinks can be idempotent +// across replayed Kafka messages. +type Record struct { + Topic string + Partition int + Offset int64 + Entry *dbproto.ChangelogEntry +} + +type Sink interface { + Write(ctx context.Context, rec Record) error + LastVersion(ctx context.Context) (int64, error) + Close() error +} + +type BatchSink interface { + WriteBatch(ctx context.Context, records []Record) error +} diff --git a/sei-db/state_db/ss/offload/historical/reader.go b/sei-db/state_db/ss/offload/historical/reader.go new file mode 100644 index 0000000000..7dfdfee993 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/reader.go @@ -0,0 +1,37 @@ +// Package historical reads MVCC state from an external historical store. +package historical + +import ( + "context" + "errors" +) + +var ErrNotFound = errors.New("historical state not found") + +// Key is a string so Lookup is usable as a map key. +type Lookup struct { + StoreName string + Key string +} + +// Value is the actual MVCC value that satisfied the lookup. +// Version may be older than the requested target version. +type Value struct { + Bytes []byte + Version int64 +} + +type Reader interface { + // Get returns ErrNotFound if no row exists at or before targetVersion, + // or if the latest such row is a tombstone. 
+ Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) + + // Has skips value transfer and returns false for missing or tombstoned keys. + Has(ctx context.Context, storeName string, key []byte, targetVersion int64) (bool, error) + + // BatchGet returns only found, non-tombstoned lookups. + BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) + + LastVersion(ctx context.Context) (int64, error) + Close() error +} diff --git a/sei-db/state_db/ss/offload/historical/scylla.go b/sei-db/state_db/ss/offload/historical/scylla.go new file mode 100644 index 0000000000..5d48c8a778 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/scylla.go @@ -0,0 +1,239 @@ +package historical + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/gocql/gocql" +) + +const ( + defaultScyllaConsistency = "local_quorum" + defaultScyllaTimeout = 2 * time.Second + defaultScyllaNumConns = 4 + + // VersionBucketCount spreads monotonically increasing block-version markers + // across a bounded set of partitions while keeping LastVersion cheap. 
+ VersionBucketCount = 64 +) + +type ScyllaConfig struct { + Hosts []string + Keyspace string + Username string + Password string + Datacenter string + Consistency string + Timeout time.Duration + ConnectTimeout time.Duration + NumConns int +} + +func (c *ScyllaConfig) ApplyDefaults() { + if c.Consistency == "" { + c.Consistency = defaultScyllaConsistency + } + if c.Timeout == 0 { + c.Timeout = defaultScyllaTimeout + } + if c.ConnectTimeout == 0 { + c.ConnectTimeout = defaultScyllaTimeout + } + if c.NumConns == 0 { + c.NumConns = defaultScyllaNumConns + } +} + +func (c *ScyllaConfig) Validate() error { + if len(c.Hosts) == 0 { + return fmt.Errorf("scylla/cassandra hosts are required") + } + for _, host := range c.Hosts { + if strings.TrimSpace(host) == "" { + return fmt.Errorf("scylla/cassandra hosts must not contain blanks") + } + } + if strings.TrimSpace(c.Keyspace) == "" { + return fmt.Errorf("scylla/cassandra keyspace is required") + } + if c.Password != "" && c.Username == "" { + return fmt.Errorf("scylla/cassandra username is required when password is set") + } + if _, err := parseConsistency(c.Consistency); err != nil { + return err + } + if c.Timeout < 0 { + return fmt.Errorf("scylla/cassandra timeout must be non-negative") + } + if c.ConnectTimeout < 0 { + return fmt.Errorf("scylla/cassandra connect timeout must be non-negative") + } + if c.NumConns < 0 { + return fmt.Errorf("scylla/cassandra num conns must be non-negative") + } + return nil +} + +func NewScyllaReader(cfg ScyllaConfig) (Reader, error) { + session, err := OpenScyllaSession(cfg) + if err != nil { + return nil, err + } + return &scyllaReader{session: session}, nil +} + +func OpenScyllaSession(cfg ScyllaConfig) (*gocql.Session, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } + consistency, err := parseConsistency(cfg.Consistency) + if err != nil { + return nil, err + } + + cluster := gocql.NewCluster(cfg.Hosts...) 
+ cluster.Keyspace = cfg.Keyspace + cluster.Consistency = consistency + cluster.Timeout = cfg.Timeout + cluster.ConnectTimeout = cfg.ConnectTimeout + cluster.NumConns = cfg.NumConns + if cfg.Username != "" { + cluster.Authenticator = gocql.PasswordAuthenticator{ + Username: cfg.Username, + Password: cfg.Password, + } + } + if cfg.Datacenter != "" { + cluster.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy( + gocql.DCAwareRoundRobinPolicy(cfg.Datacenter), + ) + } + + session, err := cluster.CreateSession() + if err != nil { + return nil, fmt.Errorf("open scylla/cassandra session: %w", err) + } + return session, nil +} + +type scyllaReader struct { + session *gocql.Session +} + +var _ Reader = (*scyllaReader)(nil) + +func (r *scyllaReader) Close() error { + r.session.Close() + return nil +} + +func (r *scyllaReader) LastVersion(ctx context.Context) (int64, error) { + var maxVersion int64 + for bucket := 0; bucket < VersionBucketCount; bucket++ { + var version int64 + err := r.session.Query(selectLatestVersionCQL, bucket).WithContext(ctx).Scan(&version) + if err != nil { + if err == gocql.ErrNotFound { + continue + } + return 0, fmt.Errorf("read latest scylla/cassandra version bucket %d: %w", bucket, err) + } + if version > maxVersion { + maxVersion = version + } + } + return maxVersion, nil +} + +func (r *scyllaReader) Has(ctx context.Context, storeName string, key []byte, targetVersion int64) (bool, error) { + var deleted bool + err := r.session.Query(hasLookupCQL, storeName, key, targetVersion).WithContext(ctx).Scan(&deleted) + if err != nil { + if err == gocql.ErrNotFound { + return false, nil + } + return false, fmt.Errorf("scylla/cassandra has lookup: %w", err) + } + return !deleted, nil +} + +func (r *scyllaReader) Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) { + var ( + version int64 + bz []byte + deleted bool + ) + err := r.session.Query(getLookupCQL, storeName, key, 
targetVersion).WithContext(ctx).Scan(&version, &bz, &deleted) + if err != nil { + if err == gocql.ErrNotFound { + return Value{}, ErrNotFound + } + return Value{}, fmt.Errorf("scylla/cassandra get lookup: %w", err) + } + if deleted { + return Value{}, ErrNotFound + } + return Value{Bytes: bz, Version: version}, nil +} + +func (r *scyllaReader) BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) { + out := make(map[Lookup]Value, len(lookups)) + for _, lookup := range lookups { + value, err := r.Get(ctx, lookup.StoreName, []byte(lookup.Key), targetVersion) + if err != nil { + if err == ErrNotFound { + continue + } + return nil, err + } + out[lookup] = value + } + return out, nil +} + +func VersionBucket(version int64) int { + if version < 0 { + version = -version + } + return int(version % VersionBucketCount) +} + +func parseConsistency(name string) (gocql.Consistency, error) { + switch strings.ToLower(strings.TrimSpace(name)) { + case "one": + return gocql.One, nil + case "local_one": + return gocql.LocalOne, nil + case "quorum": + return gocql.Quorum, nil + case "", "local_quorum": + return gocql.LocalQuorum, nil + case "all": + return gocql.All, nil + default: + return gocql.Any, fmt.Errorf("unsupported scylla/cassandra consistency %q", name) + } +} + +const selectLatestVersionCQL = ` +SELECT version +FROM state_versions +WHERE bucket = ? +LIMIT 1` + +const hasLookupCQL = ` +SELECT deleted +FROM state_mutations +WHERE store_name = ? AND state_key = ? AND version <= ? +ORDER BY version DESC +LIMIT 1` + +const getLookupCQL = ` +SELECT version, value, deleted +FROM state_mutations +WHERE store_name = ? AND state_key = ? AND version <= ? 
+ORDER BY version DESC +LIMIT 1` diff --git a/sei-db/state_db/ss/offload/historical/scylla_test.go b/sei-db/state_db/ss/offload/historical/scylla_test.go new file mode 100644 index 0000000000..8147b44bc4 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/scylla_test.go @@ -0,0 +1,95 @@ +package historical + +import ( + "strings" + "testing" + "time" + + "github.com/gocql/gocql" + "github.com/stretchr/testify/require" +) + +func TestScyllaConfigApplyDefaults(t *testing.T) { + cfg := ScyllaConfig{ + Hosts: []string{"127.0.0.1"}, + Keyspace: "sei_history", + } + cfg.ApplyDefaults() + require.Equal(t, defaultScyllaConsistency, cfg.Consistency) + require.Equal(t, defaultScyllaTimeout, cfg.Timeout) + require.Equal(t, defaultScyllaTimeout, cfg.ConnectTimeout) + require.Equal(t, defaultScyllaNumConns, cfg.NumConns) +} + +func TestScyllaConfigValidate(t *testing.T) { + tests := []struct { + name string + cfg ScyllaConfig + err string + }{ + {"missing hosts", ScyllaConfig{Keyspace: "ks"}, "hosts"}, + {"blank host", ScyllaConfig{Hosts: []string{" "}, Keyspace: "ks"}, "blanks"}, + {"missing keyspace", ScyllaConfig{Hosts: []string{"127.0.0.1"}}, "keyspace"}, + {"password without username", ScyllaConfig{Hosts: []string{"127.0.0.1"}, Keyspace: "ks", Password: "secret"}, "username"}, + {"bad consistency", ScyllaConfig{Hosts: []string{"127.0.0.1"}, Keyspace: "ks", Consistency: "bad"}, "consistency"}, + {"negative timeout", ScyllaConfig{Hosts: []string{"127.0.0.1"}, Keyspace: "ks", Timeout: -time.Second}, "timeout"}, + {"negative connect timeout", ScyllaConfig{Hosts: []string{"127.0.0.1"}, Keyspace: "ks", ConnectTimeout: -time.Second}, "connect"}, + {"negative conns", ScyllaConfig{Hosts: []string{"127.0.0.1"}, Keyspace: "ks", NumConns: -1}, "conns"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + err := tc.cfg.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), tc.err) + }) + } +} + +func TestParseConsistency(t *testing.T) { + tests 
:= []struct { + in string + out gocql.Consistency + }{ + {"", gocql.LocalQuorum}, + {"local_quorum", gocql.LocalQuorum}, + {"LOCAL_ONE", gocql.LocalOne}, + {"one", gocql.One}, + {"quorum", gocql.Quorum}, + {"all", gocql.All}, + } + for _, tc := range tests { + t.Run(tc.in, func(t *testing.T) { + got, err := parseConsistency(tc.in) + require.NoError(t, err) + require.Equal(t, tc.out, got) + }) + } +} + +func TestVersionBucket(t *testing.T) { + require.Equal(t, 0, VersionBucket(0)) + require.Equal(t, 1, VersionBucket(1)) + require.Equal(t, 0, VersionBucket(VersionBucketCount)) + require.Equal(t, 1, VersionBucket(-1)) +} + +func TestPointLookupCQLShape(t *testing.T) { + for _, q := range []string{getLookupCQL, hasLookupCQL} { + for _, frag := range []string{ + "FROM state_mutations", + "store_name = ?", + "state_key = ?", + "version <= ?", + "ORDER BY version DESC", + "LIMIT 1", + } { + require.Contains(t, q, frag) + } + } +} + +func TestLatestVersionCQLShape(t *testing.T) { + require.Contains(t, selectLatestVersionCQL, "FROM state_versions") + require.Contains(t, selectLatestVersionCQL, "bucket = ?") + require.True(t, strings.Contains(selectLatestVersionCQL, "LIMIT 1")) +} diff --git a/sei-db/state_db/ss/offload/historical/store.go b/sei-db/state_db/ss/offload/historical/store.go new file mode 100644 index 0000000000..9c0dc975f6 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/store.go @@ -0,0 +1,96 @@ +package historical + +import ( + "context" + "errors" + + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" + "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +// FallbackStateStore routes pruned point reads to the historical reader. +// Iteration and writes stay on the primary state store. +type FallbackStateStore struct { + primary types.StateStore + reader Reader +} + +var _ types.StateStore = (*FallbackStateStore)(nil) + +// NewFallbackStateStore takes ownership of primary and reader for Close. 
+func NewFallbackStateStore(primary types.StateStore, reader Reader) *FallbackStateStore { + return &FallbackStateStore{primary: primary, reader: reader} +} + +func (s *FallbackStateStore) shouldFallback(version int64) bool { + earliest := s.primary.GetEarliestVersion() + return earliest > 0 && version < earliest +} + +func (s *FallbackStateStore) Get(storeKey string, version int64, key []byte) ([]byte, error) { + if !s.shouldFallback(version) { + return s.primary.Get(storeKey, version, key) + } + v, err := s.reader.Get(context.Background(), storeKey, key, version) + if err != nil { + if errors.Is(err, ErrNotFound) { + return nil, nil + } + return nil, err + } + return v.Bytes, nil +} + +func (s *FallbackStateStore) Has(storeKey string, version int64, key []byte) (bool, error) { + if !s.shouldFallback(version) { + return s.primary.Has(storeKey, version, key) + } + return s.reader.Has(context.Background(), storeKey, key, version) +} + +func (s *FallbackStateStore) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return s.primary.Iterator(storeKey, version, start, end) +} + +func (s *FallbackStateStore) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return s.primary.ReverseIterator(storeKey, version, start, end) +} + +func (s *FallbackStateStore) RawIterate(storeKey string, fn func([]byte, []byte, int64) bool) (bool, error) { + return s.primary.RawIterate(storeKey, fn) +} + +func (s *FallbackStateStore) GetLatestVersion() int64 { return s.primary.GetLatestVersion() } + +func (s *FallbackStateStore) SetLatestVersion(version int64) error { + return s.primary.SetLatestVersion(version) +} + +func (s *FallbackStateStore) GetEarliestVersion() int64 { return s.primary.GetEarliestVersion() } + +func (s *FallbackStateStore) SetEarliestVersion(version int64, ignoreVersion bool) error { + return s.primary.SetEarliestVersion(version, ignoreVersion) +} + +func (s *FallbackStateStore) 
ApplyChangesetSync(version int64, changesets []*proto.NamedChangeSet) error { + return s.primary.ApplyChangesetSync(version, changesets) +} + +func (s *FallbackStateStore) ApplyChangesetAsync(version int64, changesets []*proto.NamedChangeSet) error { + return s.primary.ApplyChangesetAsync(version, changesets) +} + +func (s *FallbackStateStore) Prune(version int64) error { return s.primary.Prune(version) } + +func (s *FallbackStateStore) Import(version int64, ch <-chan types.SnapshotNode) error { + return s.primary.Import(version, ch) +} + +func (s *FallbackStateStore) Close() error { + primaryErr := s.primary.Close() + readerErr := s.reader.Close() + if primaryErr != nil { + return primaryErr + } + return readerErr +} diff --git a/sei-db/state_db/ss/offload/historical/store_test.go b/sei-db/state_db/ss/offload/historical/store_test.go new file mode 100644 index 0000000000..fd7696aed1 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/store_test.go @@ -0,0 +1,103 @@ +package historical + +import ( + "context" + "testing" + + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" + "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/stretchr/testify/require" +) + +type fakeStateStore struct { + earliest int64 + gets int + has int +} + +func (f *fakeStateStore) Get(_ string, _ int64, _ []byte) ([]byte, error) { + f.gets++ + return []byte("primary"), nil +} + +func (f *fakeStateStore) Has(_ string, _ int64, _ []byte) (bool, error) { + f.has++ + return true, nil +} + +func (f *fakeStateStore) Iterator(string, int64, []byte, []byte) (types.DBIterator, error) { + return nil, nil +} + +func (f *fakeStateStore) ReverseIterator(string, int64, []byte, []byte) (types.DBIterator, error) { + return nil, nil +} + +func (f *fakeStateStore) RawIterate(string, func([]byte, []byte, int64) bool) (bool, error) { + return false, nil +} + +func (f *fakeStateStore) GetLatestVersion() int64 { return 0 } +func (f *fakeStateStore) SetLatestVersion(int64) error { return 
nil } +func (f *fakeStateStore) GetEarliestVersion() int64 { return f.earliest } +func (f *fakeStateStore) SetEarliestVersion(version int64, _ bool) error { + f.earliest = version + return nil +} +func (f *fakeStateStore) ApplyChangesetSync(int64, []*proto.NamedChangeSet) error { return nil } +func (f *fakeStateStore) ApplyChangesetAsync(int64, []*proto.NamedChangeSet) error { return nil } +func (f *fakeStateStore) Prune(int64) error { return nil } +func (f *fakeStateStore) Import(int64, <-chan types.SnapshotNode) error { return nil } +func (f *fakeStateStore) Close() error { return nil } + +type fakeReader struct { + gets int + has int +} + +func (f *fakeReader) Get(context.Context, string, []byte, int64) (Value, error) { + f.gets++ + return Value{Bytes: []byte("historical"), Version: 7}, nil +} + +func (f *fakeReader) Has(context.Context, string, []byte, int64) (bool, error) { + f.has++ + return true, nil +} + +func (f *fakeReader) BatchGet(context.Context, int64, []Lookup) (map[Lookup]Value, error) { + return nil, nil +} + +func (f *fakeReader) LastVersion(context.Context) (int64, error) { return 0, nil } +func (f *fakeReader) Close() error { return nil } + +func TestFallbackStateStoreRoutesPrunedPointReads(t *testing.T) { + primary := &fakeStateStore{earliest: 10} + reader := &fakeReader{} + store := NewFallbackStateStore(primary, reader) + + value, err := store.Get("bank", 7, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("historical"), value) + require.Equal(t, 0, primary.gets) + require.Equal(t, 1, reader.gets) + + ok, err := store.Has("bank", 7, []byte("k")) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, 0, primary.has) + require.Equal(t, 1, reader.has) +} + +func TestFallbackStateStoreKeepsRecentPointReadsOnPrimary(t *testing.T) { + primary := &fakeStateStore{earliest: 10} + reader := &fakeReader{} + store := NewFallbackStateStore(primary, reader) + + value, err := store.Get("bank", 10, []byte("k")) + 
require.NoError(t, err) + require.Equal(t, []byte("primary"), value) + require.Equal(t, 1, primary.gets) + require.Equal(t, 0, reader.gets) +} diff --git a/sei-db/state_db/ss/offload/kafka.go b/sei-db/state_db/ss/offload/kafka.go index edbe366818..981cb35e02 100644 --- a/sei-db/state_db/ss/offload/kafka.go +++ b/sei-db/state_db/ss/offload/kafka.go @@ -122,7 +122,7 @@ func NewKafkaStream(cfg KafkaConfig) (Stream, error) { } } - mechanism, err := kafkaSASLMechanism(cfg) + mechanism, err := NewSASLMechanism(cfg) if err != nil { return nil, err } @@ -211,7 +211,8 @@ func kafkaCompression(name string) compress.Compression { } } -func kafkaSASLMechanism(cfg KafkaConfig) (sasl.Mechanism, error) { +// NewSASLMechanism is exported so out-of-package consumers share the producer's auth path. +func NewSASLMechanism(cfg KafkaConfig) (sasl.Mechanism, error) { switch strings.ToLower(cfg.SASLMechanism) { case "", kafkaOptionNone: return nil, nil diff --git a/sei-db/state_db/ss/store.go b/sei-db/state_db/ss/store.go index 0fb4b5184e..dc1ce2edc2 100644 --- a/sei-db/state_db/ss/store.go +++ b/sei-db/state_db/ss/store.go @@ -1,9 +1,14 @@ package ss import ( + "fmt" + "strings" + "time" + "github.com/sei-protocol/sei-chain/sei-db/config" "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/composite" + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" ) // NewStateStore creates a CompositeStateStore which handles both Cosmos and EVM data. @@ -11,5 +16,45 @@ import ( // files in the backend package. When WriteMode/ReadMode are both cosmos_only (the default), // the EVM stores are not opened and the composite store behaves identically to a plain cosmos state store. 
func NewStateStore(homeDir string, ssConfig config.StateStoreConfig) (types.StateStore, error) { - return composite.NewCompositeStateStore(ssConfig, homeDir) + primary, err := composite.NewCompositeStateStore(ssConfig, homeDir) + if err != nil { + return nil, err + } + if !scyllaHistoricalOffloadConfigured(ssConfig) { + return primary, nil + } + reader, err := historical.NewScyllaReader(historical.ScyllaConfig{ + Hosts: splitCSV(ssConfig.HistoricalOffloadScyllaHosts), + Keyspace: ssConfig.HistoricalOffloadScyllaKeyspace, + Username: ssConfig.HistoricalOffloadScyllaUsername, + Password: ssConfig.HistoricalOffloadScyllaPassword, + Datacenter: ssConfig.HistoricalOffloadScyllaDatacenter, + Consistency: ssConfig.HistoricalOffloadScyllaConsistency, + Timeout: time.Duration(ssConfig.HistoricalOffloadScyllaTimeoutMS) * time.Millisecond, + }) + if err != nil { + _ = primary.Close() + return nil, fmt.Errorf("open scylla/cassandra historical offload reader: %w", err) + } + return historical.NewFallbackStateStore(primary, reader), nil +} + +func scyllaHistoricalOffloadConfigured(cfg config.StateStoreConfig) bool { + return strings.TrimSpace(cfg.HistoricalOffloadScyllaHosts) != "" || + strings.TrimSpace(cfg.HistoricalOffloadScyllaKeyspace) != "" +} + +func splitCSV(value string) []string { + if strings.TrimSpace(value) == "" { + return nil + } + parts := strings.Split(value, ",") + out := make([]string, 0, len(parts)) + for _, part := range parts { + part = strings.TrimSpace(part) + if part != "" { + out = append(out, part) + } + } + return out } From 32b55d2b0c910d546f054af7c11fdba7fddfd2f3 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 12 May 2026 12:02:48 -0400 Subject: [PATCH 02/16] Avoid retry timer leak in Scylla consumer --- sei-db/state_db/ss/offload/consumer/consumer.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/consumer.go b/sei-db/state_db/ss/offload/consumer/consumer.go index 
a0adaaca46..a04c42e75e 100644 --- a/sei-db/state_db/ss/offload/consumer/consumer.go +++ b/sei-db/state_db/ss/offload/consumer/consumer.go @@ -257,10 +257,8 @@ func (c *Consumer) writeBatchWithRetry(ctx context.Context, records []Record) er } c.logf("sink write attempt %d/%d failed: %v; retrying in %s", attempt, c.maxAttempts, err, backoff) - select { - case <-time.After(backoff): - case <-ctx.Done(): - return ctx.Err() + if err := sleepWithContext(ctx, backoff); err != nil { + return err } backoff *= 2 if backoff > c.maxBackoff { @@ -270,6 +268,17 @@ func (c *Consumer) writeBatchWithRetry(ctx context.Context, records []Record) er return fmt.Errorf("sink write failed after %d attempts: %w", c.maxAttempts, lastErr) } +func sleepWithContext(ctx context.Context, d time.Duration) error { + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-timer.C: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + func (c *Consumer) writeRecords(ctx context.Context, records []Record) error { if len(records) == 0 { return nil From 6c6f5890114f4664fdd08e08dba51cfd2d7776ae Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 12 May 2026 12:20:53 -0400 Subject: [PATCH 03/16] Parallelize Scylla mutation ingest --- sei-db/state_db/ss/offload/consumer/README.md | 3 +- .../consumer/config/example-scylla.json | 3 +- sei-db/state_db/ss/offload/consumer/scylla.go | 94 ++++++++++++++----- .../ss/offload/consumer/scylla_test.go | 89 ++++++++++++++++++ 4 files changed, 166 insertions(+), 23 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md index 9ccc28b29d..185920d783 100644 --- a/sei-db/state_db/ss/offload/consumer/README.md +++ b/sei-db/state_db/ss/offload/consumer/README.md @@ -36,7 +36,8 @@ factors before applying it. The consumer reads historical offload changelog messages from Kafka and writes them into Scylla/Cassandra. Kafka offsets are committed only after the sink -write succeeds. +write succeeds. 
Within each block, mutation rows are written with bounded +concurrency and the version marker is written last. ```bash go run ./sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer \ diff --git a/sei-db/state_db/ss/offload/consumer/config/example-scylla.json b/sei-db/state_db/ss/offload/consumer/config/example-scylla.json index 94779c83e9..013217af75 100644 --- a/sei-db/state_db/ss/offload/consumer/config/example-scylla.json +++ b/sei-db/state_db/ss/offload/consumer/config/example-scylla.json @@ -12,7 +12,8 @@ "Consistency": "local_quorum", "TimeoutMS": 2000, "ConnectTimeoutMS": 2000, - "NumConns": 4 + "NumConns": 4, + "MutationWorkers": 16 }, "Workers": 16, "ShardBufferSize": 128, diff --git a/sei-db/state_db/ss/offload/consumer/scylla.go b/sei-db/state_db/ss/offload/consumer/scylla.go index c16744662c..ebe0b276eb 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla.go +++ b/sei-db/state_db/ss/offload/consumer/scylla.go @@ -6,11 +6,14 @@ import ( "time" "github.com/gocql/gocql" + "golang.org/x/sync/errgroup" "github.com/sei-protocol/sei-chain/sei-db/proto" "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" ) +const defaultScyllaMutationWorkers = 16 + type ScyllaConfig struct { Hosts []string Keyspace string @@ -21,6 +24,7 @@ type ScyllaConfig struct { TimeoutMS int ConnectTimeoutMS int NumConns int + MutationWorkers int } func (c *ScyllaConfig) ApplyDefaults() { @@ -30,12 +34,21 @@ func (c *ScyllaConfig) ApplyDefaults() { c.TimeoutMS = int(cfg.Timeout / time.Millisecond) c.ConnectTimeoutMS = int(cfg.ConnectTimeout / time.Millisecond) c.NumConns = cfg.NumConns + if c.MutationWorkers == 0 { + c.MutationWorkers = defaultScyllaMutationWorkers + } } func (c *ScyllaConfig) Validate() error { cfg := c.toHistorical() cfg.ApplyDefaults() - return cfg.Validate() + if err := cfg.Validate(); err != nil { + return err + } + if c.MutationWorkers < 0 { + return fmt.Errorf("scylla/cassandra mutation workers must be non-negative") + } + 
return nil } func (c ScyllaConfig) toHistorical() historical.ScyllaConfig { @@ -53,22 +66,34 @@ func (c ScyllaConfig) toHistorical() historical.ScyllaConfig { } type scyllaSink struct { - session *gocql.Session + session *gocql.Session + exec scyllaExecFunc + mutationWorkers int } var _ Sink = (*scyllaSink)(nil) var _ BatchSink = (*scyllaSink)(nil) func NewScyllaSink(cfg ScyllaConfig) (Sink, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } session, err := historical.OpenScyllaSession(cfg.toHistorical()) if err != nil { return nil, err } - return &scyllaSink{session: session}, nil + return &scyllaSink{ + session: session, + exec: sessionExec(session), + mutationWorkers: cfg.MutationWorkers, + }, nil } func (s *scyllaSink) Close() error { - s.session.Close() + if s.session != nil { + s.session.Close() + } return nil } @@ -124,61 +149,88 @@ func (s *scyllaSink) writeRecord(ctx context.Context, rec Record) error { return nil } version := entry.Version - for _, ncs := range entry.Changesets { - for _, pair := range ncs.Changeset.Pairs { - if err := s.writeMutation(ctx, version, ncs.Name, pair); err != nil { - return err - } - } - } - for _, up := range entry.Upgrades { - if err := s.writeUpgrade(ctx, version, up); err != nil { - return err - } + if err := s.writeRecordRows(ctx, version, entry); err != nil { + return err } - if err := s.session.Query(insertVersionCQL, + if err := s.exec(ctx, insertVersionCQL, historical.VersionBucket(version), version, rec.Topic, rec.Partition, rec.Offset, time.Now(), - ).WithContext(ctx).Exec(); err != nil { + ); err != nil { return fmt.Errorf("insert scylla/cassandra version %d: %w", version, err) } return nil } +func (s *scyllaSink) writeRecordRows(ctx context.Context, version int64, entry *proto.ChangelogEntry) error { + g, gctx := errgroup.WithContext(ctx) + g.SetLimit(s.effectiveMutationWorkers()) + for _, ncs := range entry.Changesets { + storeName := ncs.Name + for _, pair := range 
ncs.Changeset.Pairs { + pair := pair + g.Go(func() error { + return s.writeMutation(gctx, version, storeName, pair) + }) + } + } + for _, up := range entry.Upgrades { + up := up + g.Go(func() error { + return s.writeUpgrade(gctx, version, up) + }) + } + return g.Wait() +} + +func (s *scyllaSink) effectiveMutationWorkers() int { + if s.mutationWorkers <= 0 { + return defaultScyllaMutationWorkers + } + return s.mutationWorkers +} + func (s *scyllaSink) writeMutation(ctx context.Context, version int64, storeName string, pair *proto.KVPair) error { deleted := pair.Delete || pair.Value == nil value := pair.Value if deleted { value = nil } - if err := s.session.Query(insertMutationCQL, + if err := s.exec(ctx, insertMutationCQL, storeName, pair.Key, version, value, deleted, - ).WithContext(ctx).Exec(); err != nil { + ); err != nil { return fmt.Errorf("insert scylla/cassandra mutation store=%s version=%d: %w", storeName, version, err) } return nil } func (s *scyllaSink) writeUpgrade(ctx context.Context, version int64, up *proto.TreeNameUpgrade) error { - if err := s.session.Query(insertUpgradeCQL, + if err := s.exec(ctx, insertUpgradeCQL, version, up.Name, up.RenameFrom, up.Delete, - ).WithContext(ctx).Exec(); err != nil { + ); err != nil { return fmt.Errorf("insert scylla/cassandra tree upgrade version=%d name=%s: %w", version, up.Name, err) } return nil } +type scyllaExecFunc func(ctx context.Context, stmt string, values ...interface{}) error + +func sessionExec(session *gocql.Session) scyllaExecFunc { + return func(ctx context.Context, stmt string, values ...interface{}) error { + return session.Query(stmt, values...).WithContext(ctx).Exec() + } +} + const selectLatestVersionCQL = ` SELECT version FROM state_versions diff --git a/sei-db/state_db/ss/offload/consumer/scylla_test.go b/sei-db/state_db/ss/offload/consumer/scylla_test.go index b80b409aa8..60db6acf9a 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla_test.go +++ 
b/sei-db/state_db/ss/offload/consumer/scylla_test.go @@ -1,8 +1,11 @@ package consumer import ( + "context" "strings" + "sync/atomic" "testing" + "time" "github.com/sei-protocol/sei-chain/sei-db/proto" "github.com/stretchr/testify/require" @@ -17,6 +20,10 @@ func TestScyllaConfigValidate(t *testing.T) { cfg.TimeoutMS = -1 require.ErrorContains(t, cfg.Validate(), "timeout") + + cfg.TimeoutMS = 0 + cfg.MutationWorkers = -1 + require.ErrorContains(t, cfg.Validate(), "mutation workers") } func TestScyllaConfigApplyDefaults(t *testing.T) { @@ -29,6 +36,7 @@ func TestScyllaConfigApplyDefaults(t *testing.T) { require.Equal(t, 2000, cfg.TimeoutMS) require.Equal(t, 2000, cfg.ConnectTimeoutMS) require.Equal(t, 4, cfg.NumConns) + require.Equal(t, 16, cfg.MutationWorkers) } func TestCompactRecordsDropsNilEntries(t *testing.T) { @@ -66,3 +74,84 @@ func TestScyllaCQLShape(t *testing.T) { } require.True(t, strings.Contains(selectLatestVersionCQL, "LIMIT 1")) } + +func TestScyllaSinkWritesRowsConcurrentlyBeforeVersionMarker(t *testing.T) { + rowStarted := make(chan struct{}, 8) + releaseRows := make(chan struct{}) + var activeRows atomic.Int32 + var sawConcurrentRows atomic.Bool + var markerBeforeRowsDone atomic.Bool + var versionMarkers atomic.Int32 + + sink := &scyllaSink{ + mutationWorkers: 2, + exec: func(ctx context.Context, stmt string, _ ...interface{}) error { + if strings.Contains(stmt, "state_versions") { + if activeRows.Load() != 0 { + markerBeforeRowsDone.Store(true) + } + versionMarkers.Add(1) + return nil + } + if activeRows.Add(1) > 1 { + sawConcurrentRows.Store(true) + } + rowStarted <- struct{}{} + select { + case <-releaseRows: + case <-ctx.Done(): + activeRows.Add(-1) + return ctx.Err() + } + activeRows.Add(-1) + return nil + }, + } + entry := &proto.ChangelogEntry{ + Version: 7, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("k1"), Value: []byte("v1")}, + {Key: []byte("k2"), Value: 
[]byte("v2")}, + {Key: []byte("k3"), Value: []byte("v3")}, + }}, + }}, + Upgrades: []*proto.TreeNameUpgrade{{Name: "new-store"}}, + } + + errCh := make(chan error, 1) + go func() { + errCh <- sink.writeRecord(context.Background(), Record{Topic: "t", Partition: 1, Offset: 2, Entry: entry}) + }() + + releaseClosed := false + closeRelease := func() { + if !releaseClosed { + close(releaseRows) + releaseClosed = true + } + } + defer closeRelease() + + for i := 0; i < 2; i++ { + select { + case <-rowStarted: + case <-time.After(time.Second): + closeRelease() + t.Fatal("timed out waiting for concurrent row writes") + } + } + require.True(t, sawConcurrentRows.Load()) + require.Equal(t, int32(0), versionMarkers.Load(), "version marker must wait for row writes") + + closeRelease() + select { + case err := <-errCh: + require.NoError(t, err) + case <-time.After(time.Second): + t.Fatal("timed out waiting for record write") + } + require.False(t, markerBeforeRowsDone.Load()) + require.Equal(t, int32(1), versionMarkers.Load()) +} From 4fcd2421f4fd638195aea8c949d640509422ca1f Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 12 May 2026 12:56:20 -0400 Subject: [PATCH 04/16] Parallelize Scylla historical batch reads --- .../state_db/ss/offload/historical/scylla.go | 74 +++++++++++++------ .../ss/offload/historical/scylla_test.go | 73 ++++++++++++++++++ 2 files changed, 125 insertions(+), 22 deletions(-) diff --git a/sei-db/state_db/ss/offload/historical/scylla.go b/sei-db/state_db/ss/offload/historical/scylla.go index 5d48c8a778..bcb98cebba 100644 --- a/sei-db/state_db/ss/offload/historical/scylla.go +++ b/sei-db/state_db/ss/offload/historical/scylla.go @@ -2,17 +2,21 @@ package historical import ( "context" + "errors" "fmt" "strings" + "sync" "time" "github.com/gocql/gocql" + "golang.org/x/sync/errgroup" ) const ( defaultScyllaConsistency = "local_quorum" defaultScyllaTimeout = 2 * time.Second defaultScyllaNumConns = 4 + defaultScyllaReadWorkers = 16 // VersionBucketCount spreads 
monotonically increasing block-version markers // across a bounded set of partitions while keeping LastVersion cheap. @@ -81,7 +85,10 @@ func NewScyllaReader(cfg ScyllaConfig) (Reader, error) { if err != nil { return nil, err } - return &scyllaReader{session: session}, nil + return &scyllaReader{ + session: session, + get: sessionGet(session), + }, nil } func OpenScyllaSession(cfg ScyllaConfig) (*gocql.Session, error) { @@ -121,12 +128,15 @@ func OpenScyllaSession(cfg ScyllaConfig) (*gocql.Session, error) { type scyllaReader struct { session *gocql.Session + get scyllaGetFunc } var _ Reader = (*scyllaReader)(nil) func (r *scyllaReader) Close() error { - r.session.Close() + if r.session != nil { + r.session.Close() + } return nil } @@ -161,35 +171,55 @@ func (r *scyllaReader) Has(ctx context.Context, storeName string, key []byte, ta } func (r *scyllaReader) Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) { - var ( - version int64 - bz []byte - deleted bool - ) - err := r.session.Query(getLookupCQL, storeName, key, targetVersion).WithContext(ctx).Scan(&version, &bz, &deleted) - if err != nil { - if err == gocql.ErrNotFound { + return r.get(ctx, storeName, key, targetVersion) +} + +func sessionGet(session *gocql.Session) scyllaGetFunc { + return func(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) { + var ( + version int64 + bz []byte + deleted bool + ) + err := session.Query(getLookupCQL, storeName, key, targetVersion).WithContext(ctx).Scan(&version, &bz, &deleted) + if err != nil { + if err == gocql.ErrNotFound { + return Value{}, ErrNotFound + } + return Value{}, fmt.Errorf("scylla/cassandra get lookup: %w", err) + } + if deleted { return Value{}, ErrNotFound } - return Value{}, fmt.Errorf("scylla/cassandra get lookup: %w", err) - } - if deleted { - return Value{}, ErrNotFound + return Value{Bytes: bz, Version: version}, nil } - return Value{Bytes: bz, Version: version}, nil } +type 
scyllaGetFunc func(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) + func (r *scyllaReader) BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) { out := make(map[Lookup]Value, len(lookups)) + g, gctx := errgroup.WithContext(ctx) + g.SetLimit(defaultScyllaReadWorkers) + var mu sync.Mutex for _, lookup := range lookups { - value, err := r.Get(ctx, lookup.StoreName, []byte(lookup.Key), targetVersion) - if err != nil { - if err == ErrNotFound { - continue + lookup := lookup + g.Go(func() error { + value, err := r.Get(gctx, lookup.StoreName, []byte(lookup.Key), targetVersion) + if err != nil { + if errors.Is(err, ErrNotFound) { + return nil + } + return err } - return nil, err - } - out[lookup] = value + mu.Lock() + out[lookup] = value + mu.Unlock() + return nil + }) + } + if err := g.Wait(); err != nil { + return nil, err } return out, nil } diff --git a/sei-db/state_db/ss/offload/historical/scylla_test.go b/sei-db/state_db/ss/offload/historical/scylla_test.go index 8147b44bc4..2c5f3a8d42 100644 --- a/sei-db/state_db/ss/offload/historical/scylla_test.go +++ b/sei-db/state_db/ss/offload/historical/scylla_test.go @@ -1,7 +1,9 @@ package historical import ( + "context" "strings" + "sync/atomic" "testing" "time" @@ -93,3 +95,74 @@ func TestLatestVersionCQLShape(t *testing.T) { require.Contains(t, selectLatestVersionCQL, "bucket = ?") require.True(t, strings.Contains(selectLatestVersionCQL, "LIMIT 1")) } + +func TestScyllaReaderBatchGetParallelizesLookups(t *testing.T) { + started := make(chan string, 4) + release := make(chan struct{}) + var active atomic.Int32 + var sawConcurrent atomic.Bool + + reader := &scyllaReader{ + get: func(ctx context.Context, _ string, key []byte, targetVersion int64) (Value, error) { + if active.Add(1) > 1 { + sawConcurrent.Store(true) + } + defer active.Add(-1) + keyString := string(key) + started <- keyString + select { + case <-release: + case <-ctx.Done(): + 
return Value{}, ctx.Err() + } + if keyString == "missing" { + return Value{}, ErrNotFound + } + return Value{Bytes: []byte("value-" + keyString), Version: targetVersion - 1}, nil + }, + } + + errCh := make(chan error, 1) + var got map[Lookup]Value + lookups := []Lookup{ + {StoreName: "bank", Key: "k1"}, + {StoreName: "bank", Key: "missing"}, + {StoreName: "evm", Key: "k2"}, + } + go func() { + var err error + got, err = reader.BatchGet(context.Background(), 10, lookups) + errCh <- err + }() + + releaseClosed := false + closeRelease := func() { + if !releaseClosed { + close(release) + releaseClosed = true + } + } + defer closeRelease() + + for i := 0; i < 2; i++ { + select { + case <-started: + case <-time.After(time.Second): + closeRelease() + t.Fatal("timed out waiting for concurrent lookups") + } + } + require.True(t, sawConcurrent.Load()) + + closeRelease() + select { + case err := <-errCh: + require.NoError(t, err) + case <-time.After(time.Second): + t.Fatal("timed out waiting for batch get") + } + require.Len(t, got, 2) + require.Equal(t, []byte("value-k1"), got[lookups[0]].Bytes) + require.Equal(t, []byte("value-k2"), got[lookups[2]].Bytes) + require.NotContains(t, got, lookups[1]) +} From 0fa1aef5c03e79b41b23344e30d5a67a939bf004 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 13 May 2026 10:08:22 -0400 Subject: [PATCH 05/16] Cache Scylla historical point reads --- .../state_db/ss/offload/historical/store.go | 43 ++++++++++++++++++- .../ss/offload/historical/store_test.go | 21 +++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/sei-db/state_db/ss/offload/historical/store.go b/sei-db/state_db/ss/offload/historical/store.go index 9c0dc975f6..90dc7abdac 100644 --- a/sei-db/state_db/ss/offload/historical/store.go +++ b/sei-db/state_db/ss/offload/historical/store.go @@ -1,25 +1,43 @@ package historical import ( + "bytes" "context" "errors" + lru "github.com/hashicorp/golang-lru/v2" "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" 
"github.com/sei-protocol/sei-chain/sei-db/proto" ) +const ( + defaultHistoricalReadCacheEntries = 64 * 1024 + maxHistoricalReadCacheValueBytes = 64 * 1024 +) + +type historicalReadCacheKey struct { + storeKey string + version int64 + key string +} + // FallbackStateStore routes pruned point reads to the historical reader. // Iteration and writes stay on the primary state store. type FallbackStateStore struct { primary types.StateStore reader Reader + cache *lru.Cache[historicalReadCacheKey, []byte] } var _ types.StateStore = (*FallbackStateStore)(nil) // NewFallbackStateStore takes ownership of primary and reader for Close. func NewFallbackStateStore(primary types.StateStore, reader Reader) *FallbackStateStore { - return &FallbackStateStore{primary: primary, reader: reader} + cache, err := lru.New[historicalReadCacheKey, []byte](defaultHistoricalReadCacheEntries) + if err != nil { + panic(err) + } + return &FallbackStateStore{primary: primary, reader: reader, cache: cache} } func (s *FallbackStateStore) shouldFallback(version int64) bool { @@ -31,6 +49,10 @@ func (s *FallbackStateStore) Get(storeKey string, version int64, key []byte) ([] if !s.shouldFallback(version) { return s.primary.Get(storeKey, version, key) } + cacheKey := historicalReadCacheKey{storeKey: storeKey, version: version, key: string(key)} + if value, ok := s.getCached(cacheKey); ok { + return value, nil + } v, err := s.reader.Get(context.Background(), storeKey, key, version) if err != nil { if errors.Is(err, ErrNotFound) { @@ -38,9 +60,28 @@ func (s *FallbackStateStore) Get(storeKey string, version int64, key []byte) ([] } return nil, err } + s.cacheValue(cacheKey, v.Bytes) return v.Bytes, nil } +func (s *FallbackStateStore) getCached(key historicalReadCacheKey) ([]byte, bool) { + if s.cache == nil { + return nil, false + } + value, ok := s.cache.Get(key) + if !ok { + return nil, false + } + return bytes.Clone(value), true +} + +func (s *FallbackStateStore) cacheValue(key historicalReadCacheKey, 
value []byte) { + if s.cache == nil || value == nil || len(value) > maxHistoricalReadCacheValueBytes { + return + } + s.cache.Add(key, bytes.Clone(value)) +} + func (s *FallbackStateStore) Has(storeKey string, version int64, key []byte) (bool, error) { if !s.shouldFallback(version) { return s.primary.Has(storeKey, version, key) diff --git a/sei-db/state_db/ss/offload/historical/store_test.go b/sei-db/state_db/ss/offload/historical/store_test.go index fd7696aed1..1e3cbfea57 100644 --- a/sei-db/state_db/ss/offload/historical/store_test.go +++ b/sei-db/state_db/ss/offload/historical/store_test.go @@ -101,3 +101,24 @@ func TestFallbackStateStoreKeepsRecentPointReadsOnPrimary(t *testing.T) { require.Equal(t, 1, primary.gets) require.Equal(t, 0, reader.gets) } + +func TestFallbackStateStoreCachesHistoricalPointReads(t *testing.T) { + primary := &fakeStateStore{earliest: 10} + reader := &fakeReader{} + store := NewFallbackStateStore(primary, reader) + + value, err := store.Get("bank", 7, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("historical"), value) + value[0] = 'H' + + value, err = store.Get("bank", 7, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("historical"), value) + value[0] = 'H' + + value, err = store.Get("bank", 7, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("historical"), value) + require.Equal(t, 1, reader.gets) +} From 5ef70bafd2bcc277ecedd37aef8e10082fd7faf2 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 13 May 2026 10:31:25 -0400 Subject: [PATCH 06/16] Enable token-aware Scylla routing --- sei-db/state_db/ss/offload/historical/scylla.go | 14 +++++++++----- .../ss/offload/historical/scylla_test.go | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/sei-db/state_db/ss/offload/historical/scylla.go b/sei-db/state_db/ss/offload/historical/scylla.go index bcb98cebba..686908c343 100644 --- a/sei-db/state_db/ss/offload/historical/scylla.go +++ 
b/sei-db/state_db/ss/offload/historical/scylla.go @@ -113,11 +113,7 @@ func OpenScyllaSession(cfg ScyllaConfig) (*gocql.Session, error) { Password: cfg.Password, } } - if cfg.Datacenter != "" { - cluster.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy( - gocql.DCAwareRoundRobinPolicy(cfg.Datacenter), - ) - } + cluster.PoolConfig.HostSelectionPolicy = scyllaHostSelectionPolicy(cfg.Datacenter) session, err := cluster.CreateSession() if err != nil { @@ -126,6 +122,14 @@ func OpenScyllaSession(cfg ScyllaConfig) (*gocql.Session, error) { return session, nil } +func scyllaHostSelectionPolicy(datacenter string) gocql.HostSelectionPolicy { + datacenter = strings.TrimSpace(datacenter) + if datacenter != "" { + return gocql.TokenAwareHostPolicy(gocql.DCAwareRoundRobinPolicy(datacenter)) + } + return gocql.TokenAwareHostPolicy(gocql.RoundRobinHostPolicy()) +} + type scyllaReader struct { session *gocql.Session get scyllaGetFunc diff --git a/sei-db/state_db/ss/offload/historical/scylla_test.go b/sei-db/state_db/ss/offload/historical/scylla_test.go index 2c5f3a8d42..2699cdf3ea 100644 --- a/sei-db/state_db/ss/offload/historical/scylla_test.go +++ b/sei-db/state_db/ss/offload/historical/scylla_test.go @@ -2,6 +2,7 @@ package historical import ( "context" + "reflect" "strings" "sync/atomic" "testing" @@ -68,6 +69,22 @@ func TestParseConsistency(t *testing.T) { } } +func TestScyllaHostSelectionPolicyIsTokenAware(t *testing.T) { + for _, tc := range []struct { + name string + datacenter string + }{ + {"no datacenter", ""}, + {"with datacenter", "dc1"}, + } { + t.Run(tc.name, func(t *testing.T) { + policy := scyllaHostSelectionPolicy(tc.datacenter) + require.NotNil(t, policy) + require.Contains(t, reflect.TypeOf(policy).String(), "tokenAware") + }) + } +} + func TestVersionBucket(t *testing.T) { require.Equal(t, 0, VersionBucket(0)) require.Equal(t, 1, VersionBucket(1)) From 37d0086494da7beef1475edfa1e4c79f30a6c604 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 13 May 
2026 15:57:40 -0400 Subject: [PATCH 07/16] Compact duplicate Scylla mutation writes --- sei-db/state_db/ss/offload/consumer/scylla.go | 41 +++++++++++++--- .../ss/offload/consumer/scylla_test.go | 49 +++++++++++++++++++ 2 files changed, 82 insertions(+), 8 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/scylla.go b/sei-db/state_db/ss/offload/consumer/scylla.go index ebe0b276eb..fe030c6901 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla.go +++ b/sei-db/state_db/ss/offload/consumer/scylla.go @@ -168,14 +168,11 @@ func (s *scyllaSink) writeRecord(ctx context.Context, rec Record) error { func (s *scyllaSink) writeRecordRows(ctx context.Context, version int64, entry *proto.ChangelogEntry) error { g, gctx := errgroup.WithContext(ctx) g.SetLimit(s.effectiveMutationWorkers()) - for _, ncs := range entry.Changesets { - storeName := ncs.Name - for _, pair := range ncs.Changeset.Pairs { - pair := pair - g.Go(func() error { - return s.writeMutation(gctx, version, storeName, pair) - }) - } + for _, mutation := range compactMutations(entry) { + mutation := mutation + g.Go(func() error { + return s.writeMutation(gctx, version, mutation.storeName, mutation.pair) + }) } for _, up := range entry.Upgrades { up := up @@ -186,6 +183,34 @@ func (s *scyllaSink) writeRecordRows(ctx context.Context, version int64, entry * return g.Wait() } +type scyllaMutation struct { + storeName string + pair *proto.KVPair +} + +type scyllaMutationKey struct { + storeName string + key string +} + +func compactMutations(entry *proto.ChangelogEntry) []scyllaMutation { + mutations := make([]scyllaMutation, 0) + indexByKey := make(map[scyllaMutationKey]int) + for _, ncs := range entry.Changesets { + storeName := ncs.Name + for _, pair := range ncs.Changeset.Pairs { + key := scyllaMutationKey{storeName: storeName, key: string(pair.Key)} + if idx, ok := indexByKey[key]; ok { + mutations[idx].pair = pair + continue + } + indexByKey[key] = len(mutations) + mutations = append(mutations, 
scyllaMutation{storeName: storeName, pair: pair}) + } + } + return mutations +} + func (s *scyllaSink) effectiveMutationWorkers() int { if s.mutationWorkers <= 0 { return defaultScyllaMutationWorkers diff --git a/sei-db/state_db/ss/offload/consumer/scylla_test.go b/sei-db/state_db/ss/offload/consumer/scylla_test.go index 60db6acf9a..1c2d3b1645 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla_test.go +++ b/sei-db/state_db/ss/offload/consumer/scylla_test.go @@ -3,6 +3,7 @@ package consumer import ( "context" "strings" + "sync" "sync/atomic" "testing" "time" @@ -155,3 +156,51 @@ func TestScyllaSinkWritesRowsConcurrentlyBeforeVersionMarker(t *testing.T) { require.False(t, markerBeforeRowsDone.Load()) require.Equal(t, int32(1), versionMarkers.Load()) } + +func TestScyllaSinkCompactsDuplicateMutations(t *testing.T) { + type write struct { + value []byte + deleted bool + } + var mu sync.Mutex + writes := make(map[string]write) + sink := &scyllaSink{ + mutationWorkers: 1, + exec: func(_ context.Context, stmt string, values ...interface{}) error { + if !strings.Contains(stmt, "state_mutations") { + return nil + } + storeName := values[0].(string) + key := string(values[1].([]byte)) + value := values[3].([]byte) + deleted := values[4].(bool) + mu.Lock() + writes[storeName+"/"+key] = write{value: value, deleted: deleted} + mu.Unlock() + return nil + }, + } + entry := &proto.ChangelogEntry{ + Version: 9, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("k"), Value: []byte("old")}, + {Key: []byte("drop"), Value: []byte("present")}, + {Key: []byte("k"), Value: []byte("new")}, + {Key: []byte("drop"), Delete: true}, + }}, + }, { + Name: "evm", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("k"), Value: []byte("separate-store")}, + }}, + }}, + } + + require.NoError(t, sink.writeRecordRows(context.Background(), entry.Version, entry)) + require.Len(t, writes, 3) + require.Equal(t, 
write{value: []byte("new")}, writes["bank/k"]) + require.Equal(t, write{deleted: true}, writes["bank/drop"]) + require.Equal(t, write{value: []byte("separate-store")}, writes["evm/k"]) +} From 4a14131fc1d9b3e311819937293982e1a150fe48 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 13 May 2026 16:07:31 -0400 Subject: [PATCH 08/16] Pipeline Scylla batch row writes --- sei-db/state_db/ss/offload/consumer/scylla.go | 50 +++++++- .../ss/offload/consumer/scylla_test.go | 121 ++++++++++++++++++ 2 files changed, 166 insertions(+), 5 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/scylla.go b/sei-db/state_db/ss/offload/consumer/scylla.go index fe030c6901..bcf061c7b0 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla.go +++ b/sei-db/state_db/ss/offload/consumer/scylla.go @@ -120,12 +120,14 @@ func (s *scyllaSink) Write(ctx context.Context, rec Record) error { } func (s *scyllaSink) WriteBatch(ctx context.Context, records []Record) error { - for _, rec := range compactRecords(records) { - if err := s.writeRecord(ctx, rec); err != nil { - return err - } + records = compactRecords(records) + if len(records) == 0 { + return nil } - return nil + if len(records) == 1 { + return s.writeRecord(ctx, records[0]) + } + return s.writeRecordsPipelined(ctx, records) } func compactRecords(records []Record) []Record { @@ -152,6 +154,11 @@ func (s *scyllaSink) writeRecord(ctx context.Context, rec Record) error { if err := s.writeRecordRows(ctx, version, entry); err != nil { return err } + return s.writeVersionMarker(ctx, rec) +} + +func (s *scyllaSink) writeVersionMarker(ctx context.Context, rec Record) error { + version := rec.Entry.Version if err := s.exec(ctx, insertVersionCQL, historical.VersionBucket(version), version, @@ -165,6 +172,39 @@ func (s *scyllaSink) writeRecord(ctx context.Context, rec Record) error { return nil } +func (s *scyllaSink) writeRecordsPipelined(ctx context.Context, records []Record) error { + rowCtx, cancel := context.WithCancel(ctx) + defer 
cancel() + g, gctx := errgroup.WithContext(rowCtx) + rowDone := make([]chan error, len(records)) + for i := range records { + rowDone[i] = make(chan error, 1) + i := i + rec := records[i] + g.Go(func() error { + err := s.writeRecordRows(gctx, rec.Entry.Version, rec.Entry) + if err != nil { + err = fmt.Errorf("write scylla/cassandra rows version %d: %w", rec.Entry.Version, err) + } + rowDone[i] <- err + return err + }) + } + for i, rec := range records { + if err := <-rowDone[i]; err != nil { + cancel() + _ = g.Wait() + return err + } + if err := s.writeVersionMarker(ctx, rec); err != nil { + cancel() + _ = g.Wait() + return err + } + } + return g.Wait() +} + func (s *scyllaSink) writeRecordRows(ctx context.Context, version int64, entry *proto.ChangelogEntry) error { g, gctx := errgroup.WithContext(ctx) g.SetLimit(s.effectiveMutationWorkers()) diff --git a/sei-db/state_db/ss/offload/consumer/scylla_test.go b/sei-db/state_db/ss/offload/consumer/scylla_test.go index 1c2d3b1645..8516ed6e74 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla_test.go +++ b/sei-db/state_db/ss/offload/consumer/scylla_test.go @@ -204,3 +204,124 @@ func TestScyllaSinkCompactsDuplicateMutations(t *testing.T) { require.Equal(t, write{deleted: true}, writes["bank/drop"]) require.Equal(t, write{value: []byte("separate-store")}, writes["evm/k"]) } + +func TestScyllaSinkWriteBatchPipelinesRowsAndOrdersMarkers(t *testing.T) { + rowStarted := make(chan int64, 2) + markerWritten := make(chan int64, 2) + releaseRows := map[int64]chan struct{}{ + 1: make(chan struct{}), + 2: make(chan struct{}), + } + var activeRows atomic.Int32 + var sawConcurrentRows atomic.Bool + var mu sync.Mutex + rowsDone := make(map[int64]bool) + var markers []int64 + var markerBeforeRowsDone bool + + sink := &scyllaSink{ + mutationWorkers: 1, + exec: func(ctx context.Context, stmt string, values ...interface{}) error { + switch { + case strings.Contains(stmt, "state_mutations"): + version := values[2].(int64) + if 
activeRows.Add(1) > 1 { + sawConcurrentRows.Store(true) + } + rowStarted <- version + select { + case <-releaseRows[version]: + case <-ctx.Done(): + activeRows.Add(-1) + return ctx.Err() + } + activeRows.Add(-1) + mu.Lock() + rowsDone[version] = true + mu.Unlock() + return nil + case strings.Contains(stmt, "state_versions"): + version := values[1].(int64) + mu.Lock() + if !rowsDone[version] { + markerBeforeRowsDone = true + } + markers = append(markers, version) + mu.Unlock() + markerWritten <- version + return nil + default: + return nil + } + }, + } + records := []Record{ + { + Topic: "t", + Partition: 0, + Offset: 10, + Entry: &proto.ChangelogEntry{ + Version: 1, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{{Key: []byte("k1"), Value: []byte("v1")}}}, + }}, + }, + }, + { + Topic: "t", + Partition: 0, + Offset: 11, + Entry: &proto.ChangelogEntry{ + Version: 2, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{{Key: []byte("k2"), Value: []byte("v2")}}}, + }}, + }, + }, + } + + errCh := make(chan error, 1) + go func() { + errCh <- sink.WriteBatch(context.Background(), records) + }() + + started := map[int64]bool{} + for len(started) < 2 { + select { + case version := <-rowStarted: + started[version] = true + case <-time.After(time.Second): + t.Fatal("timed out waiting for pipelined row writes") + } + } + require.True(t, sawConcurrentRows.Load()) + + close(releaseRows[2]) + select { + case version := <-markerWritten: + t.Fatalf("marker %d written before earlier record rows completed", version) + case <-time.After(100 * time.Millisecond): + } + + close(releaseRows[1]) + for _, want := range []int64{1, 2} { + select { + case got := <-markerWritten: + require.Equal(t, want, got) + case <-time.After(time.Second): + t.Fatalf("timed out waiting for marker %d", want) + } + } + select { + case err := <-errCh: + require.NoError(t, err) + case 
<-time.After(time.Second): + t.Fatal("timed out waiting for batch write") + } + mu.Lock() + defer mu.Unlock() + require.False(t, markerBeforeRowsDone) + require.Equal(t, []int64{1, 2}, markers) +} From f11f0e813202aa8fe782bb0f77459cf0a45e1180 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 14 May 2026 13:10:18 -0400 Subject: [PATCH 09/16] Add Bigtable historical offload backend --- go.mod | 3 + go.sum | 6 + sei-db/config/ss_config.go | 9 + sei-db/config/toml.go | 10 + sei-db/config/toml_test.go | 4 + sei-db/state_db/ss/offload/consumer/README.md | 58 +- .../state_db/ss/offload/consumer/bigtable.go | 262 +++++++++ .../ss/offload/consumer/bigtable_test.go | 168 ++++++ .../cmd/historical-scylla-consumer/main.go | 4 +- sei-db/state_db/ss/offload/consumer/config.go | 38 +- .../consumer/config/example-bigtable.json | 22 + .../ss/offload/consumer/config_test.go | 33 ++ sei-db/state_db/ss/offload/consumer/scylla.go | 4 + .../ss/offload/historical/bigtable.go | 533 ++++++++++++++++++ .../ss/offload/historical/bigtable_test.go | 93 +++ sei-db/state_db/ss/store.go | 29 +- 16 files changed, 1259 insertions(+), 17 deletions(-) create mode 100644 sei-db/state_db/ss/offload/consumer/bigtable.go create mode 100644 sei-db/state_db/ss/offload/consumer/bigtable_test.go create mode 100644 sei-db/state_db/ss/offload/consumer/config/example-bigtable.json create mode 100644 sei-db/state_db/ss/offload/consumer/config_test.go create mode 100644 sei-db/state_db/ss/offload/historical/bigtable.go create mode 100644 sei-db/state_db/ss/offload/historical/bigtable_test.go diff --git a/go.mod b/go.mod index dae2d00b1a..6012f8f185 100644 --- a/go.mod +++ b/go.mod @@ -141,6 +141,8 @@ require ( ) require ( + cloud.google.com/go/bigtable v1.37.0 + cloud.google.com/go/compute/metadata v0.8.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/DataDog/zstd v1.5.7 // indirect @@ -281,6 +283,7 @@ require ( 
go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/image v0.36.0 golang.org/x/mod v0.32.0 + golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/telemetry v0.0.0-20260109210033-bd525da824e2 // indirect golang.org/x/term v0.39.0 // indirect golang.org/x/text v0.34.0 // indirect diff --git a/go.sum b/go.sum index b941802165..ad867b4147 100644 --- a/go.sum +++ b/go.sum @@ -125,6 +125,8 @@ cloud.google.com/go/bigquery v1.47.0/go.mod h1:sA9XOgy0A8vQK9+MWhEQTY6Tix87M/Zur cloud.google.com/go/bigquery v1.48.0/go.mod h1:QAwSz+ipNgfL5jxiaK7weyOhzdoAy1zFm0Nf1fysJac= cloud.google.com/go/bigquery v1.49.0/go.mod h1:Sv8hMmTFFYBlt/ftw2uN6dFdQPzBlREY9yBh7Oy7/4Q= cloud.google.com/go/bigquery v1.50.0/go.mod h1:YrleYEh2pSEbgTBZYMJ5SuSr0ML3ypjRB1zgf7pvQLU= +cloud.google.com/go/bigtable v1.37.0 h1:Q+x7y04lQ0B+WXp03wc1/FLhFt4CwcQdkwWT0M4Jp3w= +cloud.google.com/go/bigtable v1.37.0/go.mod h1:HXqddP6hduwzrtiTCqZPpj9ij4hGZb4Zy1WF/dT+yaU= cloud.google.com/go/billing v1.4.0/go.mod h1:g9IdKBEFlItS8bTtlrZdVLWSSdSyFUZKXNS02zKMOZY= cloud.google.com/go/billing v1.5.0/go.mod h1:mztb1tBc3QekhjSgmpf/CV4LzWXLzCArwpLmP2Gm88s= cloud.google.com/go/billing v1.6.0/go.mod h1:WoXzguj+BeHXPbKfNWkqVtDdzORazmCjraY+vrxcyvI= @@ -175,6 +177,8 @@ cloud.google.com/go/compute/metadata v0.1.0/go.mod h1:Z1VN+bulIf6bt4P/C37K4DyZYZ cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= cloud.google.com/go/compute/metadata v0.2.1/go.mod h1:jgHgmJd2RKBGzXqF5LR2EZMGxBkeanZ9wwa75XHJgOM= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= +cloud.google.com/go/compute/metadata v0.8.0 h1:HxMRIbao8w17ZX6wBnjhcDkW6lTFpgcaobyVfZWqRLA= +cloud.google.com/go/compute/metadata v0.8.0/go.mod h1:sYOGTp851OV9bOFJ9CH7elVvyzopvWQFNNghtDQ/Biw= cloud.google.com/go/contactcenterinsights v1.3.0/go.mod h1:Eu2oemoePuEFc/xKFPjbTuPSj0fYJcPls9TFlPNnHHY= cloud.google.com/go/contactcenterinsights v1.4.0/go.mod h1:L2YzkGbPsv+vMQMCADxJoT9YiTTnSEd6fEvCeHTYVck= 
cloud.google.com/go/contactcenterinsights v1.6.0/go.mod h1:IIDlT6CLcDoyv79kDv8iWxMSTZhLxSCofVV5W6YFM/w= @@ -2332,6 +2336,8 @@ golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/perf v0.0.0-20230113213139-801c7ef9e5c5/go.mod h1:UBKtEnL8aqnd+0JHqZ+2qoMDwtuy6cYhhKNoHLBiTQc= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/sei-db/config/ss_config.go b/sei-db/config/ss_config.go index 09fd69117f..ff24924727 100644 --- a/sei-db/config/ss_config.go +++ b/sei-db/config/ss_config.go @@ -97,6 +97,15 @@ type StateStoreConfig struct { // HistoricalOffloadScyllaTimeoutMS defaults in the Scylla reader when zero. HistoricalOffloadScyllaTimeoutMS int `mapstructure:"historical-offload-scylla-timeout-ms"` + + // HistoricalOffloadBigtableProjectID enables Bigtable fallback reads when + // project, instance, and table are set. 
+ HistoricalOffloadBigtableProjectID string `mapstructure:"historical-offload-bigtable-project-id"` + HistoricalOffloadBigtableInstance string `mapstructure:"historical-offload-bigtable-instance"` + HistoricalOffloadBigtableTable string `mapstructure:"historical-offload-bigtable-table"` + HistoricalOffloadBigtableFamily string `mapstructure:"historical-offload-bigtable-family"` + HistoricalOffloadBigtableAppProfile string `mapstructure:"historical-offload-bigtable-app-profile"` + HistoricalOffloadBigtableShards int `mapstructure:"historical-offload-bigtable-shards"` } // DefaultStateStoreConfig returns the default StateStoreConfig diff --git a/sei-db/config/toml.go b/sei-db/config/toml.go index 68e14fce7a..3c8343bd38 100644 --- a/sei-db/config/toml.go +++ b/sei-db/config/toml.go @@ -150,6 +150,16 @@ historical-offload-scylla-password = "{{ .StateStore.HistoricalOffloadScyllaPass historical-offload-scylla-datacenter = "{{ .StateStore.HistoricalOffloadScyllaDatacenter }}" historical-offload-scylla-consistency = "{{ .StateStore.HistoricalOffloadScyllaConsistency }}" historical-offload-scylla-timeout-ms = {{ .StateStore.HistoricalOffloadScyllaTimeoutMS }} + +# Optional Bigtable historical-state fallback. When project, instance, and +# table are set, point reads for versions pruned from local SS fall back to +# Bigtable. Use the same family/shards as the Bigtable consumer. 
+historical-offload-bigtable-project-id = "{{ .StateStore.HistoricalOffloadBigtableProjectID }}" +historical-offload-bigtable-instance = "{{ .StateStore.HistoricalOffloadBigtableInstance }}" +historical-offload-bigtable-table = "{{ .StateStore.HistoricalOffloadBigtableTable }}" +historical-offload-bigtable-family = "{{ .StateStore.HistoricalOffloadBigtableFamily }}" +historical-offload-bigtable-app-profile = "{{ .StateStore.HistoricalOffloadBigtableAppProfile }}" +historical-offload-bigtable-shards = {{ .StateStore.HistoricalOffloadBigtableShards }} ` // ReceiptStoreConfigTemplate defines the configuration template for receipt-store diff --git a/sei-db/config/toml_test.go b/sei-db/config/toml_test.go index b6bd267faf..ae7e4515d2 100644 --- a/sei-db/config/toml_test.go +++ b/sei-db/config/toml_test.go @@ -92,6 +92,10 @@ func TestStateStoreConfigTemplate(t *testing.T) { require.Contains(t, output, `historical-offload-scylla-keyspace = ""`, "Missing historical Scylla keyspace") require.Contains(t, output, `historical-offload-scylla-consistency = ""`, "Missing historical Scylla consistency") require.Contains(t, output, "historical-offload-scylla-timeout-ms = 0", "Missing historical Scylla timeout") + require.Contains(t, output, `historical-offload-bigtable-project-id = ""`, "Missing historical Bigtable project") + require.Contains(t, output, `historical-offload-bigtable-instance = ""`, "Missing historical Bigtable instance") + require.Contains(t, output, `historical-offload-bigtable-table = ""`, "Missing historical Bigtable table") + require.Contains(t, output, "historical-offload-bigtable-shards = 0", "Missing historical Bigtable shards") } // TestReceiptStoreConfigTemplate verifies that all field paths in the receipt-store TOML template diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md index 185920d783..7320f3fde4 100644 --- a/sei-db/state_db/ss/offload/consumer/README.md +++ 
b/sei-db/state_db/ss/offload/consumer/README.md @@ -1,14 +1,16 @@ -# Historical Scylla/Cassandra Offload +# Historical State Offload -This is a prototype historical-state backend for ScyllaDB or Cassandra. +This is a prototype historical-state backend for ScyllaDB/Cassandra and +Bigtable. The intended shape is narrow: - local SS remains the hot store for recent state, writes, imports, pruning, and iterators -- Scylla/Cassandra stores immutable MVCC mutation rows for older history -- reads below local SS retention can fall back to Scylla/Cassandra for `Get` and `Has` +- the downstream store keeps immutable MVCC mutation rows for older history +- reads below local SS retention can fall back to the downstream store for `Get` and `Has` -The table layout is built for point reads by `(store_name, state_key, target_version)`: +The Scylla table layout is built for point reads by +`(store_name, state_key, target_version)`: ```sql SELECT version, value, deleted @@ -18,7 +20,17 @@ ORDER BY version DESC LIMIT 1; ``` -Ordered prefix iteration is intentionally not served from Scylla/Cassandra in this prototype. +Bigtable uses salted row keys with an inverted height suffix: + +```text +m | shard(store,key) | store_name | state_key | inverted_height +``` + +Reads scan from `inverted(target_height)` and stop after the first row, giving +the latest write at or before the requested height. + +Ordered prefix iteration is intentionally not served from the offload store in +this prototype. ## Schema @@ -35,17 +47,27 @@ factors before applying it. ## Consumer The consumer reads historical offload changelog messages from Kafka and writes -them into Scylla/Cassandra. Kafka offsets are committed only after the sink -write succeeds. Within each block, mutation rows are written with bounded -concurrency and the version marker is written last. +them into the configured backend. Kafka offsets are committed only after the +sink write succeeds. 
Mutation rows are written with bounded concurrency and the +version marker is written last. ```bash go run ./sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer \ ./sei-db/state_db/ss/offload/consumer/config/example-scylla.json ``` -The example config is local-dev only. Set real Kafka brokers, Scylla hosts, -keyspace, datacenter, and credentials in your own config. +For Bigtable: + +```bash +cbt -project my-gcp-project -instance sei-history createtable state_mutations +cbt -project my-gcp-project -instance sei-history createfamily state_mutations state + +go run ./sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer \ + ./sei-db/state_db/ss/offload/consumer/config/example-bigtable.json +``` + +The example configs are local/dev placeholders. Set real Kafka brokers and +backend credentials/config in your own config. ## Node Read Fallback @@ -62,13 +84,25 @@ historical-offload-scylla-consistency = "local_quorum" historical-offload-scylla-timeout-ms = 2000 ``` +Or Bigtable: + +```toml +[state-store] +historical-offload-bigtable-project-id = "my-gcp-project" +historical-offload-bigtable-instance = "sei-history" +historical-offload-bigtable-table = "state_mutations" +historical-offload-bigtable-family = "state" +historical-offload-bigtable-app-profile = "" +historical-offload-bigtable-shards = 256 +``` + Fallback activates only for point reads where the requested version is below the local SS earliest version. Missing rows and tombstones return empty state, same as local SS. ## Current Limits -- No Scylla/Cassandra iterator path. +- No offload iterator path. - No cross-row transaction on ingest; mutation rows are written first and the version marker is written last, so replay is idempotent after partial failure. - No automatic schema creation from the binary. 
diff --git a/sei-db/state_db/ss/offload/consumer/bigtable.go b/sei-db/state_db/ss/offload/consumer/bigtable.go new file mode 100644 index 0000000000..811786dcda --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/bigtable.go @@ -0,0 +1,262 @@ +package consumer + +import ( + "context" + "fmt" + "time" + + "golang.org/x/sync/errgroup" + + "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" +) + +const defaultBigtableMutationWorkers = 16 + +type BigtableConfig struct { + ProjectID string + InstanceID string + Table string + Family string + AppProfile string + Shards int + MutationWorkers int +} + +func (c *BigtableConfig) ApplyDefaults() { + cfg := c.toHistorical() + cfg.ApplyDefaults() + c.Family = cfg.Family + c.Shards = cfg.Shards + if c.MutationWorkers == 0 { + c.MutationWorkers = defaultBigtableMutationWorkers + } +} + +func (c BigtableConfig) Configured() bool { + return c.toHistorical().Configured() +} + +func (c *BigtableConfig) Validate() error { + cfg := c.toHistorical() + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return err + } + if c.MutationWorkers < 0 { + return fmt.Errorf("bigtable mutation workers must be non-negative") + } + return nil +} + +func (c BigtableConfig) toHistorical() historical.BigtableConfig { + return historical.BigtableConfig{ + ProjectID: c.ProjectID, + InstanceID: c.InstanceID, + Table: c.Table, + Family: c.Family, + AppProfile: c.AppProfile, + Shards: c.Shards, + } +} + +type bigtableSink struct { + client *historical.BigtableClient + applyBulk historical.BigtableApplyBulkFunc + readRows historical.BigtableReadRowsFunc + family string + shards int + mutationWorkers int +} + +var _ Sink = (*bigtableSink)(nil) +var _ BatchSink = (*bigtableSink)(nil) + +func NewBigtableSink(cfg BigtableConfig) (Sink, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } + ctx := context.Background() + client, err := 
historical.OpenBigtableClient(ctx, cfg.toHistorical()) + if err != nil { + return nil, err + } + return &bigtableSink{ + client: client, + applyBulk: client.ApplyBulk, + readRows: client.ReadRows, + family: cfg.Family, + shards: cfg.Shards, + mutationWorkers: cfg.MutationWorkers, + }, nil +} + +func (s *bigtableSink) Close() error { + if s.client != nil { + return s.client.Close() + } + return nil +} + +func (s *bigtableSink) LastVersion(ctx context.Context) (int64, error) { + return historical.BigtableLastVersion(ctx, s.readRows) +} + +func (s *bigtableSink) Write(ctx context.Context, rec Record) error { + return s.WriteBatch(ctx, []Record{rec}) +} + +func (s *bigtableSink) WriteBatch(ctx context.Context, records []Record) error { + records = compactRecords(records) + if len(records) == 0 { + return nil + } + if len(records) == 1 { + return s.writeRecord(ctx, records[0]) + } + return s.writeRecordsPipelined(ctx, records) +} + +func (s *bigtableSink) writeRecord(ctx context.Context, rec Record) error { + if rec.Entry == nil { + return nil + } + if err := s.writeRecordRows(ctx, rec.Entry.Version, rec.Entry); err != nil { + return err + } + return s.writeVersionMarker(ctx, rec) +} + +func (s *bigtableSink) writeRecordsPipelined(ctx context.Context, records []Record) error { + rowCtx, cancel := context.WithCancel(ctx) + defer cancel() + g, gctx := errgroup.WithContext(rowCtx) + g.SetLimit(s.effectiveMutationWorkers()) + rowDone := make([]chan error, len(records)) + for i := range records { + rowDone[i] = make(chan error, 1) + i := i + rec := records[i] + g.Go(func() error { + err := s.writeRecordRows(gctx, rec.Entry.Version, rec.Entry) + if err != nil { + err = fmt.Errorf("write bigtable rows version %d: %w", rec.Entry.Version, err) + } + rowDone[i] <- err + return err + }) + } + for i, rec := range records { + if err := <-rowDone[i]; err != nil { + cancel() + _ = g.Wait() + return err + } + if err := s.writeVersionMarker(ctx, rec); err != nil { + cancel() + _ = 
g.Wait() + return err + } + } + return g.Wait() +} + +func (s *bigtableSink) writeRecordRows(ctx context.Context, version int64, entry *proto.ChangelogEntry) error { + rows := s.recordRowMutations(version, entry) + if len(rows) == 0 { + return nil + } + errs, err := s.applyBulk(ctx, rows) + return bigtableBulkError(rows, errs, err) +} + +func (s *bigtableSink) recordRowMutations(version int64, entry *proto.ChangelogEntry) []historical.BigtableRowMutation { + rows := make([]historical.BigtableRowMutation, 0) + for _, mutation := range compactMutations(entry) { + rows = append(rows, s.mutationRow(version, mutation.storeName, mutation.pair)) + } + for _, up := range entry.Upgrades { + rows = append(rows, s.upgradeRow(version, up)) + } + return rows +} + +func (s *bigtableSink) mutationRow(version int64, storeName string, pair *proto.KVPair) historical.BigtableRowMutation { + ts := historical.BigtableTimestamp(version) + deleted := pair.Delete || pair.Value == nil + row := historical.BigtableRowMutation{ + RowKey: historical.BigtableMutationRowKey(storeName, pair.Key, version, s.shards), + } + if !deleted { + row.SetCells = append(row.SetCells, historical.BigtableSetCell{ + Family: s.family, + Qualifier: historical.BigtableValueColumn, + TimestampMicros: ts, + Value: pair.Value, + }) + } + row.SetCells = append(row.SetCells, historical.BigtableSetCell{ + Family: s.family, + Qualifier: historical.BigtableDeletedColumn, + TimestampMicros: ts, + Value: boolByte(deleted), + }) + return row +} + +func (s *bigtableSink) upgradeRow(version int64, up *proto.TreeNameUpgrade) historical.BigtableRowMutation { + ts := historical.BigtableTimestamp(version) + return historical.BigtableRowMutation{ + RowKey: historical.BigtableUpgradeRowKey(version, up.Name), + SetCells: []historical.BigtableSetCell{ + {Family: s.family, Qualifier: "rename_from", TimestampMicros: ts, Value: []byte(up.RenameFrom)}, + {Family: s.family, Qualifier: historical.BigtableDeletedColumn, TimestampMicros: ts, 
Value: boolByte(up.Delete)}, + }, + } +} + +func (s *bigtableSink) writeVersionMarker(ctx context.Context, rec Record) error { + version := rec.Entry.Version + ts := historical.BigtableTimestamp(version) + row := historical.BigtableRowMutation{ + RowKey: historical.BigtableVersionRowKey(version), + SetCells: []historical.BigtableSetCell{ + {Family: s.family, Qualifier: "topic", TimestampMicros: ts, Value: []byte(rec.Topic)}, + {Family: s.family, Qualifier: "partition", TimestampMicros: ts, Value: []byte(fmt.Sprintf("%d", rec.Partition))}, + {Family: s.family, Qualifier: "offset", TimestampMicros: ts, Value: []byte(fmt.Sprintf("%d", rec.Offset))}, + {Family: s.family, Qualifier: "ingested_at_unix_nano", TimestampMicros: ts, Value: []byte(fmt.Sprintf("%d", time.Now().UnixNano()))}, + }, + } + errs, err := s.applyBulk(ctx, []historical.BigtableRowMutation{row}) + if err := bigtableBulkError([]historical.BigtableRowMutation{row}, errs, err); err != nil { + return fmt.Errorf("insert bigtable version %d: %w", version, err) + } + return nil +} + +func (s *bigtableSink) effectiveMutationWorkers() int { + if s.mutationWorkers <= 0 { + return defaultBigtableMutationWorkers + } + return s.mutationWorkers +} + +func bigtableBulkError(rows []historical.BigtableRowMutation, errs []error, err error) error { + if err != nil { + return err + } + for i, rowErr := range errs { + if rowErr != nil { + return fmt.Errorf("row %q: %w", rows[i].RowKey, rowErr) + } + } + return nil +} + +func boolByte(v bool) []byte { + if v { + return []byte{1} + } + return []byte{0} +} diff --git a/sei-db/state_db/ss/offload/consumer/bigtable_test.go b/sei-db/state_db/ss/offload/consumer/bigtable_test.go new file mode 100644 index 0000000000..d8903798b3 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/bigtable_test.go @@ -0,0 +1,168 @@ +package consumer + +import ( + "context" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/sei-protocol/sei-chain/sei-db/proto" + 
"github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" + "github.com/stretchr/testify/require" +) + +func TestBigtableConfigApplyDefaults(t *testing.T) { + cfg := BigtableConfig{ + ProjectID: "project", + InstanceID: "instance", + Table: "state", + } + cfg.ApplyDefaults() + require.Equal(t, historical.DefaultBigtableFamily, cfg.Family) + require.Equal(t, historical.DefaultBigtableShards, cfg.Shards) + require.Equal(t, defaultBigtableMutationWorkers, cfg.MutationWorkers) + require.NoError(t, cfg.Validate()) +} + +func TestBigtableSinkWritesMutationRowsAndVersionMarker(t *testing.T) { + var rows []string + sink := &bigtableSink{ + family: historical.DefaultBigtableFamily, + shards: historical.DefaultBigtableShards, + applyBulk: func(_ context.Context, mutations []historical.BigtableRowMutation) ([]error, error) { + for _, mutation := range mutations { + rows = append(rows, mutation.RowKey) + } + return make([]error, len(mutations)), nil + }, + } + entry := &proto.ChangelogEntry{ + Version: 7, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{ + {Key: []byte("k1"), Value: []byte("old")}, + {Key: []byte("k1"), Value: []byte("new")}, + {Key: []byte("drop"), Delete: true}, + }}, + }}, + Upgrades: []*proto.TreeNameUpgrade{{Name: "new-store"}}, + } + + require.NoError(t, sink.Write(context.Background(), Record{Topic: "t", Partition: 1, Offset: 2, Entry: entry})) + require.Len(t, rows, 4) + require.Equal(t, historical.BigtableMutationRowKey("bank", []byte("k1"), 7, historical.DefaultBigtableShards), rows[0]) + require.Equal(t, historical.BigtableMutationRowKey("bank", []byte("drop"), 7, historical.DefaultBigtableShards), rows[1]) + require.Equal(t, historical.BigtableUpgradeRowKey(7, "new-store"), rows[2]) + require.Equal(t, historical.BigtableVersionRowKey(7), rows[3]) +} + +func TestBigtableSinkWriteBatchPipelinesRowsAndOrdersMarkers(t *testing.T) { + rowStarted := make(chan int64, 2) + 
markerWritten := make(chan int64, 2) + releaseRows := map[int64]chan struct{}{ + 1: make(chan struct{}), + 2: make(chan struct{}), + } + var activeRows atomic.Int32 + var sawConcurrentRows atomic.Bool + var mu sync.Mutex + rowsDone := make(map[int64]bool) + var markers []int64 + var markerBeforeRowsDone bool + + sink := &bigtableSink{ + family: historical.DefaultBigtableFamily, + shards: historical.DefaultBigtableShards, + mutationWorkers: 2, + applyBulk: func(ctx context.Context, mutations []historical.BigtableRowMutation) ([]error, error) { + version, ok := historical.BigtableVersionFromRowKey(mutations[0].RowKey) + require.True(t, ok) + if mutations[0].RowKey == historical.BigtableVersionRowKey(version) { + mu.Lock() + if !rowsDone[version] { + markerBeforeRowsDone = true + } + markers = append(markers, version) + mu.Unlock() + markerWritten <- version + return make([]error, len(mutations)), nil + } + if activeRows.Add(1) > 1 { + sawConcurrentRows.Store(true) + } + rowStarted <- version + select { + case <-releaseRows[version]: + case <-ctx.Done(): + activeRows.Add(-1) + return nil, ctx.Err() + } + activeRows.Add(-1) + mu.Lock() + rowsDone[version] = true + mu.Unlock() + return make([]error, len(mutations)), nil + }, + } + records := []Record{ + {Entry: &proto.ChangelogEntry{ + Version: 1, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{{Key: []byte("k1"), Value: []byte("v1")}}}, + }}, + }}, + {Entry: &proto.ChangelogEntry{ + Version: 2, + Changesets: []*proto.NamedChangeSet{{ + Name: "bank", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{{Key: []byte("k2"), Value: []byte("v2")}}}, + }}, + }}, + } + + errCh := make(chan error, 1) + go func() { + errCh <- sink.WriteBatch(context.Background(), records) + }() + + started := map[int64]bool{} + for len(started) < 2 { + select { + case version := <-rowStarted: + started[version] = true + case <-time.After(time.Second): + t.Fatal("timed out waiting for 
pipelined row writes") + } + } + require.True(t, sawConcurrentRows.Load()) + + close(releaseRows[2]) + select { + case version := <-markerWritten: + t.Fatalf("marker %d written before earlier record rows completed", version) + case <-time.After(100 * time.Millisecond): + } + + close(releaseRows[1]) + for _, want := range []int64{1, 2} { + select { + case got := <-markerWritten: + require.Equal(t, want, got) + case <-time.After(time.Second): + t.Fatalf("timed out waiting for marker %d", want) + } + } + select { + case err := <-errCh: + require.NoError(t, err) + case <-time.After(time.Second): + t.Fatal("timed out waiting for batch write") + } + mu.Lock() + defer mu.Unlock() + require.False(t, markerBeforeRowsDone) + require.Equal(t, []int64{1, 2}, markers) +} diff --git a/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go b/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go index 65020ca354..60dc2fc241 100644 --- a/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go +++ b/sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer/main.go @@ -23,9 +23,9 @@ func main() { log.Fatalf("load config: %v", err) } - sink, err := consumer.NewScyllaSink(cfg.Scylla) + sink, err := consumer.NewSinkFromConfig(*cfg) if err != nil { - log.Fatalf("open scylla/cassandra sink: %v", err) + log.Fatalf("open %s sink: %v", cfg.BackendName(), err) } defer func() { _ = sink.Close() }() diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go index f381b2c11f..eceb801fb0 100644 --- a/sei-db/state_db/ss/offload/consumer/config.go +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -4,11 +4,14 @@ import ( "encoding/json" "fmt" "os" + "strings" ) type Config struct { + Backend string Kafka KafkaReaderConfig Scylla ScyllaConfig + Bigtable BigtableConfig Workers int ShardBufferSize int MaxBatchRecords int @@ -19,8 +22,17 @@ func (c *Config) Validate() error { if err 
:= c.Kafka.Validate(); err != nil { return fmt.Errorf("kafka: %w", err) } - if err := c.Scylla.Validate(); err != nil { - return fmt.Errorf("scylla: %w", err) + switch c.BackendName() { + case "scylla": + if err := c.Scylla.Validate(); err != nil { + return fmt.Errorf("scylla: %w", err) + } + case "bigtable": + if err := c.Bigtable.Validate(); err != nil { + return fmt.Errorf("bigtable: %w", err) + } + default: + return fmt.Errorf("unsupported backend %q", c.Backend) } if c.Workers < 0 { return fmt.Errorf("workers must be non-negative") @@ -37,6 +49,28 @@ func (c *Config) Validate() error { return nil } +func (c *Config) BackendName() string { + backend := strings.ToLower(strings.TrimSpace(c.Backend)) + if backend != "" { + return backend + } + if c.Bigtable.Configured() && !c.Scylla.Configured() { + return "bigtable" + } + return "scylla" +} + +func NewSinkFromConfig(cfg Config) (Sink, error) { + switch cfg.BackendName() { + case "scylla": + return NewScyllaSink(cfg.Scylla) + case "bigtable": + return NewBigtableSink(cfg.Bigtable) + default: + return nil, fmt.Errorf("unsupported backend %q", cfg.Backend) + } +} + func LoadConfig(path string) (*Config, error) { // #nosec G304 -- config path is supplied by the operator on the command line. 
raw, err := os.ReadFile(path) diff --git a/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json b/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json new file mode 100644 index 0000000000..8af5152b35 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json @@ -0,0 +1,22 @@ +{ + "Backend": "bigtable", + "Kafka": { + "Brokers": ["localhost:9092"], + "Topic": "historical-offload", + "GroupID": "historical-bigtable", + "StartOffset": "first" + }, + "Bigtable": { + "ProjectID": "my-gcp-project", + "InstanceID": "sei-history", + "Table": "state_mutations", + "Family": "state", + "AppProfile": "", + "Shards": 256, + "MutationWorkers": 16 + }, + "Workers": 16, + "ShardBufferSize": 128, + "MaxBatchRecords": 16, + "BatchMaxWaitMS": 10 +} diff --git a/sei-db/state_db/ss/offload/consumer/config_test.go b/sei-db/state_db/ss/offload/consumer/config_test.go new file mode 100644 index 0000000000..6e335bd3cd --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config_test.go @@ -0,0 +1,33 @@ +package consumer + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestConfigBackendName(t *testing.T) { + require.Equal(t, "scylla", (&Config{}).BackendName()) + require.Equal(t, "bigtable", (&Config{Bigtable: BigtableConfig{ProjectID: "p"}}).BackendName()) + require.Equal(t, "scylla", (&Config{Backend: "Scylla", Bigtable: BigtableConfig{ProjectID: "p"}}).BackendName()) +} + +func TestConfigValidateBigtable(t *testing.T) { + cfg := Config{ + Backend: "bigtable", + Kafka: KafkaReaderConfig{ + Brokers: []string{"localhost:9092"}, + Topic: "historical-offload", + GroupID: "historical-bigtable", + }, + Bigtable: BigtableConfig{ + ProjectID: "project", + InstanceID: "instance", + Table: "state", + }, + } + require.NoError(t, cfg.Validate()) + + cfg.Bigtable.Table = "" + require.ErrorContains(t, cfg.Validate(), "bigtable") +} diff --git a/sei-db/state_db/ss/offload/consumer/scylla.go 
b/sei-db/state_db/ss/offload/consumer/scylla.go index bcf061c7b0..babec955e7 100644 --- a/sei-db/state_db/ss/offload/consumer/scylla.go +++ b/sei-db/state_db/ss/offload/consumer/scylla.go @@ -27,6 +27,10 @@ type ScyllaConfig struct { MutationWorkers int } +func (c ScyllaConfig) Configured() bool { + return len(c.Hosts) != 0 || c.Keyspace != "" +} + func (c *ScyllaConfig) ApplyDefaults() { cfg := c.toHistorical() cfg.ApplyDefaults() diff --git a/sei-db/state_db/ss/offload/historical/bigtable.go b/sei-db/state_db/ss/offload/historical/bigtable.go new file mode 100644 index 0000000000..c1f075e4ed --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/bigtable.go @@ -0,0 +1,533 @@ +package historical + +import ( + "context" + "encoding/binary" + "fmt" + "hash/fnv" + "io" + "regexp" + "strings" + "sync" + "time" + + "cloud.google.com/go/bigtable/apiv2/bigtablepb" + "golang.org/x/sync/errgroup" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "google.golang.org/grpc/credentials/oauth" +) + +const ( + DefaultBigtableFamily = "state" + DefaultBigtableShards = 256 + + defaultBigtableReadWorkers = 16 + + bigtableMutationPrefix = byte('m') + bigtableVersionPrefix = byte('v') + bigtableUpgradePrefix = byte('u') + + BigtableValueColumn = "value" + BigtableDeletedColumn = "deleted" +) + +const bigtableEndpoint = "bigtable.googleapis.com:443" + +type BigtableClient struct { + conn *grpc.ClientConn + data bigtablepb.BigtableClient + tableName string + appProfile string +} + +type BigtableCell struct { + Family string + Qualifier string + Value []byte +} + +type BigtableRow struct { + Key string + Cells []BigtableCell +} + +type BigtableSetCell struct { + Family string + Qualifier string + TimestampMicros int64 + Value []byte +} + +type BigtableRowMutation struct { + RowKey string + SetCells []BigtableSetCell +} + +type BigtableConfig struct { + ProjectID string + InstanceID string + Table string + Family string + AppProfile string + Shards int +} + +func (c 
*BigtableConfig) ApplyDefaults() { + if c.Family == "" { + c.Family = DefaultBigtableFamily + } + if c.Shards == 0 { + c.Shards = DefaultBigtableShards + } +} + +func (c BigtableConfig) Configured() bool { + return strings.TrimSpace(c.ProjectID) != "" || + strings.TrimSpace(c.InstanceID) != "" || + strings.TrimSpace(c.Table) != "" +} + +func (c *BigtableConfig) Validate() error { + if strings.TrimSpace(c.ProjectID) == "" { + return fmt.Errorf("bigtable project id is required") + } + if strings.TrimSpace(c.InstanceID) == "" { + return fmt.Errorf("bigtable instance id is required") + } + if strings.TrimSpace(c.Table) == "" { + return fmt.Errorf("bigtable table is required") + } + if strings.TrimSpace(c.Family) == "" { + return fmt.Errorf("bigtable family is required") + } + if c.Shards < 0 || c.Shards > 65535 { + return fmt.Errorf("bigtable shards must be between 1 and 65535") + } + return nil +} + +func NewBigtableReader(cfg BigtableConfig) (Reader, error) { + ctx := context.Background() + client, err := OpenBigtableClient(ctx, cfg) + if err != nil { + return nil, err + } + cfg.ApplyDefaults() + return &bigtableReader{ + client: client, + readRows: client.ReadRows, + family: cfg.Family, + shards: cfg.Shards, + }, nil +} + +func OpenBigtableClient(ctx context.Context, cfg BigtableConfig) (*BigtableClient, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } + perRPC, err := oauth.NewApplicationDefault(ctx, "https://www.googleapis.com/auth/bigtable.data", "https://www.googleapis.com/auth/cloud-platform") + if err != nil { + return nil, fmt.Errorf("bigtable auth: %w", err) + } + conn, err := grpc.DialContext(ctx, bigtableEndpoint, + grpc.WithTransportCredentials(credentials.NewClientTLSFromCert(nil, "")), + grpc.WithPerRPCCredentials(perRPC), + ) + if err != nil { + return nil, fmt.Errorf("open bigtable connection: %w", err) + } + return &BigtableClient{ + conn: conn, + data: bigtablepb.NewBigtableClient(conn), + tableName: 
bigtableTableName(cfg.ProjectID, cfg.InstanceID, cfg.Table), + appProfile: cfg.AppProfile, + }, nil +} + +type BigtableReadRowsFunc func(ctx context.Context, startKey, endKey []byte, limit int64, family string, f func(BigtableRow) bool) error + +type BigtableApplyBulkFunc func(ctx context.Context, rows []BigtableRowMutation) ([]error, error) + +func (c *BigtableClient) Close() error { + if c.conn == nil { + return nil + } + return c.conn.Close() +} + +func (c *BigtableClient) ReadRows(ctx context.Context, startKey, endKey []byte, limit int64, family string, f func(BigtableRow) bool) error { + req := &bigtablepb.ReadRowsRequest{ + TableName: c.tableName, + AppProfileId: c.appProfile, + Rows: &bigtablepb.RowSet{RowRanges: []*bigtablepb.RowRange{{ + StartKey: &bigtablepb.RowRange_StartKeyClosed{StartKeyClosed: startKey}, + EndKey: &bigtablepb.RowRange_EndKeyOpen{EndKeyOpen: endKey}, + }}}, + RowsLimit: limit, + } + if len(endKey) == 0 { + req.Rows.RowRanges[0].EndKey = nil + } + if family != "" { + req.Filter = &bigtablepb.RowFilter{ + Filter: &bigtablepb.RowFilter_FamilyNameRegexFilter{FamilyNameRegexFilter: regexp.QuoteMeta(family)}, + } + } + stream, err := c.data.ReadRows(ctx, req) + if err != nil { + return err + } + var builder bigtableRowBuilder + for { + resp, err := stream.Recv() + if err == io.EOF { + return nil + } + if err != nil { + return err + } + for _, chunk := range resp.Chunks { + row, committed, err := builder.add(chunk) + if err != nil { + return err + } + if committed && !f(row) { + return nil + } + } + } +} + +func (c *BigtableClient) ApplyBulk(ctx context.Context, rows []BigtableRowMutation) ([]error, error) { + if len(rows) == 0 { + return nil, nil + } + entries := make([]*bigtablepb.MutateRowsRequest_Entry, 0, len(rows)) + for _, row := range rows { + entry := &bigtablepb.MutateRowsRequest_Entry{RowKey: []byte(row.RowKey)} + for _, cell := range row.SetCells { + entry.Mutations = append(entry.Mutations, &bigtablepb.Mutation{ + Mutation: 
&bigtablepb.Mutation_SetCell_{SetCell: &bigtablepb.Mutation_SetCell{ + FamilyName: cell.Family, + ColumnQualifier: []byte(cell.Qualifier), + TimestampMicros: cell.TimestampMicros, + Value: cell.Value, + }}, + }) + } + entries = append(entries, entry) + } + stream, err := c.data.MutateRows(ctx, &bigtablepb.MutateRowsRequest{ + TableName: c.tableName, + AppProfileId: c.appProfile, + Entries: entries, + }) + if err != nil { + return nil, err + } + rowErrs := make([]error, len(rows)) + seen := make([]bool, len(rows)) + for { + resp, err := stream.Recv() + if err == io.EOF { + break + } + if err != nil { + return rowErrs, err + } + for _, entry := range resp.Entries { + if entry.Index < 0 || int(entry.Index) >= len(rowErrs) { + return rowErrs, fmt.Errorf("bigtable returned invalid mutation index %d", entry.Index) + } + idx := int(entry.Index) + seen[idx] = true + if st := entry.Status; st != nil && st.Code != 0 { + rowErrs[idx] = fmt.Errorf("bigtable status %d: %s", st.Code, st.Message) + } + } + } + for i := range seen { + if !seen[i] { + rowErrs[i] = fmt.Errorf("bigtable missing mutation result") + } + } + return rowErrs, nil +} + +type bigtableReader struct { + client *BigtableClient + readRows BigtableReadRowsFunc + family string + shards int +} + +var _ Reader = (*bigtableReader)(nil) + +func (r *bigtableReader) Close() error { + if r.client != nil { + return r.client.Close() + } + return nil +} + +func (r *bigtableReader) LastVersion(ctx context.Context) (int64, error) { + return BigtableLastVersion(ctx, r.readRows) +} + +func (r *bigtableReader) Has(ctx context.Context, storeName string, key []byte, targetVersion int64) (bool, error) { + _, err := r.Get(ctx, storeName, key, targetVersion) + if err != nil { + if err == ErrNotFound { + return false, nil + } + return false, err + } + return true, nil +} + +func (r *bigtableReader) Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) { + prefix := 
[]byte(BigtableMutationRowPrefix(storeName, key, r.shards)) + start := []byte(BigtableMutationRowKey(storeName, key, targetVersion, r.shards)) + var row BigtableRow + err := r.readRows(ctx, start, bigtablePrefixEnd(prefix), 1, r.family, func(r BigtableRow) bool { + row = r + return false + }) + if err != nil { + return Value{}, fmt.Errorf("bigtable get lookup: %w", err) + } + if row.Key == "" { + return Value{}, ErrNotFound + } + return BigtableValueFromRow(row, r.family) +} + +func (r *bigtableReader) BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) { + out := make(map[Lookup]Value, len(lookups)) + g, gctx := errgroup.WithContext(ctx) + g.SetLimit(defaultBigtableReadWorkers) + var mu sync.Mutex + for _, lookup := range lookups { + lookup := lookup + g.Go(func() error { + value, err := r.Get(gctx, lookup.StoreName, []byte(lookup.Key), targetVersion) + if err != nil { + if err == ErrNotFound { + return nil + } + return err + } + mu.Lock() + out[lookup] = value + mu.Unlock() + return nil + }) + } + if err := g.Wait(); err != nil { + return nil, err + } + return out, nil +} + +func BigtableLastVersion(ctx context.Context, readRows BigtableReadRowsFunc) (int64, error) { + var maxVersion int64 + for bucket := 0; bucket < VersionBucketCount; bucket++ { + prefix := bigtableVersionRowPrefix(bucket) + err := readRows(ctx, prefix, bigtablePrefixEnd(prefix), 1, "", func(row BigtableRow) bool { + version, ok := BigtableVersionFromRowKey(row.Key) + if ok && version > maxVersion { + maxVersion = version + } + return false + }) + if err != nil { + return 0, fmt.Errorf("read latest bigtable version bucket %d: %w", bucket, err) + } + } + return maxVersion, nil +} + +func BigtableValueFromRow(row BigtableRow, family string) (Value, error) { + version, ok := BigtableVersionFromRowKey(row.Key) + if !ok { + return Value{}, fmt.Errorf("invalid bigtable mutation row key") + } + var value []byte + deleted := false + for _, cell := range 
row.Cells { + if cell.Family != family { + continue + } + switch cell.Qualifier { + case BigtableValueColumn: + value = append([]byte(nil), cell.Value...) + case BigtableDeletedColumn: + deleted = len(cell.Value) > 0 && cell.Value[0] == 1 + } + } + if deleted || value == nil { + return Value{}, ErrNotFound + } + return Value{Bytes: value, Version: version}, nil +} + +func BigtableMutationRowPrefix(storeName string, key []byte, shards int) string { + shards = normalizeBigtableShards(shards) + shard := bigtableShard(storeName, key, shards) + prefix := make([]byte, 1+2+2+len(storeName)+4+len(key)) + prefix[0] = bigtableMutationPrefix + binary.BigEndian.PutUint16(prefix[1:], uint16(shard)) + binary.BigEndian.PutUint16(prefix[3:], uint16(len(storeName))) + copy(prefix[5:], storeName) + keyOffset := 5 + len(storeName) + binary.BigEndian.PutUint32(prefix[keyOffset:], uint32(len(key))) + copy(prefix[keyOffset+4:], key) + return string(prefix) +} + +func BigtableMutationRowKey(storeName string, key []byte, version int64, shards int) string { + prefix := []byte(BigtableMutationRowPrefix(storeName, key, shards)) + return string(append(prefix, bigtableInvertedVersion(version)...)) +} + +func BigtableVersionRowKey(version int64) string { + prefix := bigtableVersionRowPrefix(VersionBucket(version)) + return string(append(prefix, bigtableInvertedVersion(version)...)) +} + +func BigtableUpgradeRowKey(version int64, name string) string { + key := make([]byte, 1+8+2+len(name)) + key[0] = bigtableUpgradePrefix + copy(key[1:], bigtableInvertedVersion(version)) + binary.BigEndian.PutUint16(key[9:], uint16(len(name))) + copy(key[11:], name) + return string(key) +} + +func BigtableVersionFromRowKey(rowKey string) (int64, bool) { + key := []byte(rowKey) + switch { + case len(key) >= 1+2+8 && key[0] == bigtableVersionPrefix: + return bigtableDecodeInvertedVersion(key[3:11]), true + case len(key) >= 8 && key[0] == bigtableMutationPrefix: + return 
bigtableDecodeInvertedVersion(key[len(key)-8:]), true + default: + return 0, false + } +} + +func BigtableTimestamp(version int64) int64 { + return version * int64(time.Millisecond/time.Microsecond) +} + +type bigtableRowBuilder struct { + key []byte + family string + qualifier string + value []byte + inCell bool + cells []BigtableCell +} + +func (b *bigtableRowBuilder) add(chunk *bigtablepb.ReadRowsResponse_CellChunk) (BigtableRow, bool, error) { + if chunk.GetResetRow() { + b.reset() + return BigtableRow{}, false, nil + } + if len(chunk.RowKey) != 0 { + b.key = append(b.key[:0], chunk.RowKey...) + b.family = "" + b.qualifier = "" + b.value = nil + b.inCell = false + b.cells = b.cells[:0] + } + if chunk.FamilyName != nil { + b.family = chunk.FamilyName.Value + } + if chunk.Qualifier != nil { + b.qualifier = string(chunk.Qualifier.Value) + b.value = b.value[:0] + b.inCell = true + } + b.value = append(b.value, chunk.Value...) + if b.inCell && chunk.ValueSize == 0 { + b.cells = append(b.cells, BigtableCell{ + Family: b.family, + Qualifier: b.qualifier, + Value: append([]byte(nil), b.value...), + }) + b.value = nil + b.inCell = false + } + if !chunk.GetCommitRow() { + return BigtableRow{}, false, nil + } + if len(b.key) == 0 { + return BigtableRow{}, false, fmt.Errorf("bigtable committed row without key") + } + row := BigtableRow{ + Key: string(append([]byte(nil), b.key...)), + Cells: append([]BigtableCell(nil), b.cells...), + } + b.reset() + return row, true, nil +} + +func (b *bigtableRowBuilder) reset() { + b.key = nil + b.family = "" + b.qualifier = "" + b.value = nil + b.inCell = false + b.cells = nil +} + +func bigtableTableName(projectID, instanceID, table string) string { + return fmt.Sprintf("projects/%s/instances/%s/tables/%s", projectID, instanceID, table) +} + +func bigtableVersionRowPrefix(bucket int) []byte { + prefix := make([]byte, 1+2) + prefix[0] = bigtableVersionPrefix + binary.BigEndian.PutUint16(prefix[1:], uint16(bucket)) + return prefix +} + 
+func bigtablePrefixEnd(prefix []byte) []byte { + end := append([]byte(nil), prefix...) + for i := len(end) - 1; i >= 0; i-- { + if end[i] != 0xff { + end[i]++ + return end[:i+1] + } + } + return nil +} + +func bigtableInvertedVersion(version int64) []byte { + out := make([]byte, 8) + binary.BigEndian.PutUint64(out, ^uint64(version)) + return out +} + +func bigtableDecodeInvertedVersion(encoded []byte) int64 { + return int64(^binary.BigEndian.Uint64(encoded)) +} + +func bigtableShard(storeName string, key []byte, shards int) int { + h := fnv.New32a() + _, _ = h.Write([]byte(storeName)) + _, _ = h.Write([]byte{0}) + _, _ = h.Write(key) + return int(h.Sum32() % uint32(shards)) +} + +func normalizeBigtableShards(shards int) int { + if shards <= 0 { + return DefaultBigtableShards + } + return shards +} diff --git a/sei-db/state_db/ss/offload/historical/bigtable_test.go b/sei-db/state_db/ss/offload/historical/bigtable_test.go new file mode 100644 index 0000000000..2c78f2f228 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/bigtable_test.go @@ -0,0 +1,93 @@ +package historical + +import ( + "context" + "sort" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestBigtableConfigDefaultsAndValidate(t *testing.T) { + cfg := BigtableConfig{ + ProjectID: "project", + InstanceID: "instance", + Table: "state", + } + cfg.ApplyDefaults() + require.Equal(t, DefaultBigtableFamily, cfg.Family) + require.Equal(t, DefaultBigtableShards, cfg.Shards) + require.NoError(t, cfg.Validate()) + + missingProject := BigtableConfig{InstanceID: "i", Table: "t", Family: "f"} + missingInstance := BigtableConfig{ProjectID: "p", Table: "t", Family: "f"} + missingTable := BigtableConfig{ProjectID: "p", InstanceID: "i", Family: "f"} + require.ErrorContains(t, missingProject.Validate(), "project") + require.ErrorContains(t, missingInstance.Validate(), "instance") + require.ErrorContains(t, missingTable.Validate(), "table") +} + +func 
TestBigtableMutationRowKeyOrdersLatestVersionFirst(t *testing.T) { + key40 := BigtableMutationRowKey("bank", []byte("k1"), 40, 256) + key60 := BigtableMutationRowKey("bank", []byte("k1"), 60, 256) + key80 := BigtableMutationRowKey("bank", []byte("k1"), 80, 256) + keys := []string{key40, key80, key60} + sort.Strings(keys) + require.Equal(t, []string{key80, key60, key40}, keys) + + version, ok := BigtableVersionFromRowKey(key60) + require.True(t, ok) + require.Equal(t, int64(60), version) + require.NotEqual(t, + BigtableMutationRowPrefix("bank", []byte("k"), 256), + BigtableMutationRowPrefix("bank", []byte("k1"), 256), + ) +} + +func TestBigtableValueFromRow(t *testing.T) { + rowKey := BigtableMutationRowKey("bank", []byte("k"), 7, 256) + row := BigtableRow{ + Key: rowKey, + Cells: []BigtableCell{ + {Family: DefaultBigtableFamily, Qualifier: BigtableValueColumn, Value: []byte("value")}, + {Family: DefaultBigtableFamily, Qualifier: BigtableDeletedColumn, Value: []byte{0}}, + }, + } + value, err := BigtableValueFromRow(row, DefaultBigtableFamily) + require.NoError(t, err) + require.Equal(t, []byte("value"), value.Bytes) + require.Equal(t, int64(7), value.Version) + value.Bytes[0] = 'V' + require.Equal(t, []byte("value"), row.Cells[0].Value) + + row.Cells[1].Value = []byte{1} + _, err = BigtableValueFromRow(row, DefaultBigtableFamily) + require.ErrorIs(t, err, ErrNotFound) +} + +func TestBigtableReaderGetUsesMVCCRange(t *testing.T) { + wantRow := BigtableMutationRowKey("bank", []byte("k"), 40, 256) + reader := &bigtableReader{ + family: DefaultBigtableFamily, + shards: 256, + readRows: func(_ context.Context, startKey, endKey []byte, limit int64, family string, f func(BigtableRow) bool) error { + require.Equal(t, []byte(BigtableMutationRowKey("bank", []byte("k"), 60, 256)), startKey) + require.NotEmpty(t, endKey) + require.Equal(t, int64(1), limit) + require.Equal(t, DefaultBigtableFamily, family) + f(BigtableRow{ + Key: wantRow, + Cells: []BigtableCell{ + {Family: 
DefaultBigtableFamily, Qualifier: BigtableValueColumn, Value: []byte("v40")}, + {Family: DefaultBigtableFamily, Qualifier: BigtableDeletedColumn, Value: []byte{0}}, + }, + }) + return nil + }, + } + + value, err := reader.Get(context.Background(), "bank", []byte("k"), 60) + require.NoError(t, err) + require.Equal(t, []byte("v40"), value.Bytes) + require.Equal(t, int64(40), value.Version) +} diff --git a/sei-db/state_db/ss/store.go b/sei-db/state_db/ss/store.go index dc1ce2edc2..e102af34c9 100644 --- a/sei-db/state_db/ss/store.go +++ b/sei-db/state_db/ss/store.go @@ -20,9 +20,30 @@ func NewStateStore(homeDir string, ssConfig config.StateStoreConfig) (types.Stat if err != nil { return nil, err } - if !scyllaHistoricalOffloadConfigured(ssConfig) { + scyllaConfigured := scyllaHistoricalOffloadConfigured(ssConfig) + bigtableConfigured := bigtableHistoricalOffloadConfigured(ssConfig) + if scyllaConfigured && bigtableConfigured { + _ = primary.Close() + return nil, fmt.Errorf("only one historical offload fallback can be configured") + } + if !scyllaConfigured && !bigtableConfigured { return primary, nil } + if bigtableConfigured { + reader, err := historical.NewBigtableReader(historical.BigtableConfig{ + ProjectID: ssConfig.HistoricalOffloadBigtableProjectID, + InstanceID: ssConfig.HistoricalOffloadBigtableInstance, + Table: ssConfig.HistoricalOffloadBigtableTable, + Family: ssConfig.HistoricalOffloadBigtableFamily, + AppProfile: ssConfig.HistoricalOffloadBigtableAppProfile, + Shards: ssConfig.HistoricalOffloadBigtableShards, + }) + if err != nil { + _ = primary.Close() + return nil, fmt.Errorf("open bigtable historical offload reader: %w", err) + } + return historical.NewFallbackStateStore(primary, reader), nil + } reader, err := historical.NewScyllaReader(historical.ScyllaConfig{ Hosts: splitCSV(ssConfig.HistoricalOffloadScyllaHosts), Keyspace: ssConfig.HistoricalOffloadScyllaKeyspace, @@ -44,6 +65,12 @@ func scyllaHistoricalOffloadConfigured(cfg 
config.StateStoreConfig) bool { strings.TrimSpace(cfg.HistoricalOffloadScyllaKeyspace) != "" } +func bigtableHistoricalOffloadConfigured(cfg config.StateStoreConfig) bool { + return strings.TrimSpace(cfg.HistoricalOffloadBigtableProjectID) != "" || + strings.TrimSpace(cfg.HistoricalOffloadBigtableInstance) != "" || + strings.TrimSpace(cfg.HistoricalOffloadBigtableTable) != "" +} + func splitCSV(value string) []string { if strings.TrimSpace(value) == "" { return nil From d3035ecfdd2eb64e3ab8d74305f90a512a6c1c07 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 14 May 2026 14:01:27 -0400 Subject: [PATCH 10/16] Simplify Bigtable offload batching --- sei-db/state_db/ss/offload/consumer/README.md | 3 +- .../state_db/ss/offload/consumer/bigtable.go | 134 +++--------------- .../ss/offload/consumer/bigtable_test.go | 123 ++++------------ sei-db/state_db/ss/offload/consumer/config.go | 4 +- .../consumer/config/example-bigtable.json | 3 +- 5 files changed, 51 insertions(+), 216 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md index 7320f3fde4..f716835cfb 100644 --- a/sei-db/state_db/ss/offload/consumer/README.md +++ b/sei-db/state_db/ss/offload/consumer/README.md @@ -48,8 +48,7 @@ factors before applying it. The consumer reads historical offload changelog messages from Kafka and writes them into the configured backend. Kafka offsets are committed only after the -sink write succeeds. Mutation rows are written with bounded concurrency and the -version marker is written last. +sink write succeeds. Mutation rows are written before the version marker. 
```bash go run ./sei-db/state_db/ss/offload/consumer/cmd/historical-scylla-consumer \ diff --git a/sei-db/state_db/ss/offload/consumer/bigtable.go b/sei-db/state_db/ss/offload/consumer/bigtable.go index 811786dcda..2bff1be8cc 100644 --- a/sei-db/state_db/ss/offload/consumer/bigtable.go +++ b/sei-db/state_db/ss/offload/consumer/bigtable.go @@ -5,68 +5,18 @@ import ( "fmt" "time" - "golang.org/x/sync/errgroup" - "github.com/sei-protocol/sei-chain/sei-db/proto" "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" ) -const defaultBigtableMutationWorkers = 16 - -type BigtableConfig struct { - ProjectID string - InstanceID string - Table string - Family string - AppProfile string - Shards int - MutationWorkers int -} - -func (c *BigtableConfig) ApplyDefaults() { - cfg := c.toHistorical() - cfg.ApplyDefaults() - c.Family = cfg.Family - c.Shards = cfg.Shards - if c.MutationWorkers == 0 { - c.MutationWorkers = defaultBigtableMutationWorkers - } -} - -func (c BigtableConfig) Configured() bool { - return c.toHistorical().Configured() -} - -func (c *BigtableConfig) Validate() error { - cfg := c.toHistorical() - cfg.ApplyDefaults() - if err := cfg.Validate(); err != nil { - return err - } - if c.MutationWorkers < 0 { - return fmt.Errorf("bigtable mutation workers must be non-negative") - } - return nil -} - -func (c BigtableConfig) toHistorical() historical.BigtableConfig { - return historical.BigtableConfig{ - ProjectID: c.ProjectID, - InstanceID: c.InstanceID, - Table: c.Table, - Family: c.Family, - AppProfile: c.AppProfile, - Shards: c.Shards, - } -} +type BigtableConfig = historical.BigtableConfig type bigtableSink struct { - client *historical.BigtableClient - applyBulk historical.BigtableApplyBulkFunc - readRows historical.BigtableReadRowsFunc - family string - shards int - mutationWorkers int + client *historical.BigtableClient + applyBulk historical.BigtableApplyBulkFunc + readRows historical.BigtableReadRowsFunc + family string + shards int } var _ 
Sink = (*bigtableSink)(nil) @@ -78,17 +28,16 @@ func NewBigtableSink(cfg BigtableConfig) (Sink, error) { return nil, err } ctx := context.Background() - client, err := historical.OpenBigtableClient(ctx, cfg.toHistorical()) + client, err := historical.OpenBigtableClient(ctx, cfg) if err != nil { return nil, err } return &bigtableSink{ - client: client, - applyBulk: client.ApplyBulk, - readRows: client.ReadRows, - family: cfg.Family, - shards: cfg.Shards, - mutationWorkers: cfg.MutationWorkers, + client: client, + applyBulk: client.ApplyBulk, + readRows: client.ReadRows, + family: cfg.Family, + shards: cfg.Shards, }, nil } @@ -112,58 +61,22 @@ func (s *bigtableSink) WriteBatch(ctx context.Context, records []Record) error { if len(records) == 0 { return nil } - if len(records) == 1 { - return s.writeRecord(ctx, records[0]) - } - return s.writeRecordsPipelined(ctx, records) -} - -func (s *bigtableSink) writeRecord(ctx context.Context, rec Record) error { - if rec.Entry == nil { - return nil - } - if err := s.writeRecordRows(ctx, rec.Entry.Version, rec.Entry); err != nil { + if err := s.writeRecordRows(ctx, records); err != nil { return err } - return s.writeVersionMarker(ctx, rec) -} - -func (s *bigtableSink) writeRecordsPipelined(ctx context.Context, records []Record) error { - rowCtx, cancel := context.WithCancel(ctx) - defer cancel() - g, gctx := errgroup.WithContext(rowCtx) - g.SetLimit(s.effectiveMutationWorkers()) - rowDone := make([]chan error, len(records)) - for i := range records { - rowDone[i] = make(chan error, 1) - i := i - rec := records[i] - g.Go(func() error { - err := s.writeRecordRows(gctx, rec.Entry.Version, rec.Entry) - if err != nil { - err = fmt.Errorf("write bigtable rows version %d: %w", rec.Entry.Version, err) - } - rowDone[i] <- err - return err - }) - } - for i, rec := range records { - if err := <-rowDone[i]; err != nil { - cancel() - _ = g.Wait() - return err - } + for _, rec := range records { if err := s.writeVersionMarker(ctx, rec); err 
!= nil { - cancel() - _ = g.Wait() return err } } - return g.Wait() + return nil } -func (s *bigtableSink) writeRecordRows(ctx context.Context, version int64, entry *proto.ChangelogEntry) error { - rows := s.recordRowMutations(version, entry) +func (s *bigtableSink) writeRecordRows(ctx context.Context, records []Record) error { + var rows []historical.BigtableRowMutation + for _, rec := range records { + rows = append(rows, s.recordRowMutations(rec.Entry.Version, rec.Entry)...) + } if len(rows) == 0 { return nil } @@ -235,13 +148,6 @@ func (s *bigtableSink) writeVersionMarker(ctx context.Context, rec Record) error return nil } -func (s *bigtableSink) effectiveMutationWorkers() int { - if s.mutationWorkers <= 0 { - return defaultBigtableMutationWorkers - } - return s.mutationWorkers -} - func bigtableBulkError(rows []historical.BigtableRowMutation, errs []error, err error) error { if err != nil { return err diff --git a/sei-db/state_db/ss/offload/consumer/bigtable_test.go b/sei-db/state_db/ss/offload/consumer/bigtable_test.go index d8903798b3..aa084fc9e4 100644 --- a/sei-db/state_db/ss/offload/consumer/bigtable_test.go +++ b/sei-db/state_db/ss/offload/consumer/bigtable_test.go @@ -2,29 +2,13 @@ package consumer import ( "context" - "sync" - "sync/atomic" "testing" - "time" "github.com/sei-protocol/sei-chain/sei-db/proto" "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" "github.com/stretchr/testify/require" ) -func TestBigtableConfigApplyDefaults(t *testing.T) { - cfg := BigtableConfig{ - ProjectID: "project", - InstanceID: "instance", - Table: "state", - } - cfg.ApplyDefaults() - require.Equal(t, historical.DefaultBigtableFamily, cfg.Family) - require.Equal(t, historical.DefaultBigtableShards, cfg.Shards) - require.Equal(t, defaultBigtableMutationWorkers, cfg.MutationWorkers) - require.NoError(t, cfg.Validate()) -} - func TestBigtableSinkWritesMutationRowsAndVersionMarker(t *testing.T) { var rows []string sink := &bigtableSink{ @@ -58,51 
+42,27 @@ func TestBigtableSinkWritesMutationRowsAndVersionMarker(t *testing.T) { require.Equal(t, historical.BigtableVersionRowKey(7), rows[3]) } -func TestBigtableSinkWriteBatchPipelinesRowsAndOrdersMarkers(t *testing.T) { - rowStarted := make(chan int64, 2) - markerWritten := make(chan int64, 2) - releaseRows := map[int64]chan struct{}{ - 1: make(chan struct{}), - 2: make(chan struct{}), - } - var activeRows atomic.Int32 - var sawConcurrentRows atomic.Bool - var mu sync.Mutex - rowsDone := make(map[int64]bool) - var markers []int64 +func TestBigtableSinkWriteBatchWritesRowsBeforeMarkers(t *testing.T) { + var rowVersions []int64 + var markerVersions []int64 var markerBeforeRowsDone bool sink := &bigtableSink{ - family: historical.DefaultBigtableFamily, - shards: historical.DefaultBigtableShards, - mutationWorkers: 2, - applyBulk: func(ctx context.Context, mutations []historical.BigtableRowMutation) ([]error, error) { - version, ok := historical.BigtableVersionFromRowKey(mutations[0].RowKey) - require.True(t, ok) - if mutations[0].RowKey == historical.BigtableVersionRowKey(version) { - mu.Lock() - if !rowsDone[version] { - markerBeforeRowsDone = true - } - markers = append(markers, version) - mu.Unlock() - markerWritten <- version - return make([]error, len(mutations)), nil - } - if activeRows.Add(1) > 1 { - sawConcurrentRows.Store(true) + family: historical.DefaultBigtableFamily, + shards: historical.DefaultBigtableShards, + applyBulk: func(_ context.Context, mutations []historical.BigtableRowMutation) ([]error, error) { + isMarkerBatch := len(mutations) == 1 && mutations[0].RowKey == historical.BigtableVersionRowKey(mustBigtableVersion(t, mutations[0].RowKey)) + if isMarkerBatch && len(rowVersions) != 2 { + markerBeforeRowsDone = true } - rowStarted <- version - select { - case <-releaseRows[version]: - case <-ctx.Done(): - activeRows.Add(-1) - return nil, ctx.Err() + for _, mutation := range mutations { + version := mustBigtableVersion(t, mutation.RowKey) + if 
mutation.RowKey == historical.BigtableVersionRowKey(version) { + markerVersions = append(markerVersions, version) + } else { + rowVersions = append(rowVersions, version) + } } - activeRows.Add(-1) - mu.Lock() - rowsDone[version] = true - mu.Unlock() return make([]error, len(mutations)), nil }, } @@ -123,46 +83,15 @@ func TestBigtableSinkWriteBatchPipelinesRowsAndOrdersMarkers(t *testing.T) { }}, } - errCh := make(chan error, 1) - go func() { - errCh <- sink.WriteBatch(context.Background(), records) - }() - - started := map[int64]bool{} - for len(started) < 2 { - select { - case version := <-rowStarted: - started[version] = true - case <-time.After(time.Second): - t.Fatal("timed out waiting for pipelined row writes") - } - } - require.True(t, sawConcurrentRows.Load()) - - close(releaseRows[2]) - select { - case version := <-markerWritten: - t.Fatalf("marker %d written before earlier record rows completed", version) - case <-time.After(100 * time.Millisecond): - } - - close(releaseRows[1]) - for _, want := range []int64{1, 2} { - select { - case got := <-markerWritten: - require.Equal(t, want, got) - case <-time.After(time.Second): - t.Fatalf("timed out waiting for marker %d", want) - } - } - select { - case err := <-errCh: - require.NoError(t, err) - case <-time.After(time.Second): - t.Fatal("timed out waiting for batch write") - } - mu.Lock() - defer mu.Unlock() + require.NoError(t, sink.WriteBatch(context.Background(), records)) require.False(t, markerBeforeRowsDone) - require.Equal(t, []int64{1, 2}, markers) + require.Equal(t, []int64{1, 2}, rowVersions) + require.Equal(t, []int64{1, 2}, markerVersions) +} + +func mustBigtableVersion(t *testing.T, rowKey string) int64 { + t.Helper() + version, ok := historical.BigtableVersionFromRowKey(rowKey) + require.True(t, ok) + return version } diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go index eceb801fb0..806cfae31e 100644 --- 
a/sei-db/state_db/ss/offload/consumer/config.go +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -28,7 +28,9 @@ func (c *Config) Validate() error { return fmt.Errorf("scylla: %w", err) } case "bigtable": - if err := c.Bigtable.Validate(); err != nil { + bigtable := c.Bigtable + bigtable.ApplyDefaults() + if err := bigtable.Validate(); err != nil { return fmt.Errorf("bigtable: %w", err) } default: diff --git a/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json b/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json index 8af5152b35..32e9250678 100644 --- a/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json +++ b/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json @@ -12,8 +12,7 @@ "Table": "state_mutations", "Family": "state", "AppProfile": "", - "Shards": 256, - "MutationWorkers": 16 + "Shards": 256 }, "Workers": 16, "ShardBufferSize": 128, From 9890f78d393ac5637cc662e24582ac66636219d1 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 14 May 2026 15:02:32 -0400 Subject: [PATCH 11/16] Fix Bigtable lint findings --- .../state_db/ss/offload/consumer/bigtable.go | 7 +- sei-db/state_db/ss/offload/consumer/config.go | 17 +++-- .../ss/offload/consumer/config_test.go | 10 +-- .../ss/offload/historical/bigtable.go | 72 +++++++++++++++---- 4 files changed, 79 insertions(+), 27 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/bigtable.go b/sei-db/state_db/ss/offload/consumer/bigtable.go index 2bff1be8cc..409c758421 100644 --- a/sei-db/state_db/ss/offload/consumer/bigtable.go +++ b/sei-db/state_db/ss/offload/consumer/bigtable.go @@ -73,7 +73,7 @@ func (s *bigtableSink) WriteBatch(ctx context.Context, records []Record) error { } func (s *bigtableSink) writeRecordRows(ctx context.Context, records []Record) error { - var rows []historical.BigtableRowMutation + rows := make([]historical.BigtableRowMutation, 0, len(records)) for _, rec := range records { rows = append(rows, s.recordRowMutations(rec.Entry.Version, 
rec.Entry)...) } @@ -85,8 +85,9 @@ func (s *bigtableSink) writeRecordRows(ctx context.Context, records []Record) er } func (s *bigtableSink) recordRowMutations(version int64, entry *proto.ChangelogEntry) []historical.BigtableRowMutation { - rows := make([]historical.BigtableRowMutation, 0) - for _, mutation := range compactMutations(entry) { + mutations := compactMutations(entry) + rows := make([]historical.BigtableRowMutation, 0, len(mutations)+len(entry.Upgrades)) + for _, mutation := range mutations { rows = append(rows, s.mutationRow(version, mutation.storeName, mutation.pair)) } for _, up := range entry.Upgrades { diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go index 806cfae31e..98be03feac 100644 --- a/sei-db/state_db/ss/offload/consumer/config.go +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -7,6 +7,11 @@ import ( "strings" ) +const ( + backendScylla = "scylla" + backendBigtable = "bigtable" +) + type Config struct { Backend string Kafka KafkaReaderConfig @@ -23,11 +28,11 @@ func (c *Config) Validate() error { return fmt.Errorf("kafka: %w", err) } switch c.BackendName() { - case "scylla": + case backendScylla: if err := c.Scylla.Validate(); err != nil { return fmt.Errorf("scylla: %w", err) } - case "bigtable": + case backendBigtable: bigtable := c.Bigtable bigtable.ApplyDefaults() if err := bigtable.Validate(); err != nil { @@ -57,16 +62,16 @@ func (c *Config) BackendName() string { return backend } if c.Bigtable.Configured() && !c.Scylla.Configured() { - return "bigtable" + return backendBigtable } - return "scylla" + return backendScylla } func NewSinkFromConfig(cfg Config) (Sink, error) { switch cfg.BackendName() { - case "scylla": + case backendScylla: return NewScyllaSink(cfg.Scylla) - case "bigtable": + case backendBigtable: return NewBigtableSink(cfg.Bigtable) default: return nil, fmt.Errorf("unsupported backend %q", cfg.Backend) diff --git 
a/sei-db/state_db/ss/offload/consumer/config_test.go b/sei-db/state_db/ss/offload/consumer/config_test.go index 6e335bd3cd..5ca9753998 100644 --- a/sei-db/state_db/ss/offload/consumer/config_test.go +++ b/sei-db/state_db/ss/offload/consumer/config_test.go @@ -7,14 +7,14 @@ import ( ) func TestConfigBackendName(t *testing.T) { - require.Equal(t, "scylla", (&Config{}).BackendName()) - require.Equal(t, "bigtable", (&Config{Bigtable: BigtableConfig{ProjectID: "p"}}).BackendName()) - require.Equal(t, "scylla", (&Config{Backend: "Scylla", Bigtable: BigtableConfig{ProjectID: "p"}}).BackendName()) + require.Equal(t, backendScylla, (&Config{}).BackendName()) + require.Equal(t, backendBigtable, (&Config{Bigtable: BigtableConfig{ProjectID: "p"}}).BackendName()) + require.Equal(t, backendScylla, (&Config{Backend: "Scylla", Bigtable: BigtableConfig{ProjectID: "p"}}).BackendName()) } func TestConfigValidateBigtable(t *testing.T) { cfg := Config{ - Backend: "bigtable", + Backend: backendBigtable, Kafka: KafkaReaderConfig{ Brokers: []string{"localhost:9092"}, Topic: "historical-offload", @@ -29,5 +29,5 @@ func TestConfigValidateBigtable(t *testing.T) { require.NoError(t, cfg.Validate()) cfg.Bigtable.Table = "" - require.ErrorContains(t, cfg.Validate(), "bigtable") + require.ErrorContains(t, cfg.Validate(), backendBigtable) } diff --git a/sei-db/state_db/ss/offload/historical/bigtable.go b/sei-db/state_db/ss/offload/historical/bigtable.go index c1f075e4ed..39c8a56047 100644 --- a/sei-db/state_db/ss/offload/historical/bigtable.go +++ b/sei-db/state_db/ss/offload/historical/bigtable.go @@ -32,7 +32,13 @@ const ( BigtableDeletedColumn = "deleted" ) -const bigtableEndpoint = "bigtable.googleapis.com:443" +const ( + bigtableEndpoint = "bigtable.googleapis.com:443" + + maxUint16Int = 1<<16 - 1 + maxUint32Int = 1<<32 - 1 + maxInt64Uint64 = 1<<63 - 1 +) type BigtableClient struct { conn *grpc.ClientConn @@ -378,11 +384,11 @@ func BigtableMutationRowPrefix(storeName string, key []byte, 
shards int) string shard := bigtableShard(storeName, key, shards) prefix := make([]byte, 1+2+2+len(storeName)+4+len(key)) prefix[0] = bigtableMutationPrefix - binary.BigEndian.PutUint16(prefix[1:], uint16(shard)) - binary.BigEndian.PutUint16(prefix[3:], uint16(len(storeName))) + binary.BigEndian.PutUint16(prefix[1:], shard) + binary.BigEndian.PutUint16(prefix[3:], uint16FromBoundedInt(len(storeName))) copy(prefix[5:], storeName) keyOffset := 5 + len(storeName) - binary.BigEndian.PutUint32(prefix[keyOffset:], uint32(len(key))) + binary.BigEndian.PutUint32(prefix[keyOffset:], uint32FromBoundedInt(len(key))) copy(prefix[keyOffset+4:], key) return string(prefix) } @@ -401,7 +407,7 @@ func BigtableUpgradeRowKey(version int64, name string) string { key := make([]byte, 1+8+2+len(name)) key[0] = bigtableUpgradePrefix copy(key[1:], bigtableInvertedVersion(version)) - binary.BigEndian.PutUint16(key[9:], uint16(len(name))) + binary.BigEndian.PutUint16(key[9:], uint16FromBoundedInt(len(name))) copy(key[11:], name) return string(key) } @@ -410,9 +416,9 @@ func BigtableVersionFromRowKey(rowKey string) (int64, bool) { key := []byte(rowKey) switch { case len(key) >= 1+2+8 && key[0] == bigtableVersionPrefix: - return bigtableDecodeInvertedVersion(key[3:11]), true + return bigtableDecodeInvertedVersion(key[3:11]) case len(key) >= 8 && key[0] == bigtableMutationPrefix: - return bigtableDecodeInvertedVersion(key[len(key)-8:]), true + return bigtableDecodeInvertedVersion(key[len(key)-8:]) default: return 0, false } @@ -492,7 +498,7 @@ func bigtableTableName(projectID, instanceID, table string) string { func bigtableVersionRowPrefix(bucket int) []byte { prefix := make([]byte, 1+2) prefix[0] = bigtableVersionPrefix - binary.BigEndian.PutUint16(prefix[1:], uint16(bucket)) + binary.BigEndian.PutUint16(prefix[1:], uint16FromBoundedInt(bucket)) return prefix } @@ -509,25 +515,65 @@ func bigtablePrefixEnd(prefix []byte) []byte { func bigtableInvertedVersion(version int64) []byte { out := 
make([]byte, 8) - binary.BigEndian.PutUint64(out, ^uint64(version)) + binary.BigEndian.PutUint64(out, ^uint64FromNonNegativeInt64(version)) return out } -func bigtableDecodeInvertedVersion(encoded []byte) int64 { - return int64(^binary.BigEndian.Uint64(encoded)) +func bigtableDecodeInvertedVersion(encoded []byte) (int64, bool) { + version := ^binary.BigEndian.Uint64(encoded) + if version > maxInt64Uint64 { + return 0, false + } + // #nosec G115 -- version is checked above to fit in int64. + return int64(version), true } -func bigtableShard(storeName string, key []byte, shards int) int { +func bigtableShard(storeName string, key []byte, shards int) uint16 { h := fnv.New32a() _, _ = h.Write([]byte(storeName)) _, _ = h.Write([]byte{0}) _, _ = h.Write(key) - return int(h.Sum32() % uint32(shards)) + return uint16FromBoundedUint32(h.Sum32() % uint32FromBoundedInt(shards)) } func normalizeBigtableShards(shards int) int { if shards <= 0 { return DefaultBigtableShards } + if shards > maxUint16Int { + return maxUint16Int + } return shards } + +func uint16FromBoundedInt(value int) uint16 { + if value < 0 || value > maxUint16Int { + panic(fmt.Sprintf("bigtable value %d exceeds uint16", value)) + } + // #nosec G115 -- value is checked above to fit in uint16. + return uint16(value) +} + +func uint32FromBoundedInt(value int) uint32 { + if value < 0 || value > maxUint32Int { + panic(fmt.Sprintf("bigtable value %d exceeds uint32", value)) + } + // #nosec G115 -- value is checked above to fit in uint32. + return uint32(value) +} + +func uint16FromBoundedUint32(value uint32) uint16 { + if value > maxUint16Int { + panic(fmt.Sprintf("bigtable value %d exceeds uint16", value)) + } + // #nosec G115 -- value is checked above to fit in uint16. + return uint16(value) +} + +func uint64FromNonNegativeInt64(value int64) uint64 { + if value < 0 { + panic(fmt.Sprintf("bigtable version %d is negative", value)) + } + // #nosec G115 -- value is checked above to be non-negative. 
+ return uint64(value) +} From 66f8c9f6fd5c29d3c9cda54e2d25017946e40000 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 14 May 2026 17:21:51 -0400 Subject: [PATCH 12/16] Remove unused historical BatchGet interface --- .../ss/offload/historical/bigtable.go | 31 ------------------- .../state_db/ss/offload/historical/reader.go | 3 -- .../ss/offload/historical/store_test.go | 4 --- 3 files changed, 38 deletions(-) diff --git a/sei-db/state_db/ss/offload/historical/bigtable.go b/sei-db/state_db/ss/offload/historical/bigtable.go index 39c8a56047..45fb431f1f 100644 --- a/sei-db/state_db/ss/offload/historical/bigtable.go +++ b/sei-db/state_db/ss/offload/historical/bigtable.go @@ -8,11 +8,9 @@ import ( "io" "regexp" "strings" - "sync" "time" "cloud.google.com/go/bigtable/apiv2/bigtablepb" - "golang.org/x/sync/errgroup" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/oauth" @@ -22,8 +20,6 @@ const ( DefaultBigtableFamily = "state" DefaultBigtableShards = 256 - defaultBigtableReadWorkers = 16 - bigtableMutationPrefix = byte('m') bigtableVersionPrefix = byte('v') bigtableUpgradePrefix = byte('u') @@ -310,33 +306,6 @@ func (r *bigtableReader) Get(ctx context.Context, storeName string, key []byte, return BigtableValueFromRow(row, r.family) } -func (r *bigtableReader) BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) { - out := make(map[Lookup]Value, len(lookups)) - g, gctx := errgroup.WithContext(ctx) - g.SetLimit(defaultBigtableReadWorkers) - var mu sync.Mutex - for _, lookup := range lookups { - lookup := lookup - g.Go(func() error { - value, err := r.Get(gctx, lookup.StoreName, []byte(lookup.Key), targetVersion) - if err != nil { - if err == ErrNotFound { - return nil - } - return err - } - mu.Lock() - out[lookup] = value - mu.Unlock() - return nil - }) - } - if err := g.Wait(); err != nil { - return nil, err - } - return out, nil -} - func BigtableLastVersion(ctx context.Context, 
readRows BigtableReadRowsFunc) (int64, error) { var maxVersion int64 for bucket := 0; bucket < VersionBucketCount; bucket++ { diff --git a/sei-db/state_db/ss/offload/historical/reader.go b/sei-db/state_db/ss/offload/historical/reader.go index 7dfdfee993..b4bea9414d 100644 --- a/sei-db/state_db/ss/offload/historical/reader.go +++ b/sei-db/state_db/ss/offload/historical/reader.go @@ -29,9 +29,6 @@ type Reader interface { // Has skips value transfer and returns false for missing or tombstoned keys. Has(ctx context.Context, storeName string, key []byte, targetVersion int64) (bool, error) - // BatchGet returns only found, non-tombstoned lookups. - BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) - LastVersion(ctx context.Context) (int64, error) Close() error } diff --git a/sei-db/state_db/ss/offload/historical/store_test.go b/sei-db/state_db/ss/offload/historical/store_test.go index 1e3cbfea57..a234586649 100644 --- a/sei-db/state_db/ss/offload/historical/store_test.go +++ b/sei-db/state_db/ss/offload/historical/store_test.go @@ -65,10 +65,6 @@ func (f *fakeReader) Has(context.Context, string, []byte, int64) (bool, error) { return true, nil } -func (f *fakeReader) BatchGet(context.Context, int64, []Lookup) (map[Lookup]Value, error) { - return nil, nil -} - func (f *fakeReader) LastVersion(context.Context) (int64, error) { return 0, nil } func (f *fakeReader) Close() error { return nil } From ecdf5cc749a6bf36af59ef8585b7cb85da54a671 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 15 May 2026 13:19:34 -0400 Subject: [PATCH 13/16] Reduce Bigtable write allocation overhead --- .../state_db/ss/offload/consumer/bigtable.go | 62 +++++++++++-------- .../ss/offload/historical/bigtable.go | 5 +- 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/bigtable.go b/sei-db/state_db/ss/offload/consumer/bigtable.go index 409c758421..13e4513ca1 100644 --- 
a/sei-db/state_db/ss/offload/consumer/bigtable.go +++ b/sei-db/state_db/ss/offload/consumer/bigtable.go @@ -3,6 +3,7 @@ package consumer import ( "context" "fmt" + "strconv" "time" "github.com/sei-protocol/sei-chain/sei-db/proto" @@ -73,9 +74,9 @@ func (s *bigtableSink) WriteBatch(ctx context.Context, records []Record) error { } func (s *bigtableSink) writeRecordRows(ctx context.Context, records []Record) error { - rows := make([]historical.BigtableRowMutation, 0, len(records)) + rows := make([]historical.BigtableRowMutation, 0, bigtableRowMutationCount(records)) for _, rec := range records { - rows = append(rows, s.recordRowMutations(rec.Entry.Version, rec.Entry)...) + rows = s.appendRecordRowMutations(rows, rec.Entry.Version, rec.Entry) } if len(rows) == 0 { return nil @@ -84,9 +85,8 @@ func (s *bigtableSink) writeRecordRows(ctx context.Context, records []Record) er return bigtableBulkError(rows, errs, err) } -func (s *bigtableSink) recordRowMutations(version int64, entry *proto.ChangelogEntry) []historical.BigtableRowMutation { +func (s *bigtableSink) appendRecordRowMutations(rows []historical.BigtableRowMutation, version int64, entry *proto.ChangelogEntry) []historical.BigtableRowMutation { mutations := compactMutations(entry) - rows := make([]historical.BigtableRowMutation, 0, len(mutations)+len(entry.Upgrades)) for _, mutation := range mutations { rows = append(rows, s.mutationRow(version, mutation.storeName, mutation.pair)) } @@ -99,24 +99,25 @@ func (s *bigtableSink) recordRowMutations(version int64, entry *proto.ChangelogE func (s *bigtableSink) mutationRow(version int64, storeName string, pair *proto.KVPair) historical.BigtableRowMutation { ts := historical.BigtableTimestamp(version) deleted := pair.Delete || pair.Value == nil - row := historical.BigtableRowMutation{ - RowKey: historical.BigtableMutationRowKey(storeName, pair.Key, version, s.shards), - } - if !deleted { - row.SetCells = append(row.SetCells, historical.BigtableSetCell{ - Family: s.family, 
- Qualifier: historical.BigtableValueColumn, - TimestampMicros: ts, - Value: pair.Value, - }) - } - row.SetCells = append(row.SetCells, historical.BigtableSetCell{ - Family: s.family, - Qualifier: historical.BigtableDeletedColumn, - TimestampMicros: ts, - Value: boolByte(deleted), - }) - return row + rowKey := historical.BigtableMutationRowKey(storeName, pair.Key, version, s.shards) + if deleted { + return historical.BigtableRowMutation{ + RowKey: rowKey, + SetCells: []historical.BigtableSetCell{{ + Family: s.family, + Qualifier: historical.BigtableDeletedColumn, + TimestampMicros: ts, + Value: boolByte(true), + }}, + } + } + return historical.BigtableRowMutation{ + RowKey: rowKey, + SetCells: []historical.BigtableSetCell{ + {Family: s.family, Qualifier: historical.BigtableValueColumn, TimestampMicros: ts, Value: pair.Value}, + {Family: s.family, Qualifier: historical.BigtableDeletedColumn, TimestampMicros: ts, Value: boolByte(false)}, + }, + } } func (s *bigtableSink) upgradeRow(version int64, up *proto.TreeNameUpgrade) historical.BigtableRowMutation { @@ -137,9 +138,9 @@ func (s *bigtableSink) writeVersionMarker(ctx context.Context, rec Record) error RowKey: historical.BigtableVersionRowKey(version), SetCells: []historical.BigtableSetCell{ {Family: s.family, Qualifier: "topic", TimestampMicros: ts, Value: []byte(rec.Topic)}, - {Family: s.family, Qualifier: "partition", TimestampMicros: ts, Value: []byte(fmt.Sprintf("%d", rec.Partition))}, - {Family: s.family, Qualifier: "offset", TimestampMicros: ts, Value: []byte(fmt.Sprintf("%d", rec.Offset))}, - {Family: s.family, Qualifier: "ingested_at_unix_nano", TimestampMicros: ts, Value: []byte(fmt.Sprintf("%d", time.Now().UnixNano()))}, + {Family: s.family, Qualifier: "partition", TimestampMicros: ts, Value: []byte(strconv.Itoa(rec.Partition))}, + {Family: s.family, Qualifier: "offset", TimestampMicros: ts, Value: []byte(strconv.FormatInt(rec.Offset, 10))}, + {Family: s.family, Qualifier: "ingested_at_unix_nano", 
TimestampMicros: ts, Value: []byte(strconv.FormatInt(time.Now().UnixNano(), 10))}, }, } errs, err := s.applyBulk(ctx, []historical.BigtableRowMutation{row}) @@ -149,6 +150,17 @@ func (s *bigtableSink) writeVersionMarker(ctx context.Context, rec Record) error return nil } +func bigtableRowMutationCount(records []Record) int { + total := 0 + for _, rec := range records { + for _, changeset := range rec.Entry.Changesets { + total += len(changeset.Changeset.Pairs) + } + total += len(rec.Entry.Upgrades) + } + return total +} + func bigtableBulkError(rows []historical.BigtableRowMutation, errs []error, err error) error { if err != nil { return err diff --git a/sei-db/state_db/ss/offload/historical/bigtable.go b/sei-db/state_db/ss/offload/historical/bigtable.go index 45fb431f1f..0820def8ee 100644 --- a/sei-db/state_db/ss/offload/historical/bigtable.go +++ b/sei-db/state_db/ss/offload/historical/bigtable.go @@ -208,7 +208,10 @@ func (c *BigtableClient) ApplyBulk(ctx context.Context, rows []BigtableRowMutati } entries := make([]*bigtablepb.MutateRowsRequest_Entry, 0, len(rows)) for _, row := range rows { - entry := &bigtablepb.MutateRowsRequest_Entry{RowKey: []byte(row.RowKey)} + entry := &bigtablepb.MutateRowsRequest_Entry{ + RowKey: []byte(row.RowKey), + Mutations: make([]*bigtablepb.Mutation, 0, len(row.SetCells)), + } for _, cell := range row.SetCells { entry.Mutations = append(entry.Mutations, &bigtablepb.Mutation{ Mutation: &bigtablepb.Mutation_SetCell_{SetCell: &bigtablepb.Mutation_SetCell{ From 35f7143d6a6afe1ec66c153a5d34c24f1d33a903 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 15 May 2026 17:23:04 -0400 Subject: [PATCH 14/16] Tune Bigtable consumer batch defaults --- sei-db/state_db/ss/offload/consumer/config.go | 16 +++++++++++++ .../consumer/config/example-bigtable.json | 4 ++-- .../ss/offload/consumer/config_test.go | 23 +++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/config.go 
b/sei-db/state_db/ss/offload/consumer/config.go index 98be03feac..038d8f7486 100644 --- a/sei-db/state_db/ss/offload/consumer/config.go +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -10,6 +10,9 @@ import ( const ( backendScylla = "scylla" backendBigtable = "bigtable" + + defaultBigtableMaxBatchRecords = 128 + defaultBigtableBatchMaxWaitMS = 25 ) type Config struct { @@ -67,6 +70,18 @@ func (c *Config) BackendName() string { return backendScylla } +func (c *Config) applyBackendDefaults() { + if c.BackendName() != backendBigtable { + return + } + if c.MaxBatchRecords == 0 { + c.MaxBatchRecords = defaultBigtableMaxBatchRecords + } + if c.BatchMaxWaitMS == 0 { + c.BatchMaxWaitMS = defaultBigtableBatchMaxWaitMS + } +} + func NewSinkFromConfig(cfg Config) (Sink, error) { switch cfg.BackendName() { case backendScylla: @@ -88,6 +103,7 @@ func LoadConfig(path string) (*Config, error) { if err := json.Unmarshal(raw, cfg); err != nil { return nil, fmt.Errorf("parse config: %w", err) } + cfg.applyBackendDefaults() if err := cfg.Validate(); err != nil { return nil, err } diff --git a/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json b/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json index 32e9250678..0259f3e4df 100644 --- a/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json +++ b/sei-db/state_db/ss/offload/consumer/config/example-bigtable.json @@ -16,6 +16,6 @@ }, "Workers": 16, "ShardBufferSize": 128, - "MaxBatchRecords": 16, - "BatchMaxWaitMS": 10 + "MaxBatchRecords": 128, + "BatchMaxWaitMS": 25 } diff --git a/sei-db/state_db/ss/offload/consumer/config_test.go b/sei-db/state_db/ss/offload/consumer/config_test.go index 5ca9753998..0ac2e8ac86 100644 --- a/sei-db/state_db/ss/offload/consumer/config_test.go +++ b/sei-db/state_db/ss/offload/consumer/config_test.go @@ -31,3 +31,26 @@ func TestConfigValidateBigtable(t *testing.T) { cfg.Bigtable.Table = "" require.ErrorContains(t, cfg.Validate(), backendBigtable) } + +func 
TestConfigApplyBackendDefaultsBigtable(t *testing.T) { + cfg := Config{Backend: backendBigtable} + cfg.applyBackendDefaults() + require.Equal(t, defaultBigtableMaxBatchRecords, cfg.MaxBatchRecords) + require.Equal(t, defaultBigtableBatchMaxWaitMS, cfg.BatchMaxWaitMS) + + cfg = Config{ + Backend: backendBigtable, + MaxBatchRecords: 32, + BatchMaxWaitMS: 5, + } + cfg.applyBackendDefaults() + require.Equal(t, 32, cfg.MaxBatchRecords) + require.Equal(t, 5, cfg.BatchMaxWaitMS) +} + +func TestConfigApplyBackendDefaultsScylla(t *testing.T) { + cfg := Config{Backend: backendScylla} + cfg.applyBackendDefaults() + require.Zero(t, cfg.MaxBatchRecords) + require.Zero(t, cfg.BatchMaxWaitMS) +} From 429ee4514ed0de63da2dca393df77b882b634bf4 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 18 May 2026 11:05:19 -0400 Subject: [PATCH 15/16] Batch Bigtable version marker writes --- .../state_db/ss/offload/consumer/bigtable.go | 39 +++++++++---------- .../ss/offload/consumer/bigtable_test.go | 37 +++++++----------- 2 files changed, 33 insertions(+), 43 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/bigtable.go b/sei-db/state_db/ss/offload/consumer/bigtable.go index 13e4513ca1..a77c02ad8d 100644 --- a/sei-db/state_db/ss/offload/consumer/bigtable.go +++ b/sei-db/state_db/ss/offload/consumer/bigtable.go @@ -65,12 +65,7 @@ func (s *bigtableSink) WriteBatch(ctx context.Context, records []Record) error { if err := s.writeRecordRows(ctx, records); err != nil { return err } - for _, rec := range records { - if err := s.writeVersionMarker(ctx, rec); err != nil { - return err - } - } - return nil + return s.writeVersionMarkers(ctx, records) } func (s *bigtableSink) writeRecordRows(ctx context.Context, records []Record) error { @@ -131,21 +126,25 @@ func (s *bigtableSink) upgradeRow(version int64, up *proto.TreeNameUpgrade) hist } } -func (s *bigtableSink) writeVersionMarker(ctx context.Context, rec Record) error { - version := rec.Entry.Version - ts := 
historical.BigtableTimestamp(version) - row := historical.BigtableRowMutation{ - RowKey: historical.BigtableVersionRowKey(version), - SetCells: []historical.BigtableSetCell{ - {Family: s.family, Qualifier: "topic", TimestampMicros: ts, Value: []byte(rec.Topic)}, - {Family: s.family, Qualifier: "partition", TimestampMicros: ts, Value: []byte(strconv.Itoa(rec.Partition))}, - {Family: s.family, Qualifier: "offset", TimestampMicros: ts, Value: []byte(strconv.FormatInt(rec.Offset, 10))}, - {Family: s.family, Qualifier: "ingested_at_unix_nano", TimestampMicros: ts, Value: []byte(strconv.FormatInt(time.Now().UnixNano(), 10))}, - }, +func (s *bigtableSink) writeVersionMarkers(ctx context.Context, records []Record) error { + rows := make([]historical.BigtableRowMutation, 0, len(records)) + ingestedAt := []byte(strconv.FormatInt(time.Now().UnixNano(), 10)) + for _, rec := range records { + version := rec.Entry.Version + ts := historical.BigtableTimestamp(version) + rows = append(rows, historical.BigtableRowMutation{ + RowKey: historical.BigtableVersionRowKey(version), + SetCells: []historical.BigtableSetCell{ + {Family: s.family, Qualifier: "topic", TimestampMicros: ts, Value: []byte(rec.Topic)}, + {Family: s.family, Qualifier: "partition", TimestampMicros: ts, Value: []byte(strconv.Itoa(rec.Partition))}, + {Family: s.family, Qualifier: "offset", TimestampMicros: ts, Value: []byte(strconv.FormatInt(rec.Offset, 10))}, + {Family: s.family, Qualifier: "ingested_at_unix_nano", TimestampMicros: ts, Value: ingestedAt}, + }, + }) } - errs, err := s.applyBulk(ctx, []historical.BigtableRowMutation{row}) - if err := bigtableBulkError([]historical.BigtableRowMutation{row}, errs, err); err != nil { - return fmt.Errorf("insert bigtable version %d: %w", version, err) + errs, err := s.applyBulk(ctx, rows) + if err := bigtableBulkError(rows, errs, err); err != nil { + return fmt.Errorf("insert bigtable version markers: %w", err) } return nil } diff --git 
a/sei-db/state_db/ss/offload/consumer/bigtable_test.go b/sei-db/state_db/ss/offload/consumer/bigtable_test.go index aa084fc9e4..ef2d1146c4 100644 --- a/sei-db/state_db/ss/offload/consumer/bigtable_test.go +++ b/sei-db/state_db/ss/offload/consumer/bigtable_test.go @@ -43,26 +43,17 @@ func TestBigtableSinkWritesMutationRowsAndVersionMarker(t *testing.T) { } func TestBigtableSinkWriteBatchWritesRowsBeforeMarkers(t *testing.T) { - var rowVersions []int64 - var markerVersions []int64 - var markerBeforeRowsDone bool + var calls [][]string sink := &bigtableSink{ family: historical.DefaultBigtableFamily, shards: historical.DefaultBigtableShards, applyBulk: func(_ context.Context, mutations []historical.BigtableRowMutation) ([]error, error) { - isMarkerBatch := len(mutations) == 1 && mutations[0].RowKey == historical.BigtableVersionRowKey(mustBigtableVersion(t, mutations[0].RowKey)) - if isMarkerBatch && len(rowVersions) != 2 { - markerBeforeRowsDone = true - } + call := make([]string, 0, len(mutations)) for _, mutation := range mutations { - version := mustBigtableVersion(t, mutation.RowKey) - if mutation.RowKey == historical.BigtableVersionRowKey(version) { - markerVersions = append(markerVersions, version) - } else { - rowVersions = append(rowVersions, version) - } + call = append(call, mutation.RowKey) } + calls = append(calls, call) return make([]error, len(mutations)), nil }, } @@ -84,14 +75,14 @@ func TestBigtableSinkWriteBatchWritesRowsBeforeMarkers(t *testing.T) { } require.NoError(t, sink.WriteBatch(context.Background(), records)) - require.False(t, markerBeforeRowsDone) - require.Equal(t, []int64{1, 2}, rowVersions) - require.Equal(t, []int64{1, 2}, markerVersions) -} - -func mustBigtableVersion(t *testing.T, rowKey string) int64 { - t.Helper() - version, ok := historical.BigtableVersionFromRowKey(rowKey) - require.True(t, ok) - return version + require.Equal(t, [][]string{ + { + historical.BigtableMutationRowKey("bank", []byte("k1"), 1, 
historical.DefaultBigtableShards), + historical.BigtableMutationRowKey("bank", []byte("k2"), 2, historical.DefaultBigtableShards), + }, + { + historical.BigtableVersionRowKey(1), + historical.BigtableVersionRowKey(2), + }, + }, calls) } From a406321317208fe50110ee310c304bafcd6fd61d Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 18 May 2026 13:57:38 -0400 Subject: [PATCH 16/16] Parallelize Bigtable latest version reads --- .../ss/offload/historical/bigtable.go | 37 ++++++++++++++----- .../ss/offload/historical/bigtable_test.go | 32 ++++++++++++++++ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/sei-db/state_db/ss/offload/historical/bigtable.go b/sei-db/state_db/ss/offload/historical/bigtable.go index 0820def8ee..37cb8faaf7 100644 --- a/sei-db/state_db/ss/offload/historical/bigtable.go +++ b/sei-db/state_db/ss/offload/historical/bigtable.go @@ -8,9 +8,11 @@ import ( "io" "regexp" "strings" + "sync" "time" "cloud.google.com/go/bigtable/apiv2/bigtablepb" + "golang.org/x/sync/errgroup" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/oauth" @@ -31,6 +33,8 @@ const ( const ( bigtableEndpoint = "bigtable.googleapis.com:443" + defaultBigtableReadWorkers = 16 + maxUint16Int = 1<<16 - 1 maxUint32Int = 1<<32 - 1 maxInt64Uint64 = 1<<63 - 1 @@ -311,18 +315,33 @@ func (r *bigtableReader) Get(ctx context.Context, storeName string, key []byte, func BigtableLastVersion(ctx context.Context, readRows BigtableReadRowsFunc) (int64, error) { var maxVersion int64 + var mu sync.Mutex + g, gctx := errgroup.WithContext(ctx) + g.SetLimit(defaultBigtableReadWorkers) for bucket := 0; bucket < VersionBucketCount; bucket++ { - prefix := bigtableVersionRowPrefix(bucket) - err := readRows(ctx, prefix, bigtablePrefixEnd(prefix), 1, "", func(row BigtableRow) bool { - version, ok := BigtableVersionFromRowKey(row.Key) - if ok && version > maxVersion { - maxVersion = version + bucket := bucket + g.Go(func() error { + prefix := 
bigtableVersionRowPrefix(bucket) + var bucketVersion int64 + err := readRows(gctx, prefix, bigtablePrefixEnd(prefix), 1, "", func(row BigtableRow) bool { + if version, ok := BigtableVersionFromRowKey(row.Key); ok { + bucketVersion = version + } + return false + }) + if err != nil { + return fmt.Errorf("read latest bigtable version bucket %d: %w", bucket, err) + } + mu.Lock() + if bucketVersion > maxVersion { + maxVersion = bucketVersion } - return false + mu.Unlock() + return nil }) - if err != nil { - return 0, fmt.Errorf("read latest bigtable version bucket %d: %w", bucket, err) - } + } + if err := g.Wait(); err != nil { + return 0, err } return maxVersion, nil } diff --git a/sei-db/state_db/ss/offload/historical/bigtable_test.go b/sei-db/state_db/ss/offload/historical/bigtable_test.go index 2c78f2f228..2f08e0b7bc 100644 --- a/sei-db/state_db/ss/offload/historical/bigtable_test.go +++ b/sei-db/state_db/ss/offload/historical/bigtable_test.go @@ -2,7 +2,9 @@ package historical import ( "context" + "fmt" "sort" + "sync" "testing" "github.com/stretchr/testify/require" @@ -91,3 +93,33 @@ func TestBigtableReaderGetUsesMVCCRange(t *testing.T) { require.Equal(t, []byte("v40"), value.Bytes) require.Equal(t, int64(40), value.Version) } + +func TestBigtableLastVersionScansBuckets(t *testing.T) { + versions := map[int]int64{ + 3: 42, + 9: 70, + } + seen := make(map[int]struct{}, VersionBucketCount) + var mu sync.Mutex + + got, err := BigtableLastVersion(context.Background(), func(_ context.Context, startKey, endKey []byte, limit int64, family string, f func(BigtableRow) bool) error { + if len(startKey) != 3 || startKey[0] != bigtableVersionPrefix { + return fmt.Errorf("unexpected start key %q", startKey) + } + if len(endKey) == 0 || limit != 1 || family != "" { + return fmt.Errorf("unexpected scan params") + } + bucket := int(startKey[1])<<8 | int(startKey[2]) + mu.Lock() + seen[bucket] = struct{}{} + version := versions[bucket] + mu.Unlock() + if version > 0 { + 
f(BigtableRow{Key: BigtableVersionRowKey(version)}) + } + return nil + }) + require.NoError(t, err) + require.Equal(t, int64(70), got) + require.Len(t, seen, VersionBucketCount) +}