From fb202edb598466f189f2b0f10572ec0dad155b9b Mon Sep 17 00:00:00 2001 From: j-rafique Date: Wed, 25 Mar 2026 18:44:02 +0500 Subject: [PATCH] self-healing: complete phase-2 port, docs, and e2e/ci coverage --- .github/workflows/tests.yml | 38 + Makefile | 10 +- gen/supernode/self_healing.pb.go | 717 ++++++++++ gen/supernode/self_healing.swagger.json | 118 ++ gen/supernode/self_healing_grpc.pb.go | 203 +++ pkg/lumera/modules/action/action_mock.go | 15 + pkg/lumera/modules/action/impl.go | 12 + pkg/lumera/modules/action/interface.go | 1 + pkg/lumera/modules/audit/impl.go | 38 + pkg/lumera/modules/audit/interface.go | 1 + pkg/storage/queries/self_healing.go | 260 +++- pkg/storage/queries/sqlite.go | 56 +- pkg/testutil/lumera.go | 8 + pkg/types/self_healing.go | 26 +- proto/supernode/self_healing.proto | 70 + supernode/cascade/reseed.go | 17 +- supernode/cmd/helpers.go | 9 +- supernode/cmd/start.go | 37 + supernode/config/config.go | 24 +- supernode/config/defaults.go | 1 + supernode/config/save.go | 4 + supernode/self_healing/README.md | 277 ++++ supernode/self_healing/service.go | 1227 +++++++++++++++++ supernode/self_healing/service_test.go | 609 ++++++++ .../transport/grpc/self_healing/handler.go | 680 +++++++++ .../grpc/self_healing/handler_e2e_test.go | 482 +++++++ tests/system/README.md | 4 +- tests/system/config.test-1.yml | 1 + tests/system/config.test-2.yml | 1 + tests/system/config.test-3.yml | 1 + tests/system/e2e_self_healing_test.go | 610 ++++++++ tests/system/main_test.go | 8 + tests/system/runtime_cleanup.go | 247 ++++ tests/system/supernode-utils.go | 50 +- 34 files changed, 5834 insertions(+), 28 deletions(-) create mode 100644 gen/supernode/self_healing.pb.go create mode 100644 gen/supernode/self_healing.swagger.json create mode 100644 gen/supernode/self_healing_grpc.pb.go create mode 100644 proto/supernode/self_healing.proto create mode 100644 supernode/self_healing/README.md create mode 100644 supernode/self_healing/service.go create mode 100644 
supernode/self_healing/service_test.go create mode 100644 supernode/transport/grpc/self_healing/handler.go create mode 100644 supernode/transport/grpc/self_healing/handler_e2e_test.go create mode 100644 tests/system/e2e_self_healing_test.go create mode 100644 tests/system/runtime_cleanup.go diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 26796204..e666420f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -70,6 +70,44 @@ jobs: - name: Run cascade e2e tests run: make test-cascade + self-healing-e2e-tests: + name: self-healing-e2e-tests + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v6.0.1 + + - name: Setup Go and system deps + uses: ./.github/actions/setup-env + # with: + # bust_lumera_retag: 'true' + + - name: Go mod tidy + run: go mod tidy + + - name: Install Lumera + run: make install-lumera + + - name: Setup Supernode environments + run: make setup-supernodes + + - name: Run self-healing e2e tests + run: make test-self-healing + + - name: Dump self-healing runtime traces + if: always() + run: | + cd tests/system + for f in supernode0.out supernode1.out supernode2.out; do + echo "==== ${f} (self-healing excerpts) ====" + if [ -f "$f" ]; then + grep -E "self-healing|sh-e2e-happy|RequestSelfHealing|VerifySelfHealing|CommitSelfHealing|reconstructed_hash_hex|observer" "$f" | tail -n 200 || true + else + echo "missing ${f}" + fi + done + # sn-manager-e2e-tests: # name: sn-manager-e2e-tests # runs-on: ubuntu-latest diff --git a/Makefile b/Makefile index 52d9589e..450ffe3d 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,7 @@ release: ################################################### ### Tests and Simulation ### ################################################### -.PHONY: test-e2e test-unit test-integration test-system test-cascade test-sn-manager +.PHONY: test-e2e test-unit test-integration test-system test-cascade test-self-healing test-sn-manager .PHONY: install-lumera 
setup-supernodes system-test-setup install-deps .PHONY: gen-cascade gen-supernode test-unit: @@ -152,7 +152,7 @@ gen-supernode: --grpc-gateway_out=gen \ --grpc-gateway_opt=paths=source_relative \ --openapiv2_out=gen \ - proto/supernode/service.proto proto/supernode/status.proto proto/supernode/storage_challenge.proto + proto/supernode/service.proto proto/supernode/status.proto proto/supernode/storage_challenge.proto proto/supernode/self_healing.proto # Define the paths SUPERNODE_SRC=supernode/main.go @@ -201,8 +201,12 @@ test-cascade: @echo "Running cascade e2e tests..." @cd tests/system && ${GO} mod tidy && ${GO} test -tags=system_test -v -run TestCascadeE2E . +# Run self-healing e2e tests only +test-self-healing: + @echo "Running self-healing e2e tests..." + @cd tests/system && ${GO} mod tidy && ${GO} test -tags=system_test -v -run '^TestSelfHealingE2E' . + # Run sn-manager e2e tests only test-sn-manager: @echo "Running sn-manager e2e tests..." @cd tests/system && ${GO} test -tags=system_test -v -run '^TestSNManager' . - diff --git a/gen/supernode/self_healing.pb.go b/gen/supernode/self_healing.pb.go new file mode 100644 index 00000000..606d809d --- /dev/null +++ b/gen/supernode/self_healing.pb.go @@ -0,0 +1,717 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.35.2 +// protoc v3.12.4 +// source: supernode/self_healing.proto + +package supernode + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. 
+ _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type RequestSelfHealingRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ChallengeId string `protobuf:"bytes,1,opt,name=challenge_id,json=challengeId,proto3" json:"challenge_id,omitempty"` + EpochId uint64 `protobuf:"varint,2,opt,name=epoch_id,json=epochId,proto3" json:"epoch_id,omitempty"` + FileKey string `protobuf:"bytes,3,opt,name=file_key,json=fileKey,proto3" json:"file_key,omitempty"` + ChallengerId string `protobuf:"bytes,4,opt,name=challenger_id,json=challengerId,proto3" json:"challenger_id,omitempty"` + RecipientId string `protobuf:"bytes,5,opt,name=recipient_id,json=recipientId,proto3" json:"recipient_id,omitempty"` + ObserverIds []string `protobuf:"bytes,6,rep,name=observer_ids,json=observerIds,proto3" json:"observer_ids,omitempty"` + ActionId string `protobuf:"bytes,7,opt,name=action_id,json=actionId,proto3" json:"action_id,omitempty"` +} + +func (x *RequestSelfHealingRequest) Reset() { + *x = RequestSelfHealingRequest{} + mi := &file_supernode_self_healing_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *RequestSelfHealingRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RequestSelfHealingRequest) ProtoMessage() {} + +func (x *RequestSelfHealingRequest) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RequestSelfHealingRequest.ProtoReflect.Descriptor instead. 
+func (*RequestSelfHealingRequest) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{0} +} + +func (x *RequestSelfHealingRequest) GetChallengeId() string { + if x != nil { + return x.ChallengeId + } + return "" +} + +func (x *RequestSelfHealingRequest) GetEpochId() uint64 { + if x != nil { + return x.EpochId + } + return 0 +} + +func (x *RequestSelfHealingRequest) GetFileKey() string { + if x != nil { + return x.FileKey + } + return "" +} + +func (x *RequestSelfHealingRequest) GetChallengerId() string { + if x != nil { + return x.ChallengerId + } + return "" +} + +func (x *RequestSelfHealingRequest) GetRecipientId() string { + if x != nil { + return x.RecipientId + } + return "" +} + +func (x *RequestSelfHealingRequest) GetObserverIds() []string { + if x != nil { + return x.ObserverIds + } + return nil +} + +func (x *RequestSelfHealingRequest) GetActionId() string { + if x != nil { + return x.ActionId + } + return "" +} + +type RequestSelfHealingResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ChallengeId string `protobuf:"bytes,1,opt,name=challenge_id,json=challengeId,proto3" json:"challenge_id,omitempty"` + EpochId uint64 `protobuf:"varint,2,opt,name=epoch_id,json=epochId,proto3" json:"epoch_id,omitempty"` + RecipientId string `protobuf:"bytes,3,opt,name=recipient_id,json=recipientId,proto3" json:"recipient_id,omitempty"` + Accepted bool `protobuf:"varint,4,opt,name=accepted,proto3" json:"accepted,omitempty"` + ReconstructionRequired bool `protobuf:"varint,5,opt,name=reconstruction_required,json=reconstructionRequired,proto3" json:"reconstruction_required,omitempty"` + ReconstructedHashHex string `protobuf:"bytes,6,opt,name=reconstructed_hash_hex,json=reconstructedHashHex,proto3" json:"reconstructed_hash_hex,omitempty"` + Error string `protobuf:"bytes,7,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *RequestSelfHealingResponse) 
Reset() { + *x = RequestSelfHealingResponse{} + mi := &file_supernode_self_healing_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *RequestSelfHealingResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RequestSelfHealingResponse) ProtoMessage() {} + +func (x *RequestSelfHealingResponse) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RequestSelfHealingResponse.ProtoReflect.Descriptor instead. +func (*RequestSelfHealingResponse) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{1} +} + +func (x *RequestSelfHealingResponse) GetChallengeId() string { + if x != nil { + return x.ChallengeId + } + return "" +} + +func (x *RequestSelfHealingResponse) GetEpochId() uint64 { + if x != nil { + return x.EpochId + } + return 0 +} + +func (x *RequestSelfHealingResponse) GetRecipientId() string { + if x != nil { + return x.RecipientId + } + return "" +} + +func (x *RequestSelfHealingResponse) GetAccepted() bool { + if x != nil { + return x.Accepted + } + return false +} + +func (x *RequestSelfHealingResponse) GetReconstructionRequired() bool { + if x != nil { + return x.ReconstructionRequired + } + return false +} + +func (x *RequestSelfHealingResponse) GetReconstructedHashHex() string { + if x != nil { + return x.ReconstructedHashHex + } + return "" +} + +func (x *RequestSelfHealingResponse) GetError() string { + if x != nil { + return x.Error + } + return "" +} + +type VerifySelfHealingRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ChallengeId string `protobuf:"bytes,1,opt,name=challenge_id,json=challengeId,proto3" 
json:"challenge_id,omitempty"` + EpochId uint64 `protobuf:"varint,2,opt,name=epoch_id,json=epochId,proto3" json:"epoch_id,omitempty"` + FileKey string `protobuf:"bytes,3,opt,name=file_key,json=fileKey,proto3" json:"file_key,omitempty"` + RecipientId string `protobuf:"bytes,4,opt,name=recipient_id,json=recipientId,proto3" json:"recipient_id,omitempty"` + ReconstructedHashHex string `protobuf:"bytes,5,opt,name=reconstructed_hash_hex,json=reconstructedHashHex,proto3" json:"reconstructed_hash_hex,omitempty"` + ObserverId string `protobuf:"bytes,6,opt,name=observer_id,json=observerId,proto3" json:"observer_id,omitempty"` + ActionId string `protobuf:"bytes,7,opt,name=action_id,json=actionId,proto3" json:"action_id,omitempty"` +} + +func (x *VerifySelfHealingRequest) Reset() { + *x = VerifySelfHealingRequest{} + mi := &file_supernode_self_healing_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *VerifySelfHealingRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*VerifySelfHealingRequest) ProtoMessage() {} + +func (x *VerifySelfHealingRequest) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use VerifySelfHealingRequest.ProtoReflect.Descriptor instead. 
+func (*VerifySelfHealingRequest) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{2} +} + +func (x *VerifySelfHealingRequest) GetChallengeId() string { + if x != nil { + return x.ChallengeId + } + return "" +} + +func (x *VerifySelfHealingRequest) GetEpochId() uint64 { + if x != nil { + return x.EpochId + } + return 0 +} + +func (x *VerifySelfHealingRequest) GetFileKey() string { + if x != nil { + return x.FileKey + } + return "" +} + +func (x *VerifySelfHealingRequest) GetRecipientId() string { + if x != nil { + return x.RecipientId + } + return "" +} + +func (x *VerifySelfHealingRequest) GetReconstructedHashHex() string { + if x != nil { + return x.ReconstructedHashHex + } + return "" +} + +func (x *VerifySelfHealingRequest) GetObserverId() string { + if x != nil { + return x.ObserverId + } + return "" +} + +func (x *VerifySelfHealingRequest) GetActionId() string { + if x != nil { + return x.ActionId + } + return "" +} + +type VerifySelfHealingResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ChallengeId string `protobuf:"bytes,1,opt,name=challenge_id,json=challengeId,proto3" json:"challenge_id,omitempty"` + EpochId uint64 `protobuf:"varint,2,opt,name=epoch_id,json=epochId,proto3" json:"epoch_id,omitempty"` + ObserverId string `protobuf:"bytes,3,opt,name=observer_id,json=observerId,proto3" json:"observer_id,omitempty"` + Ok bool `protobuf:"varint,4,opt,name=ok,proto3" json:"ok,omitempty"` + Error string `protobuf:"bytes,5,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *VerifySelfHealingResponse) Reset() { + *x = VerifySelfHealingResponse{} + mi := &file_supernode_self_healing_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *VerifySelfHealingResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*VerifySelfHealingResponse) ProtoMessage() {} 
+ +func (x *VerifySelfHealingResponse) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use VerifySelfHealingResponse.ProtoReflect.Descriptor instead. +func (*VerifySelfHealingResponse) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{3} +} + +func (x *VerifySelfHealingResponse) GetChallengeId() string { + if x != nil { + return x.ChallengeId + } + return "" +} + +func (x *VerifySelfHealingResponse) GetEpochId() uint64 { + if x != nil { + return x.EpochId + } + return 0 +} + +func (x *VerifySelfHealingResponse) GetObserverId() string { + if x != nil { + return x.ObserverId + } + return "" +} + +func (x *VerifySelfHealingResponse) GetOk() bool { + if x != nil { + return x.Ok + } + return false +} + +func (x *VerifySelfHealingResponse) GetError() string { + if x != nil { + return x.Error + } + return "" +} + +type CommitSelfHealingRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ChallengeId string `protobuf:"bytes,1,opt,name=challenge_id,json=challengeId,proto3" json:"challenge_id,omitempty"` + EpochId uint64 `protobuf:"varint,2,opt,name=epoch_id,json=epochId,proto3" json:"epoch_id,omitempty"` + FileKey string `protobuf:"bytes,3,opt,name=file_key,json=fileKey,proto3" json:"file_key,omitempty"` + ActionId string `protobuf:"bytes,4,opt,name=action_id,json=actionId,proto3" json:"action_id,omitempty"` + ChallengerId string `protobuf:"bytes,5,opt,name=challenger_id,json=challengerId,proto3" json:"challenger_id,omitempty"` + RecipientId string `protobuf:"bytes,6,opt,name=recipient_id,json=recipientId,proto3" json:"recipient_id,omitempty"` +} + +func (x *CommitSelfHealingRequest) Reset() { + *x = 
CommitSelfHealingRequest{} + mi := &file_supernode_self_healing_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CommitSelfHealingRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CommitSelfHealingRequest) ProtoMessage() {} + +func (x *CommitSelfHealingRequest) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[4] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CommitSelfHealingRequest.ProtoReflect.Descriptor instead. +func (*CommitSelfHealingRequest) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{4} +} + +func (x *CommitSelfHealingRequest) GetChallengeId() string { + if x != nil { + return x.ChallengeId + } + return "" +} + +func (x *CommitSelfHealingRequest) GetEpochId() uint64 { + if x != nil { + return x.EpochId + } + return 0 +} + +func (x *CommitSelfHealingRequest) GetFileKey() string { + if x != nil { + return x.FileKey + } + return "" +} + +func (x *CommitSelfHealingRequest) GetActionId() string { + if x != nil { + return x.ActionId + } + return "" +} + +func (x *CommitSelfHealingRequest) GetChallengerId() string { + if x != nil { + return x.ChallengerId + } + return "" +} + +func (x *CommitSelfHealingRequest) GetRecipientId() string { + if x != nil { + return x.RecipientId + } + return "" +} + +type CommitSelfHealingResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ChallengeId string `protobuf:"bytes,1,opt,name=challenge_id,json=challengeId,proto3" json:"challenge_id,omitempty"` + EpochId uint64 `protobuf:"varint,2,opt,name=epoch_id,json=epochId,proto3" json:"epoch_id,omitempty"` + RecipientId string 
`protobuf:"bytes,3,opt,name=recipient_id,json=recipientId,proto3" json:"recipient_id,omitempty"` + Stored bool `protobuf:"varint,4,opt,name=stored,proto3" json:"stored,omitempty"` + Error string `protobuf:"bytes,5,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *CommitSelfHealingResponse) Reset() { + *x = CommitSelfHealingResponse{} + mi := &file_supernode_self_healing_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CommitSelfHealingResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CommitSelfHealingResponse) ProtoMessage() {} + +func (x *CommitSelfHealingResponse) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CommitSelfHealingResponse.ProtoReflect.Descriptor instead. 
+func (*CommitSelfHealingResponse) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{5} +} + +func (x *CommitSelfHealingResponse) GetChallengeId() string { + if x != nil { + return x.ChallengeId + } + return "" +} + +func (x *CommitSelfHealingResponse) GetEpochId() uint64 { + if x != nil { + return x.EpochId + } + return 0 +} + +func (x *CommitSelfHealingResponse) GetRecipientId() string { + if x != nil { + return x.RecipientId + } + return "" +} + +func (x *CommitSelfHealingResponse) GetStored() bool { + if x != nil { + return x.Stored + } + return false +} + +func (x *CommitSelfHealingResponse) GetError() string { + if x != nil { + return x.Error + } + return "" +} + +var File_supernode_self_healing_proto protoreflect.FileDescriptor + +var file_supernode_self_healing_proto_rawDesc = []byte{ + 0x0a, 0x1c, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2f, 0x73, 0x65, 0x6c, 0x66, + 0x5f, 0x68, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x09, + 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x22, 0xfc, 0x01, 0x0a, 0x19, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, + 0x65, 0x6e, 0x67, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, + 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x70, + 0x6f, 0x63, 0x68, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x65, 0x70, + 0x6f, 0x63, 0x68, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x6b, 0x65, + 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x66, 0x69, 0x6c, 0x65, 0x4b, 0x65, 0x79, + 0x12, 0x23, 0x0a, 0x0d, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x69, + 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x68, 
0x61, 0x6c, 0x6c, 0x65, 0x6e, + 0x67, 0x65, 0x72, 0x49, 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, + 0x6e, 0x74, 0x5f, 0x69, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x72, 0x65, 0x63, + 0x69, 0x70, 0x69, 0x65, 0x6e, 0x74, 0x49, 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x6f, 0x62, 0x73, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0b, + 0x6f, 0x62, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x49, 0x64, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x61, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x69, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, + 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x64, 0x22, 0x9e, 0x02, 0x0a, 0x1a, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, + 0x65, 0x6e, 0x67, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, + 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x70, + 0x6f, 0x63, 0x68, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x65, 0x70, + 0x6f, 0x63, 0x68, 0x49, 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, + 0x6e, 0x74, 0x5f, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x72, 0x65, 0x63, + 0x69, 0x70, 0x69, 0x65, 0x6e, 0x74, 0x49, 0x64, 0x12, 0x1a, 0x0a, 0x08, 0x61, 0x63, 0x63, 0x65, + 0x70, 0x74, 0x65, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x52, 0x08, 0x61, 0x63, 0x63, 0x65, + 0x70, 0x74, 0x65, 0x64, 0x12, 0x37, 0x0a, 0x17, 0x72, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, + 0x75, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x18, + 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x16, 0x72, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x12, 0x34, 0x0a, + 0x16, 
0x72, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x68, + 0x61, 0x73, 0x68, 0x5f, 0x68, 0x65, 0x78, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x14, 0x72, + 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x65, 0x64, 0x48, 0x61, 0x73, 0x68, + 0x48, 0x65, 0x78, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x07, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x22, 0x8a, 0x02, 0x0a, 0x18, 0x56, 0x65, + 0x72, 0x69, 0x66, 0x79, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, + 0x6e, 0x67, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x68, + 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x70, 0x6f, + 0x63, 0x68, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x65, 0x70, 0x6f, + 0x63, 0x68, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x6b, 0x65, 0x79, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x66, 0x69, 0x6c, 0x65, 0x4b, 0x65, 0x79, 0x12, + 0x21, 0x0a, 0x0c, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x69, 0x64, 0x18, + 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, 0x6e, 0x74, + 0x49, 0x64, 0x12, 0x34, 0x0a, 0x16, 0x72, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, + 0x74, 0x65, 0x64, 0x5f, 0x68, 0x61, 0x73, 0x68, 0x5f, 0x68, 0x65, 0x78, 0x18, 0x05, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x14, 0x72, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x65, + 0x64, 0x48, 0x61, 0x73, 0x68, 0x48, 0x65, 0x78, 0x12, 0x1f, 0x0a, 0x0b, 0x6f, 0x62, 0x73, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x6f, + 0x62, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x49, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x61, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x5f, 0x69, 0x64, 0x18, 
0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x61, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x64, 0x22, 0xa0, 0x01, 0x0a, 0x19, 0x56, 0x65, 0x72, 0x69, 0x66, + 0x79, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, + 0x65, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x68, 0x61, 0x6c, + 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x70, 0x6f, 0x63, 0x68, + 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x65, 0x70, 0x6f, 0x63, 0x68, + 0x49, 0x64, 0x12, 0x1f, 0x0a, 0x0b, 0x6f, 0x62, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x69, + 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x6f, 0x62, 0x73, 0x65, 0x72, 0x76, 0x65, + 0x72, 0x49, 0x64, 0x12, 0x0e, 0x0a, 0x02, 0x6f, 0x6b, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x02, 0x6f, 0x6b, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x05, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x22, 0xd8, 0x01, 0x0a, 0x18, 0x43, 0x6f, + 0x6d, 0x6d, 0x69, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, + 0x6e, 0x67, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x68, + 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x70, 0x6f, + 0x63, 0x68, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x65, 0x70, 0x6f, + 0x63, 0x68, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x6b, 0x65, 0x79, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x66, 0x69, 0x6c, 0x65, 0x4b, 0x65, 0x79, 0x12, + 0x1b, 0x0a, 0x09, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x08, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x64, 0x12, 
0x23, 0x0a, 0x0d, + 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x05, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x72, 0x49, + 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x69, + 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, + 0x6e, 0x74, 0x49, 0x64, 0x22, 0xaa, 0x01, 0x0a, 0x19, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x53, + 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, + 0x73, 0x65, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, 0x6e, 0x67, 0x65, 0x5f, + 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x68, 0x61, 0x6c, 0x6c, 0x65, + 0x6e, 0x67, 0x65, 0x49, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x70, 0x6f, 0x63, 0x68, 0x5f, 0x69, + 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x65, 0x70, 0x6f, 0x63, 0x68, 0x49, 0x64, + 0x12, 0x21, 0x0a, 0x0c, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x69, 0x64, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x72, 0x65, 0x63, 0x69, 0x70, 0x69, 0x65, 0x6e, + 0x74, 0x49, 0x64, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, 0x18, 0x04, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x06, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x65, + 0x72, 0x72, 0x6f, 0x72, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, + 0x72, 0x32, 0xbd, 0x02, 0x0a, 0x12, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, + 0x67, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x63, 0x0a, 0x12, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x12, 0x24, + 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x52, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 
0x74, 0x1a, 0x25, 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, + 0x2e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, + 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x60, 0x0a, + 0x11, 0x56, 0x65, 0x72, 0x69, 0x66, 0x79, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, + 0x6e, 0x67, 0x12, 0x23, 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x56, + 0x65, 0x72, 0x69, 0x66, 0x79, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, + 0x6f, 0x64, 0x65, 0x2e, 0x56, 0x65, 0x72, 0x69, 0x66, 0x79, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, + 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, + 0x60, 0x0a, 0x11, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, + 0x6c, 0x69, 0x6e, 0x67, 0x12, 0x23, 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, + 0x2e, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, + 0x6e, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x73, 0x75, 0x70, 0x65, + 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x53, 0x65, 0x6c, 0x66, + 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, + 0x00, 0x42, 0x36, 0x5a, 0x34, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, + 0x4c, 0x75, 0x6d, 0x65, 0x72, 0x61, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2f, 0x73, + 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2f, 0x76, 0x32, 0x2f, 0x67, 0x65, 0x6e, 0x2f, + 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x33, +} + +var ( + file_supernode_self_healing_proto_rawDescOnce sync.Once + file_supernode_self_healing_proto_rawDescData = 
file_supernode_self_healing_proto_rawDesc +) + +func file_supernode_self_healing_proto_rawDescGZIP() []byte { + file_supernode_self_healing_proto_rawDescOnce.Do(func() { + file_supernode_self_healing_proto_rawDescData = protoimpl.X.CompressGZIP(file_supernode_self_healing_proto_rawDescData) + }) + return file_supernode_self_healing_proto_rawDescData +} + +var file_supernode_self_healing_proto_msgTypes = make([]protoimpl.MessageInfo, 6) +var file_supernode_self_healing_proto_goTypes = []any{ + (*RequestSelfHealingRequest)(nil), // 0: supernode.RequestSelfHealingRequest + (*RequestSelfHealingResponse)(nil), // 1: supernode.RequestSelfHealingResponse + (*VerifySelfHealingRequest)(nil), // 2: supernode.VerifySelfHealingRequest + (*VerifySelfHealingResponse)(nil), // 3: supernode.VerifySelfHealingResponse + (*CommitSelfHealingRequest)(nil), // 4: supernode.CommitSelfHealingRequest + (*CommitSelfHealingResponse)(nil), // 5: supernode.CommitSelfHealingResponse +} +var file_supernode_self_healing_proto_depIdxs = []int32{ + 0, // 0: supernode.SelfHealingService.RequestSelfHealing:input_type -> supernode.RequestSelfHealingRequest + 2, // 1: supernode.SelfHealingService.VerifySelfHealing:input_type -> supernode.VerifySelfHealingRequest + 4, // 2: supernode.SelfHealingService.CommitSelfHealing:input_type -> supernode.CommitSelfHealingRequest + 1, // 3: supernode.SelfHealingService.RequestSelfHealing:output_type -> supernode.RequestSelfHealingResponse + 3, // 4: supernode.SelfHealingService.VerifySelfHealing:output_type -> supernode.VerifySelfHealingResponse + 5, // 5: supernode.SelfHealingService.CommitSelfHealing:output_type -> supernode.CommitSelfHealingResponse + 3, // [3:6] is the sub-list for method output_type + 0, // [0:3] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { 
file_supernode_self_healing_proto_init() } +func file_supernode_self_healing_proto_init() { + if File_supernode_self_healing_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_supernode_self_healing_proto_rawDesc, + NumEnums: 0, + NumMessages: 6, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_supernode_self_healing_proto_goTypes, + DependencyIndexes: file_supernode_self_healing_proto_depIdxs, + MessageInfos: file_supernode_self_healing_proto_msgTypes, + }.Build() + File_supernode_self_healing_proto = out.File + file_supernode_self_healing_proto_rawDesc = nil + file_supernode_self_healing_proto_goTypes = nil + file_supernode_self_healing_proto_depIdxs = nil +} diff --git a/gen/supernode/self_healing.swagger.json b/gen/supernode/self_healing.swagger.json new file mode 100644 index 00000000..565c2147 --- /dev/null +++ b/gen/supernode/self_healing.swagger.json @@ -0,0 +1,118 @@ +{ + "swagger": "2.0", + "info": { + "title": "supernode/self_healing.proto", + "version": "version not set" + }, + "tags": [ + { + "name": "SelfHealingService" + } + ], + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "paths": {}, + "definitions": { + "protobufAny": { + "type": "object", + "properties": { + "@type": { + "type": "string" + } + }, + "additionalProperties": {} + }, + "rpcStatus": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "format": "int32" + }, + "message": { + "type": "string" + }, + "details": { + "type": "array", + "items": { + "type": "object", + "$ref": "#/definitions/protobufAny" + } + } + } + }, + "supernodeCommitSelfHealingResponse": { + "type": "object", + "properties": { + "challengeId": { + "type": "string" + }, + "epochId": { + "type": "string", + "format": "uint64" + }, + "recipientId": { + "type": "string" + }, + "stored": { + "type": "boolean" + }, + "error": { 
+ "type": "string" + } + } + }, + "supernodeRequestSelfHealingResponse": { + "type": "object", + "properties": { + "challengeId": { + "type": "string" + }, + "epochId": { + "type": "string", + "format": "uint64" + }, + "recipientId": { + "type": "string" + }, + "accepted": { + "type": "boolean" + }, + "reconstructionRequired": { + "type": "boolean" + }, + "reconstructedHashHex": { + "type": "string" + }, + "error": { + "type": "string" + } + } + }, + "supernodeVerifySelfHealingResponse": { + "type": "object", + "properties": { + "challengeId": { + "type": "string" + }, + "epochId": { + "type": "string", + "format": "uint64" + }, + "observerId": { + "type": "string" + }, + "ok": { + "type": "boolean" + }, + "error": { + "type": "string" + } + } + } + } +} diff --git a/gen/supernode/self_healing_grpc.pb.go b/gen/supernode/self_healing_grpc.pb.go new file mode 100644 index 00000000..751b51a8 --- /dev/null +++ b/gen/supernode/self_healing_grpc.pb.go @@ -0,0 +1,203 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.5.1 +// - protoc v3.12.4 +// source: supernode/self_healing.proto + +package supernode + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.64.0 or later. +const _ = grpc.SupportPackageIsVersion9 + +const ( + SelfHealingService_RequestSelfHealing_FullMethodName = "/supernode.SelfHealingService/RequestSelfHealing" + SelfHealingService_VerifySelfHealing_FullMethodName = "/supernode.SelfHealingService/VerifySelfHealing" + SelfHealingService_CommitSelfHealing_FullMethodName = "/supernode.SelfHealingService/CommitSelfHealing" +) + +// SelfHealingServiceClient is the client API for SelfHealingService service. 
+// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +// +// SelfHealingService exposes the minimal control-plane RPCs for deterministic +// expected-owner healing challenges. +type SelfHealingServiceClient interface { + RequestSelfHealing(ctx context.Context, in *RequestSelfHealingRequest, opts ...grpc.CallOption) (*RequestSelfHealingResponse, error) + VerifySelfHealing(ctx context.Context, in *VerifySelfHealingRequest, opts ...grpc.CallOption) (*VerifySelfHealingResponse, error) + CommitSelfHealing(ctx context.Context, in *CommitSelfHealingRequest, opts ...grpc.CallOption) (*CommitSelfHealingResponse, error) +} + +type selfHealingServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewSelfHealingServiceClient(cc grpc.ClientConnInterface) SelfHealingServiceClient { + return &selfHealingServiceClient{cc} +} + +func (c *selfHealingServiceClient) RequestSelfHealing(ctx context.Context, in *RequestSelfHealingRequest, opts ...grpc.CallOption) (*RequestSelfHealingResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(RequestSelfHealingResponse) + err := c.cc.Invoke(ctx, SelfHealingService_RequestSelfHealing_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *selfHealingServiceClient) VerifySelfHealing(ctx context.Context, in *VerifySelfHealingRequest, opts ...grpc.CallOption) (*VerifySelfHealingResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(VerifySelfHealingResponse) + err := c.cc.Invoke(ctx, SelfHealingService_VerifySelfHealing_FullMethodName, in, out, cOpts...) 
+ if err != nil { + return nil, err + } + return out, nil +} + +func (c *selfHealingServiceClient) CommitSelfHealing(ctx context.Context, in *CommitSelfHealingRequest, opts ...grpc.CallOption) (*CommitSelfHealingResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(CommitSelfHealingResponse) + err := c.cc.Invoke(ctx, SelfHealingService_CommitSelfHealing_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +// SelfHealingServiceServer is the server API for SelfHealingService service. +// All implementations must embed UnimplementedSelfHealingServiceServer +// for forward compatibility. +// +// SelfHealingService exposes the minimal control-plane RPCs for deterministic +// expected-owner healing challenges. +type SelfHealingServiceServer interface { + RequestSelfHealing(context.Context, *RequestSelfHealingRequest) (*RequestSelfHealingResponse, error) + VerifySelfHealing(context.Context, *VerifySelfHealingRequest) (*VerifySelfHealingResponse, error) + CommitSelfHealing(context.Context, *CommitSelfHealingRequest) (*CommitSelfHealingResponse, error) + mustEmbedUnimplementedSelfHealingServiceServer() +} + +// UnimplementedSelfHealingServiceServer must be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. 
+type UnimplementedSelfHealingServiceServer struct{}

+func (UnimplementedSelfHealingServiceServer) RequestSelfHealing(context.Context, *RequestSelfHealingRequest) (*RequestSelfHealingResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method RequestSelfHealing not implemented")
+}
+func (UnimplementedSelfHealingServiceServer) VerifySelfHealing(context.Context, *VerifySelfHealingRequest) (*VerifySelfHealingResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method VerifySelfHealing not implemented")
+}
+func (UnimplementedSelfHealingServiceServer) CommitSelfHealing(context.Context, *CommitSelfHealingRequest) (*CommitSelfHealingResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method CommitSelfHealing not implemented")
+}
+func (UnimplementedSelfHealingServiceServer) mustEmbedUnimplementedSelfHealingServiceServer() {}
+func (UnimplementedSelfHealingServiceServer) testEmbeddedByValue() {}

+// UnsafeSelfHealingServiceServer may be embedded to opt out of forward compatibility for this service.
+// Use of this interface is not recommended, as added methods to SelfHealingServiceServer will
+// result in compilation errors.
+type UnsafeSelfHealingServiceServer interface {
+	mustEmbedUnimplementedSelfHealingServiceServer()
+}

+func RegisterSelfHealingServiceServer(s grpc.ServiceRegistrar, srv SelfHealingServiceServer) {
+	// If the following call panics, it indicates UnimplementedSelfHealingServiceServer was
+	// embedded by pointer and is nil. This will cause panics if an
+	// unimplemented method is ever invoked, so we test this at initialization
+	// time to prevent it from happening at runtime later due to I/O.
+ if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&SelfHealingService_ServiceDesc, srv) +} + +func _SelfHealingService_RequestSelfHealing_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(RequestSelfHealingRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(SelfHealingServiceServer).RequestSelfHealing(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: SelfHealingService_RequestSelfHealing_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(SelfHealingServiceServer).RequestSelfHealing(ctx, req.(*RequestSelfHealingRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _SelfHealingService_VerifySelfHealing_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(VerifySelfHealingRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(SelfHealingServiceServer).VerifySelfHealing(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: SelfHealingService_VerifySelfHealing_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(SelfHealingServiceServer).VerifySelfHealing(ctx, req.(*VerifySelfHealingRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _SelfHealingService_CommitSelfHealing_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(CommitSelfHealingRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(SelfHealingServiceServer).CommitSelfHealing(ctx, in) + } + info := 
&grpc.UnaryServerInfo{ + Server: srv, + FullMethod: SelfHealingService_CommitSelfHealing_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(SelfHealingServiceServer).CommitSelfHealing(ctx, req.(*CommitSelfHealingRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// SelfHealingService_ServiceDesc is the grpc.ServiceDesc for SelfHealingService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var SelfHealingService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "supernode.SelfHealingService", + HandlerType: (*SelfHealingServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "RequestSelfHealing", + Handler: _SelfHealingService_RequestSelfHealing_Handler, + }, + { + MethodName: "VerifySelfHealing", + Handler: _SelfHealingService_VerifySelfHealing_Handler, + }, + { + MethodName: "CommitSelfHealing", + Handler: _SelfHealingService_CommitSelfHealing_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "supernode/self_healing.proto", +} diff --git a/pkg/lumera/modules/action/action_mock.go b/pkg/lumera/modules/action/action_mock.go index a4524fa3..6164139e 100644 --- a/pkg/lumera/modules/action/action_mock.go +++ b/pkg/lumera/modules/action/action_mock.go @@ -85,3 +85,18 @@ func (mr *MockModuleMockRecorder) GetParams(ctx any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetParams", reflect.TypeOf((*MockModule)(nil).GetParams), ctx) } + +// ListActions mocks base method. +func (m *MockModule) ListActions(ctx context.Context, req *types.QueryListActionsRequest) (*types.QueryListActionsResponse, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ListActions", ctx, req) + ret0, _ := ret[0].(*types.QueryListActionsResponse) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ListActions indicates an expected call of ListActions. 
+func (mr *MockModuleMockRecorder) ListActions(ctx, req any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListActions", reflect.TypeOf((*MockModule)(nil).ListActions), ctx, req) +} diff --git a/pkg/lumera/modules/action/impl.go b/pkg/lumera/modules/action/impl.go index ccf4dfea..7272c54f 100644 --- a/pkg/lumera/modules/action/impl.go +++ b/pkg/lumera/modules/action/impl.go @@ -57,3 +57,15 @@ func (m *module) GetParams(ctx context.Context) (*types.QueryParamsResponse, err return resp, nil } + +// ListActions fetches actions with optional filters/pagination. +func (m *module) ListActions(ctx context.Context, req *types.QueryListActionsRequest) (*types.QueryListActionsResponse, error) { + if req == nil { + req = &types.QueryListActionsRequest{} + } + resp, err := m.client.ListActions(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to list actions: %w", err) + } + return resp, nil +} diff --git a/pkg/lumera/modules/action/interface.go b/pkg/lumera/modules/action/interface.go index 585c7bf8..69128df0 100644 --- a/pkg/lumera/modules/action/interface.go +++ b/pkg/lumera/modules/action/interface.go @@ -13,6 +13,7 @@ type Module interface { GetAction(ctx context.Context, actionID string) (*types.QueryGetActionResponse, error) GetActionFee(ctx context.Context, dataSize string) (*types.QueryGetActionFeeResponse, error) GetParams(ctx context.Context) (*types.QueryParamsResponse, error) + ListActions(ctx context.Context, req *types.QueryListActionsRequest) (*types.QueryListActionsResponse, error) } // NewModule creates a new Action module client diff --git a/pkg/lumera/modules/audit/impl.go b/pkg/lumera/modules/audit/impl.go index 06b26908..d1e1546b 100644 --- a/pkg/lumera/modules/audit/impl.go +++ b/pkg/lumera/modules/audit/impl.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/LumeraProtocol/lumera/x/audit/v1/types" + querytypes "github.com/cosmos/cosmos-sdk/types/query" "google.golang.org/grpc" ) @@ -73,3 +74,40 
@@ func (m *module) GetEpochReport(ctx context.Context, epochID uint64, supernodeAc } return resp, nil } + +func (m *module) GetStorageChallengeReports(ctx context.Context, supernodeAccount string, epochID uint64) (*types.QueryStorageChallengeReportsResponse, error) { + page := &querytypes.PageRequest{Limit: 1000} + all := make([]types.StorageChallengeReport, 0) + var lastPagination *querytypes.PageResponse + + for { + resp, err := m.client.StorageChallengeReports(ctx, &types.QueryStorageChallengeReportsRequest{ + SupernodeAccount: supernodeAccount, + EpochId: epochID, + FilterByEpochId: true, + Pagination: page, + }) + if err != nil { + return nil, fmt.Errorf("failed to get storage challenge reports: %w", err) + } + if resp == nil { + break + } + + all = append(all, resp.Reports...) + lastPagination = resp.Pagination + if resp.Pagination == nil || len(resp.Pagination.NextKey) == 0 { + break + } + + page = &querytypes.PageRequest{ + Key: resp.Pagination.NextKey, + Limit: 1000, + } + } + + return &types.QueryStorageChallengeReportsResponse{ + Reports: all, + Pagination: lastPagination, + }, nil +} diff --git a/pkg/lumera/modules/audit/interface.go b/pkg/lumera/modules/audit/interface.go index a5ac2939..f9ad0ac4 100644 --- a/pkg/lumera/modules/audit/interface.go +++ b/pkg/lumera/modules/audit/interface.go @@ -15,6 +15,7 @@ type Module interface { GetCurrentEpoch(ctx context.Context) (*types.QueryCurrentEpochResponse, error) GetAssignedTargets(ctx context.Context, supernodeAccount string, epochID uint64) (*types.QueryAssignedTargetsResponse, error) GetEpochReport(ctx context.Context, epochID uint64, supernodeAccount string) (*types.QueryEpochReportResponse, error) + GetStorageChallengeReports(ctx context.Context, supernodeAccount string, epochID uint64) (*types.QueryStorageChallengeReportsResponse, error) } // NewModule creates a new Audit module client. 
diff --git a/pkg/storage/queries/self_healing.go b/pkg/storage/queries/self_healing.go index 61e7c63c..08b64bc7 100644 --- a/pkg/storage/queries/self_healing.go +++ b/pkg/storage/queries/self_healing.go @@ -2,12 +2,15 @@ package queries import ( "context" + "database/sql" "fmt" + "strings" "time" "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" "github.com/LumeraProtocol/supernode/v2/pkg/types" "github.com/LumeraProtocol/supernode/v2/pkg/utils/metrics" + "github.com/jmoiron/sqlx" json "github.com/json-iterator/go" ) @@ -15,6 +18,11 @@ type SelfHealingQueries interface { BatchInsertSelfHealingChallengeEvents(ctx context.Context, event []types.SelfHealingChallengeEvent) error UpdateSHChallengeEventProcessed(challengeID string, isProcessed bool) error GetSelfHealingChallengeEvents() ([]types.SelfHealingChallengeEvent, error) + GetSelfHealingChallengeEvent(challengeID string) (*types.SelfHealingChallengeEvent, error) + ClaimPendingSelfHealingChallengeEvents(ctx context.Context, owner string, leaseFor time.Duration, limit int) ([]types.SelfHealingChallengeEvent, error) + MarkSelfHealingChallengeEventCompleted(challengeID string, owner string) error + MarkSelfHealingChallengeEventRetry(challengeID string, owner string, reason string, retryAfter time.Duration) error + MarkSelfHealingChallengeEventTerminal(challengeID string, owner string, reason string) error CleanupSelfHealingChallenges() (err error) QuerySelfHealingChallenges() (challenges []types.SelfHealingChallenge, err error) @@ -561,8 +569,8 @@ func (s *SQLiteStore) BatchInsertSelfHealingChallengeEvents(ctx context.Context, stmt, err := tx.Prepare(` INSERT OR IGNORE INTO self_healing_challenge_events - (trigger_id, ticket_id, challenge_id, data, sender_id, is_processed, created_at, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) + (trigger_id, ticket_id, challenge_id, data, sender_id, is_processed, status, attempt_count, next_retry_at, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
`) if err != nil { tx.Rollback() @@ -583,7 +591,7 @@ func (s *SQLiteStore) BatchInsertSelfHealingChallengeEvents(ctx context.Context, for _, event := range eventsBatch { now := time.Now().UTC() - _, err = stmt.Exec(event.TriggerID, event.TicketID, event.ChallengeID, event.Data, event.SenderID, false, now, now) + _, err = stmt.Exec(event.TriggerID, event.TicketID, event.ChallengeID, event.Data, event.SenderID, false, "pending", 0, now, now, now) if err != nil { tx.Rollback() return err @@ -602,7 +610,7 @@ func (s *SQLiteStore) BatchInsertSelfHealingChallengeEvents(ctx context.Context, // GetSelfHealingChallengeEvents retrieves the challenge events from DB func (s *SQLiteStore) GetSelfHealingChallengeEvents() ([]types.SelfHealingChallengeEvent, error) { const selectQuery = ` - SELECT trigger_id, ticket_id, challenge_id, data, sender_id, is_processed, created_at, updated_at + SELECT trigger_id, ticket_id, challenge_id, data, sender_id, is_processed, status, attempt_count, lease_owner, lease_expires_at, next_retry_at, last_error, created_at, updated_at FROM self_healing_challenge_events WHERE is_processed = false ` @@ -618,6 +626,7 @@ func (s *SQLiteStore) GetSelfHealingChallengeEvents() ([]types.SelfHealingChalle var event types.SelfHealingChallengeEvent if err := rows.Scan( &event.TriggerID, &event.TicketID, &event.ChallengeID, &event.Data, &event.SenderID, &event.IsProcessed, + &event.Status, &event.AttemptCount, &event.LeaseOwner, &event.LeaseExpiresAt, &event.NextRetryAt, &event.LastError, &event.CreatedAt, &event.UpdatedAt, ); err != nil { return nil, err @@ -629,6 +638,249 @@ func (s *SQLiteStore) GetSelfHealingChallengeEvents() ([]types.SelfHealingChalle return events, nil } +// GetSelfHealingChallengeEvent retrieves a challenge event by challenge id. 
+func (s *SQLiteStore) GetSelfHealingChallengeEvent(challengeID string) (*types.SelfHealingChallengeEvent, error) { + const selectQuery = ` + SELECT trigger_id, ticket_id, challenge_id, data, sender_id, is_processed, status, attempt_count, lease_owner, lease_expires_at, next_retry_at, last_error, created_at, updated_at + FROM self_healing_challenge_events + WHERE challenge_id = ? + ` + + var event types.SelfHealingChallengeEvent + err := s.db.QueryRow(selectQuery, challengeID).Scan( + &event.TriggerID, &event.TicketID, &event.ChallengeID, &event.Data, &event.SenderID, &event.IsProcessed, + &event.Status, &event.AttemptCount, &event.LeaseOwner, &event.LeaseExpiresAt, &event.NextRetryAt, &event.LastError, + &event.CreatedAt, &event.UpdatedAt, + ) + if err != nil { + return nil, err + } + return &event, nil +} + +// ClaimPendingSelfHealingChallengeEvents atomically claims pending/retry events for processing. +func (s *SQLiteStore) ClaimPendingSelfHealingChallengeEvents(ctx context.Context, owner string, leaseFor time.Duration, limit int) ([]types.SelfHealingChallengeEvent, error) { + owner = strings.TrimSpace(owner) + if owner == "" { + return nil, fmt.Errorf("owner is required") + } + if limit <= 0 { + limit = 1 + } + if leaseFor <= 0 { + leaseFor = 2 * time.Minute + } + now := time.Now().UTC() + leaseUntil := now.Add(leaseFor) + tx, err := s.db.BeginTxx(ctx, nil) + if err != nil { + return nil, err + } + committed := false + defer func() { + if !committed { + _ = tx.Rollback() + } + }() + + const candidateQuery = ` + SELECT challenge_id + FROM self_healing_challenge_events + WHERE is_processed = false + AND (status = 'pending' OR status = 'retry' OR (status = 'processing' AND (lease_expires_at IS NULL OR lease_expires_at <= ?))) + AND (next_retry_at IS NULL OR next_retry_at <= ?) + ORDER BY created_at ASC + LIMIT ? 
+ ` + + rows, err := tx.QueryContext(ctx, candidateQuery, now, now, limit) + if err != nil { + return nil, err + } + candidateIDs := make([]string, 0, limit) + for rows.Next() { + var challengeID string + if err := rows.Scan(&challengeID); err != nil { + _ = rows.Close() + return nil, err + } + candidateIDs = append(candidateIDs, challengeID) + } + if err := rows.Err(); err != nil { + _ = rows.Close() + return nil, err + } + if err := rows.Close(); err != nil { + return nil, err + } + + claimedIDs := make([]string, 0, len(candidateIDs)) + for _, challengeID := range candidateIDs { + res, err := tx.ExecContext(ctx, ` + UPDATE self_healing_challenge_events + SET status = 'processing', + lease_owner = ?, + lease_expires_at = ?, + attempt_count = attempt_count + 1, + updated_at = ? + WHERE challenge_id = ? + AND is_processed = false + AND (status = 'pending' OR status = 'retry' OR (status = 'processing' AND (lease_expires_at IS NULL OR lease_expires_at <= ?))) + AND (next_retry_at IS NULL OR next_retry_at <= ?) + `, owner, leaseUntil, now, challengeID, now, now) + if err != nil { + return nil, err + } + + affected, err := res.RowsAffected() + if err != nil { + return nil, err + } + if affected == 0 { + continue + } + claimedIDs = append(claimedIDs, challengeID) + } + + if len(claimedIDs) == 0 { + if err := tx.Commit(); err != nil { + return nil, err + } + committed = true + return []types.SelfHealingChallengeEvent{}, nil + } + + queryWithIn, args, err := sqlx.In(` + SELECT trigger_id, ticket_id, challenge_id, data, sender_id, is_processed, status, attempt_count, lease_owner, lease_expires_at, next_retry_at, last_error, created_at, updated_at + FROM self_healing_challenge_events + WHERE challenge_id IN (?) + ORDER BY created_at ASC + `, claimedIDs) + if err != nil { + return nil, err + } + queryWithIn = tx.Rebind(queryWithIn) + claimedRows, err := tx.QueryxContext(ctx, queryWithIn, args...) 
+ if err != nil { + return nil, err + } + defer claimedRows.Close() + + claimed := make([]types.SelfHealingChallengeEvent, 0, len(claimedIDs)) + for claimedRows.Next() { + var event types.SelfHealingChallengeEvent + if err := claimedRows.Scan( + &event.TriggerID, &event.TicketID, &event.ChallengeID, &event.Data, &event.SenderID, &event.IsProcessed, + &event.Status, &event.AttemptCount, &event.LeaseOwner, &event.LeaseExpiresAt, &event.NextRetryAt, &event.LastError, + &event.CreatedAt, &event.UpdatedAt, + ); err != nil { + return nil, err + } + claimed = append(claimed, event) + } + if err := claimedRows.Err(); err != nil { + return nil, err + } + + if err := tx.Commit(); err != nil { + return nil, err + } + committed = true + + return claimed, nil +} + +// MarkSelfHealingChallengeEventCompleted marks a claimed event as completed. +func (s *SQLiteStore) MarkSelfHealingChallengeEventCompleted(challengeID string, owner string) error { + now := time.Now().UTC() + res, err := s.db.Exec(` + UPDATE self_healing_challenge_events + SET status = 'completed', + is_processed = true, + lease_owner = NULL, + lease_expires_at = NULL, + next_retry_at = NULL, + last_error = NULL, + updated_at = ? + WHERE challenge_id = ? + AND status = 'processing' + AND lease_owner = ? + `, now, challengeID, owner) + if err != nil { + return err + } + affected, err := res.RowsAffected() + if err != nil { + return err + } + if affected == 0 { + return sql.ErrNoRows + } + return nil +} + +// MarkSelfHealingChallengeEventRetry releases a claimed event with retry metadata. 
+func (s *SQLiteStore) MarkSelfHealingChallengeEventRetry(challengeID string, owner string, reason string, retryAfter time.Duration) error { + if retryAfter < 0 { + retryAfter = 0 + } + now := time.Now().UTC() + next := now.Add(retryAfter) + res, err := s.db.Exec(` + UPDATE self_healing_challenge_events + SET status = 'retry', + is_processed = false, + lease_owner = NULL, + lease_expires_at = NULL, + next_retry_at = ?, + last_error = ?, + updated_at = ? + WHERE challenge_id = ? + AND status = 'processing' + AND lease_owner = ? + `, next, reason, now, challengeID, owner) + if err != nil { + return err + } + affected, err := res.RowsAffected() + if err != nil { + return err + } + if affected == 0 { + return sql.ErrNoRows + } + return nil +} + +// MarkSelfHealingChallengeEventTerminal marks a claimed event as terminal. +func (s *SQLiteStore) MarkSelfHealingChallengeEventTerminal(challengeID string, owner string, reason string) error { + now := time.Now().UTC() + res, err := s.db.Exec(` + UPDATE self_healing_challenge_events + SET status = 'terminal', + is_processed = true, + lease_owner = NULL, + lease_expires_at = NULL, + next_retry_at = NULL, + last_error = ?, + updated_at = ? + WHERE challenge_id = ? + AND status = 'processing' + AND lease_owner = ? 
+ `, reason, now, challengeID, owner) + if err != nil { + return err + } + affected, err := res.RowsAffected() + if err != nil { + return err + } + if affected == 0 { + return sql.ErrNoRows + } + return nil +} + // UpdateSHChallengeEventProcessed updates the is_processed flag of an event func (s *SQLiteStore) UpdateSHChallengeEventProcessed(challengeID string, isProcessed bool) error { const updateQuery = ` diff --git a/pkg/storage/queries/sqlite.go b/pkg/storage/queries/sqlite.go index dfdd90cd..0f1f36c1 100644 --- a/pkg/storage/queries/sqlite.go +++ b/pkg/storage/queries/sqlite.go @@ -124,6 +124,12 @@ const createSelfHealingChallengeTickets string = ` data BLOB NOT NULL, sender_id TEXT NOT NULL, is_processed BOOLEAN NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + attempt_count INTEGER NOT NULL DEFAULT 0, + lease_owner TEXT, + lease_expires_at DATETIME NULL, + next_retry_at DATETIME NULL, + last_error TEXT, created_at DATETIME NOT NULL, updated_at DATETIME NOT NULL ); @@ -134,7 +140,19 @@ CREATE UNIQUE INDEX IF NOT EXISTS self_healing_challenge_events_unique ON self_h ` const createSelfHealingExecutionMetricsUniqueIndex string = ` -CREATE UNIQUE INDEX IF NOT EXISTS self_healing_execution_metrics_unique ON self_healing_execution_metrics(trigger_id, challenge_id, message_type); +CREATE UNIQUE INDEX IF NOT EXISTS self_healing_execution_metrics_unique ON self_healing_execution_metrics(trigger_id, challenge_id, message_type, sender_id); +` + +const createSelfHealingChallengeEventsStatusIndex string = ` +CREATE INDEX IF NOT EXISTS self_healing_challenge_events_status_next_retry_idx ON self_healing_challenge_events(status, next_retry_at); +` + +const createSelfHealingChallengeEventsLeaseIndex string = ` +CREATE INDEX IF NOT EXISTS self_healing_challenge_events_lease_expires_idx ON self_healing_challenge_events(lease_expires_at); +` + +const dropSelfHealingExecutionMetricsUniqueIndex string = ` +DROP INDEX IF EXISTS self_healing_execution_metrics_unique; ` const 
alterTablePingHistory = `ALTER TABLE ping_history @@ -202,6 +220,24 @@ CREATE UNIQUE INDEX IF NOT EXISTS healthcheck_challenge_metrics_unique ON health const alterTablePingHistoryHealthCheckColumn = `ALTER TABLE ping_history ADD COLUMN health_check_metrics_last_broadcast_at DATETIME NULL;` +const alterSelfHealingChallengeEventsStatus = `ALTER TABLE self_healing_challenge_events +ADD COLUMN status TEXT NOT NULL DEFAULT 'pending';` + +const alterSelfHealingChallengeEventsAttemptCount = `ALTER TABLE self_healing_challenge_events +ADD COLUMN attempt_count INTEGER NOT NULL DEFAULT 0;` + +const alterSelfHealingChallengeEventsLeaseOwner = `ALTER TABLE self_healing_challenge_events +ADD COLUMN lease_owner TEXT;` + +const alterSelfHealingChallengeEventsLeaseExpiresAt = `ALTER TABLE self_healing_challenge_events +ADD COLUMN lease_expires_at DATETIME NULL;` + +const alterSelfHealingChallengeEventsNextRetryAt = `ALTER TABLE self_healing_challenge_events +ADD COLUMN next_retry_at DATETIME NULL;` + +const alterSelfHealingChallengeEventsLastError = `ALTER TABLE self_healing_challenge_events +ADD COLUMN last_error TEXT;` + const createPingHistoryWithoutUniqueIPAddress string = ` BEGIN TRANSACTION; @@ -352,6 +388,10 @@ func OpenHistoryDB() (LocalStoreInterface, error) { return nil, fmt.Errorf("cannot create table(s): %w", err) } + if _, err := db.Exec(dropSelfHealingExecutionMetricsUniqueIndex); err != nil { + return nil, fmt.Errorf("cannot execute migration: %w", err) + } + if _, err := db.Exec(createSelfHealingExecutionMetricsUniqueIndex); err != nil { return nil, fmt.Errorf("cannot create table(s): %w", err) } @@ -398,6 +438,20 @@ func OpenHistoryDB() (LocalStoreInterface, error) { _, _ = db.Exec(alterTablePingHistoryHealthCheckColumn) + _, _ = db.Exec(alterSelfHealingChallengeEventsStatus) + _, _ = db.Exec(alterSelfHealingChallengeEventsAttemptCount) + _, _ = db.Exec(alterSelfHealingChallengeEventsLeaseOwner) + _, _ = db.Exec(alterSelfHealingChallengeEventsLeaseExpiresAt) + _, 
_ = db.Exec(alterSelfHealingChallengeEventsNextRetryAt) + _, _ = db.Exec(alterSelfHealingChallengeEventsLastError) + + if _, err := db.Exec(createSelfHealingChallengeEventsStatusIndex); err != nil { + return nil, fmt.Errorf("cannot create self-healing challenge events status index: %w", err) + } + if _, err := db.Exec(createSelfHealingChallengeEventsLeaseIndex); err != nil { + return nil, fmt.Errorf("cannot create self-healing challenge events lease index: %w", err) + } + _, err = db.Exec(createPingHistoryWithoutUniqueIPAddress) if err != nil { logtrace.Error(context.Background(), "error executing ping-history w/o unique ip-address constraint migration", logtrace.Fields{ diff --git a/pkg/testutil/lumera.go b/pkg/testutil/lumera.go index d7bd1212..9ef76c1a 100644 --- a/pkg/testutil/lumera.go +++ b/pkg/testutil/lumera.go @@ -170,6 +170,10 @@ func (m *MockActionModule) GetParams(ctx context.Context) (*types.QueryParamsRes return &types.QueryParamsResponse{}, nil } +func (m *MockActionModule) ListActions(ctx context.Context, req *types.QueryListActionsRequest) (*types.QueryListActionsResponse, error) { + return &types.QueryListActionsResponse{}, nil +} + // MockActionMsgModule implements the action_msg.Module interface for testing type MockActionMsgModule struct{} @@ -217,6 +221,10 @@ func (m *MockAuditModule) GetEpochReport(ctx context.Context, epochID uint64, su return &audittypes.QueryEpochReportResponse{}, nil } +func (m *MockAuditModule) GetStorageChallengeReports(ctx context.Context, supernodeAccount string, epochID uint64) (*audittypes.QueryStorageChallengeReportsResponse, error) { + return &audittypes.QueryStorageChallengeReportsResponse{}, nil +} + type MockAuditMsgModule struct{} func (m *MockAuditMsgModule) SubmitEvidence(ctx context.Context, subjectAddress string, evidenceType audittypes.EvidenceType, actionID string, metadataJSON string) (*sdktx.BroadcastTxResponse, error) { diff --git a/pkg/types/self_healing.go b/pkg/types/self_healing.go index 
dfd8db70..48bcaf3a 100644 --- a/pkg/types/self_healing.go +++ b/pkg/types/self_healing.go @@ -231,16 +231,22 @@ type SelfHealingMetrics struct { // SelfHealingChallengeEvent represents the challenge event that needs to be healed. type SelfHealingChallengeEvent struct { - ID int64 - TriggerID string - ChallengeID string - TicketID string - Data []byte - SenderID string - IsProcessed bool - ExecMetric SelfHealingExecutionMetric - CreatedAt time.Time - UpdatedAt time.Time + ID int64 + TriggerID string + ChallengeID string + TicketID string + Data []byte + SenderID string + IsProcessed bool + Status string + AttemptCount int + LeaseOwner sql.NullString + LeaseExpiresAt sql.NullTime + NextRetryAt sql.NullTime + LastError sql.NullString + ExecMetric SelfHealingExecutionMetric + CreatedAt time.Time + UpdatedAt time.Time } // Hash returns the hash of the self-healing challenge reports diff --git a/proto/supernode/self_healing.proto b/proto/supernode/self_healing.proto new file mode 100644 index 00000000..46b9a5c4 --- /dev/null +++ b/proto/supernode/self_healing.proto @@ -0,0 +1,70 @@ +syntax = "proto3"; +package supernode; +option go_package = "github.com/LumeraProtocol/supernode/v2/gen/supernode"; + +// SelfHealingService exposes the minimal control-plane RPCs for deterministic +// expected-owner healing challenges. 
+service SelfHealingService { + rpc RequestSelfHealing(RequestSelfHealingRequest) returns (RequestSelfHealingResponse) {} + rpc VerifySelfHealing(VerifySelfHealingRequest) returns (VerifySelfHealingResponse) {} + rpc CommitSelfHealing(CommitSelfHealingRequest) returns (CommitSelfHealingResponse) {} +} + +message RequestSelfHealingRequest { + string challenge_id = 1; + uint64 epoch_id = 2; + + string file_key = 3; + string challenger_id = 4; + string recipient_id = 5; + repeated string observer_ids = 6; + string action_id = 7; +} + +message RequestSelfHealingResponse { + string challenge_id = 1; + uint64 epoch_id = 2; + + string recipient_id = 3; + bool accepted = 4; + bool reconstruction_required = 5; + string reconstructed_hash_hex = 6; + string error = 7; +} + +message VerifySelfHealingRequest { + string challenge_id = 1; + uint64 epoch_id = 2; + + string file_key = 3; + string recipient_id = 4; + string reconstructed_hash_hex = 5; + string observer_id = 6; + string action_id = 7; +} + +message VerifySelfHealingResponse { + string challenge_id = 1; + uint64 epoch_id = 2; + string observer_id = 3; + bool ok = 4; + string error = 5; +} + +message CommitSelfHealingRequest { + string challenge_id = 1; + uint64 epoch_id = 2; + + string file_key = 3; + string action_id = 4; + string challenger_id = 5; + string recipient_id = 6; +} + +message CommitSelfHealingResponse { + string challenge_id = 1; + uint64 epoch_id = 2; + string recipient_id = 3; + bool stored = 4; + string error = 5; +} diff --git a/supernode/cascade/reseed.go b/supernode/cascade/reseed.go index 3cdd9a26..e2e2b76f 100644 --- a/supernode/cascade/reseed.go +++ b/supernode/cascade/reseed.go @@ -2,6 +2,7 @@ package cascade import ( "context" + "encoding/hex" "fmt" "sort" "strings" @@ -13,7 +14,8 @@ import ( ) type RecoveryReseedRequest struct { - ActionID string + ActionID string + PersistArtifacts *bool } type RecoveryReseedResult struct { @@ -21,6 +23,7 @@ type RecoveryReseedResult struct { DownloadEvents 
int DownloadLastEvent string DecodeCleanupError string + ReconstructedHashHex string RQIC uint64 RQMax uint64 DataHashVerified bool @@ -114,8 +117,13 @@ func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *Re if err := cascadekit.VerifyB64DataHash(fileHash, meta.DataHash); err != nil { return result, task.wrapErr(ctx, "decoded file hash does not match action metadata", err, fields) } + result.ReconstructedHashHex = hex.EncodeToString(fileHash) result.DataHashVerified = true + if !shouldPersistArtifacts(req) { + return result, nil + } + encodeResult, err := task.encodeInput(ctx, actionID, decodeFilePath, fields) if err != nil { return result, err @@ -143,6 +151,13 @@ func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *Re return result, nil } +func shouldPersistArtifacts(req *RecoveryReseedRequest) bool { + if req == nil || req.PersistArtifacts == nil { + return true + } + return *req.PersistArtifacts +} + func symbolIDsFromLayout(layout codec.Layout) []string { seen := make(map[string]struct{}, 1024) for _, block := range layout.Blocks { diff --git a/supernode/cmd/helpers.go b/supernode/cmd/helpers.go index 0d51bc45..7cf612c3 100644 --- a/supernode/cmd/helpers.go +++ b/supernode/cmd/helpers.go @@ -63,9 +63,10 @@ func isValidBIP39WordCount(wordCount int) bool { // createP2PConfig creates a P2P config from the app config and address func createP2PConfig(config *config.Config, address string) *p2p.Config { return &p2p.Config{ - ListenAddress: config.SupernodeConfig.Host, - Port: config.P2PConfig.Port, - DataDir: config.GetP2PDataDir(), - ID: address, + ListenAddress: config.SupernodeConfig.Host, + Port: config.P2PConfig.Port, + DataDir: config.GetP2PDataDir(), + BootstrapNodes: config.P2PConfig.BootstrapNodes, + ID: address, } } diff --git a/supernode/cmd/start.go b/supernode/cmd/start.go index be1c1c15..114a0419 100644 --- a/supernode/cmd/start.go +++ b/supernode/cmd/start.go @@ -24,6 +24,7 @@ import ( cascadeService 
"github.com/LumeraProtocol/supernode/v2/supernode/cascade" "github.com/LumeraProtocol/supernode/v2/supernode/config" hostReporterService "github.com/LumeraProtocol/supernode/v2/supernode/host_reporter" + selfHealingService "github.com/LumeraProtocol/supernode/v2/supernode/self_healing" statusService "github.com/LumeraProtocol/supernode/v2/supernode/status" storageChallengeService "github.com/LumeraProtocol/supernode/v2/supernode/storage_challenge" // Legacy supernode metrics reporter (MsgReportSupernodeMetrics) has been superseded by @@ -31,6 +32,7 @@ import ( // supernodeMetrics "github.com/LumeraProtocol/supernode/v2/supernode/supernode_metrics" "github.com/LumeraProtocol/supernode/v2/supernode/transport/gateway" cascadeRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/cascade" + selfHealingRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/self_healing" server "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/status" storageChallengeRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/storage_challenge" "github.com/LumeraProtocol/supernode/v2/supernode/verifier" @@ -193,6 +195,7 @@ The supernode will connect to the Lumera network and begin participating in the } storageChallengeServer := storageChallengeRPC.NewServer(appConfig.SupernodeConfig.Identity, p2pService, historyStore) + selfHealingServer := selfHealingRPC.NewServer(appConfig.SupernodeConfig.Identity, p2pService, lumeraClient, historyStore, cService) var storageChallengeRunner *storageChallengeService.Service if appConfig.StorageChallengeConfig.Enabled { storageChallengeRunner, err = storageChallengeService.NewService( @@ -214,6 +217,36 @@ The supernode will connect to the Lumera network and begin participating in the } } + var selfHealingRunner *selfHealingService.Service + if appConfig.SelfHealingConfig.Enabled { + selfHealingRunner, err = selfHealingService.NewService( + appConfig.SupernodeConfig.Identity, + 
appConfig.SupernodeConfig.Port, + lumeraClient, + p2pService, + kr, + historyStore, + selfHealingService.Config{ + Enabled: true, + PollInterval: time.Duration(appConfig.SelfHealingConfig.PollIntervalMs) * time.Millisecond, + KeyName: appConfig.SupernodeConfig.KeyName, + ActionPageLimit: int(appConfig.SelfHealingConfig.ActionPageLimit), + ActionTargetsTTL: time.Duration(appConfig.SelfHealingConfig.ActionTargetsTTLSeconds) * time.Second, + MaxChallenges: int(appConfig.SelfHealingConfig.MaxChallenges), + EventLeaseDuration: time.Duration(appConfig.SelfHealingConfig.EventLeaseDurationMs) * time.Millisecond, + EventRetryBase: time.Duration(appConfig.SelfHealingConfig.EventRetryBaseMs) * time.Millisecond, + EventRetryMax: time.Duration(appConfig.SelfHealingConfig.EventRetryMaxMs) * time.Millisecond, + MaxEventAttempts: int(appConfig.SelfHealingConfig.MaxEventAttempts), + MaxEventsPerTick: int(appConfig.SelfHealingConfig.MaxEventsPerTick), + EventWorkers: int(appConfig.SelfHealingConfig.EventWorkers), + MaxWindowAge: time.Duration(appConfig.SelfHealingConfig.MaxWindowAgeMs) * time.Millisecond, + }, + ) + if err != nil { + logtrace.Fatal(ctx, "Failed to initialize self-healing runner", logtrace.Fields{"error": err.Error()}) + } + } + // Create supernode server supernodeServer := server.NewSupernodeServer(statusSvc) @@ -228,6 +261,7 @@ The supernode will connect to the Lumera network and begin participating in the grpcserver.ServiceDesc{Desc: &pbcascade.CascadeService_ServiceDesc, Service: cascadeActionServer}, grpcserver.ServiceDesc{Desc: &pbsupernode.SupernodeService_ServiceDesc, Service: supernodeServer}, grpcserver.ServiceDesc{Desc: &pbsupernode.StorageChallengeService_ServiceDesc, Service: storageChallengeServer}, + grpcserver.ServiceDesc{Desc: &pbsupernode.SelfHealingService_ServiceDesc, Service: selfHealingServer}, ) if err != nil { logtrace.Fatal(ctx, "Failed to create gRPC server", logtrace.Fields{"error": err.Error()}) @@ -257,6 +291,9 @@ The supernode will 
connect to the Lumera network and begin participating in the if storageChallengeRunner != nil { services = append(services, storageChallengeRunner) } + if selfHealingRunner != nil { + services = append(services, selfHealingRunner) + } servicesErr <- RunServices(ctx, services...) }() diff --git a/supernode/config/config.go b/supernode/config/config.go index 92d0de7c..ee39d3d9 100644 --- a/supernode/config/config.go +++ b/supernode/config/config.go @@ -30,8 +30,9 @@ type KeyringConfig struct { } type P2PConfig struct { - Port uint16 `yaml:"port"` - DataDir string `yaml:"data_dir"` + Port uint16 `yaml:"port"` + DataDir string `yaml:"data_dir"` + BootstrapNodes string `yaml:"bootstrap_nodes,omitempty"` } type LumeraClientConfig struct { @@ -53,6 +54,21 @@ type StorageChallengeConfig struct { SubmitEvidence bool `yaml:"submit_evidence,omitempty"` } +type SelfHealingConfig struct { + Enabled bool `yaml:"enabled"` + PollIntervalMs uint64 `yaml:"poll_interval_ms,omitempty"` + ActionPageLimit uint64 `yaml:"action_page_limit,omitempty"` + ActionTargetsTTLSeconds uint64 `yaml:"action_targets_ttl_seconds,omitempty"` + MaxChallenges uint64 `yaml:"max_challenges,omitempty"` + MaxEventsPerTick uint64 `yaml:"max_events_per_tick,omitempty"` + EventWorkers uint64 `yaml:"event_workers,omitempty"` + EventLeaseDurationMs uint64 `yaml:"event_lease_duration_ms,omitempty"` + EventRetryBaseMs uint64 `yaml:"event_retry_base_ms,omitempty"` + EventRetryMaxMs uint64 `yaml:"event_retry_max_ms,omitempty"` + MaxEventAttempts uint64 `yaml:"max_event_attempts,omitempty"` + MaxWindowAgeMs uint64 `yaml:"max_window_age_ms,omitempty"` +} + type Config struct { SupernodeConfig `yaml:"supernode"` KeyringConfig `yaml:"keyring"` @@ -60,6 +76,7 @@ type Config struct { LumeraClientConfig `yaml:"lumera"` RaptorQConfig `yaml:"raptorq"` StorageChallengeConfig `yaml:"storage_challenge"` + SelfHealingConfig `yaml:"self_healing"` // Store base directory (not from YAML) BaseDir string `yaml:"-"` @@ -153,6 +170,9 @@ 
func LoadConfig(filename string, baseDir string) (*Config, error) { if config.StorageChallengeConfig.PollIntervalMs == 0 { config.StorageChallengeConfig.PollIntervalMs = DefaultStorageChallengePollIntervalMs } + if config.SelfHealingConfig.PollIntervalMs == 0 { + config.SelfHealingConfig.PollIntervalMs = DefaultSelfHealingPollIntervalMs + } // Create directories if err := config.EnsureDirs(); err != nil { diff --git a/supernode/config/defaults.go b/supernode/config/defaults.go index e2cbedc7..e17f157e 100644 --- a/supernode/config/defaults.go +++ b/supernode/config/defaults.go @@ -13,4 +13,5 @@ const ( DefaultChainID = "testing" DefaultRaptorQFilesDir = "raptorq_files" DefaultStorageChallengePollIntervalMs = 5000 + DefaultSelfHealingPollIntervalMs = 10000 ) diff --git a/supernode/config/save.go b/supernode/config/save.go index dfa88b7f..04338f22 100644 --- a/supernode/config/save.go +++ b/supernode/config/save.go @@ -62,5 +62,9 @@ func CreateDefaultConfig(keyName, identity, chainID string, keyringBackend, keyr PollIntervalMs: DefaultStorageChallengePollIntervalMs, SubmitEvidence: true, }, + SelfHealingConfig: SelfHealingConfig{ + Enabled: false, + PollIntervalMs: DefaultSelfHealingPollIntervalMs, + }, } } diff --git a/supernode/self_healing/README.md b/supernode/self_healing/README.md new file mode 100644 index 00000000..deb088ab --- /dev/null +++ b/supernode/self_healing/README.md @@ -0,0 +1,277 @@ +# Self-Healing (Cascade) - Design and Implementation + +## Scope + +This implementation covers **off-chain self-healing for CASCADE actions** in the new Lumera supernode architecture. + +Out of scope for this phase: + +- NFT self-healing +- Sense self-healing +- On-chain capability/module changes for phase-4 protocol governance + +## High-Level Goal + +Recover missing cascade artifacts deterministically and safely: + +1. Trigger only when watchlist conditions are met from weighted multi-reporter audit view. +2. Generate deterministic challenges once per window. +3. 
Reconstruct missing file content. +4. Verify reconstructed hash through observer quorum. +5. Persist regenerated artifacts to P2P **only after observer quorum succeeds**. + +## Components + +- `supernode/self_healing/service.go` + - Trigger, generation, deterministic selection, event processing, quorum, commit orchestration. +- `supernode/transport/grpc/self_healing/handler.go` + - Recipient/observer RPC handlers: + - `RequestSelfHealing` + - `VerifySelfHealing` + - `CommitSelfHealing` +- `supernode/cascade/reseed.go` + - Recovery reseed primitive used by self-healing: + - reconstruct/hash without persist (`PersistArtifacts=false`) + - reconstruct + persist (`PersistArtifacts=true`) +- `pkg/storage/queries/self_healing.go` + - Event queue persistence, lease claim/reclaim, retry/terminal lifecycle. + +## Triggering and Watchlist Decision + +Self-healing generation is triggered only when weighted watchlist threshold is met: + +1. Active supernodes are discovered. +2. For each active target node, challenger reads `x/audit` storage challenge reports. +3. Reports are deduplicated by reporter and interpreted against required-open-port policy. +4. Node enters weighted watchlist when closed-report percentage exceeds threshold and quorum reporters are present. +5. If `len(watchlist) < WatchlistThreshold`, generation is skipped. + +This prevents single-node/local-opinion triggering and aligns with multi-reporter policy. + +## Deterministic Generation Model + +### Deterministic leader per window + +For each generation window: + +- `windowID = now.Truncate(GenerationInterval).Unix()` +- Leader elected from active set using deterministic hash scoring over `(windowID, watchHash, nodeID)`. + +Only elected leader generates events. + +### Candidate targets + +Leader lists cascade actions from chain (`DONE` and `APPROVED` states), extracts anchor file key from metadata, and optionally caps per window by `MaxChallenges`. 
+ +### File needs healing when + +For each target key: + +- determine deterministic closest holder set +- if **all closest holders are on watchlist**, target is marked for self-healing + +### Recipient and observers + +From online, active, non-watchlist nodes (excluding self): + +- recipient = deterministic pick over `fileKey` +- observers = deterministic top-N over remaining eligible pool + +### Challenge ID + +Challenge ID is deterministic per generation window and action (`deriveWindowChallengeID`). + +## Event Persistence and Dedup + +Each node uses local SQLite (node-local DB, not shared). + +Key guarantees: + +- Unique event key on `(trigger_id, ticket_id, challenge_id)` avoids duplicate inserts. +- Processing uses lease-claim semantics: + - statuses: `pending -> processing -> completed/retry/terminal` + - lease owner + expiry for restart-safe reclaim +- Retry uses exponential backoff and max attempts. + +## Request / Verify / Commit Flow + +### Core behavior + +1. Challenger sends `RequestSelfHealing`. +2. Recipient: + - tries local retrieve + - if missing, reconstructs and returns `reconstructed_hash_hex` **without storing artifacts**. +3. Challenger asks observers to `VerifySelfHealing`. +4. Observer: + - tries local retrieve first + - if missing, reconstructs in fallback mode **without storing artifacts** + - compares computed hash with recipient hash +5. If quorum passes, challenger sends `CommitSelfHealing`. +6. Recipient runs persist mode and stores regenerated artifacts into P2P. + +This is the critical policy change: **store after quorum approval**. 
+ +## Reconstruction Modes (RecoveryReseed) + +`RecoveryReseedRequest` now has `PersistArtifacts *bool`: + +- `false`: decode + integrity verify + hash only +- `true`: decode + integrity verify + re-encode + RQ metadata regen + store artifacts +- `nil`: defaults to persist (backward-compatible behavior for existing recovery-admin path) + +`RecoveryReseedResult` includes `ReconstructedHashHex` for both modes. + +## Observer Fallback (No Local Copy) + +In `VerifySelfHealing`: + +1. Try local retrieve (`localOnly=true`). +2. If not found, resolve action (`action_id` if provided, else by file-key index). +3. Run fallback reconstruction (`PersistArtifacts=false`). +4. Compare fallback hash with `reconstructed_hash_hex` from recipient. +5. Return `ok=true/false`. + +No artifact persistence is performed in observer fallback. + +## RPC Contracts + +### Request + +- `RequestSelfHealingRequest` includes `action_id` (optional but preferred). + +### Verify + +- `VerifySelfHealingRequest` includes `action_id` (optional but preferred). + +### Commit + +- New `CommitSelfHealing(CommitSelfHealingRequest)`. +- Called only after observer quorum success. +- Persists regenerated artifacts on recipient. 
+ +## Sequence Diagram (Happy Path) + +```mermaid +sequenceDiagram + participant C as Challenger + participant R as Recipient + participant O1 as "Observer 1" + participant O2 as "Observer 2" + participant P as P2P + + C->>R: RequestSelfHealing(challenge_id, file_key, action_id) + R->>R: Try local retrieve + alt Missing local file + R->>R: RecoveryReseed(persist=false) + R-->>C: accepted=true, reconstruction_required=true, reconstructed_hash_hex + else Local exists + R-->>C: accepted=true, reconstruction_required=false, reconstructed_hash_hex + end + + C->>O1: VerifySelfHealing(file_key, action_id, reconstructed_hash_hex) + O1->>O1: local retrieve or fallback RecoveryReseed(persist=false) + O1-->>C: ok=true/false + + C->>O2: VerifySelfHealing(file_key, action_id, reconstructed_hash_hex) + O2->>O2: local retrieve or fallback RecoveryReseed(persist=false) + O2-->>C: ok=true/false + + alt quorum ok + C->>R: CommitSelfHealing(challenge_id, file_key, action_id) + R->>R: RecoveryReseed(persist=true) + R->>P: Store regenerated artifacts + R-->>C: stored=true + else quorum failed + C->>C: Mark event retry/terminal by policy + end +``` + +## Flow Diagram (Generation) + +```mermaid +flowchart TD + A["Tick generation window"] --> B["Fetch active supernodes"] + B --> C["Compute weighted watchlist from x/audit reports"] + C --> D{Watchlist >= threshold?} + D -- No --> E["Skip generation"] + D -- Yes --> F["Deterministic leader election"] + F --> G{Am I leader?} + G -- No --> E + G -- Yes --> H["List DONE or APPROVED cascade actions"] + H --> I["Select deterministic window targets"] + I --> J["For each target compute closest holders"] + J --> K{All closest on watchlist?} + K -- No --> L["Ignore target"] + K -- Yes --> M["Pick deterministic recipient and observers"] + M --> N["Persist challenge events and metrics"] +``` + +## Hash Integrity and E2E Assertions + +Self-healing reconstructed hash is checked against action metadata hash: + +- action metadata `DataHash` is canonical 
registered file hash (base64) +- reconstructed hash is compared in observers and E2E tests + +System happy-path test now asserts: + +1. request returns reconstructed hash +2. reconstructed hash equals registered action hash +3. observer verify returns `ok=true` +4. commit returns `stored=true` + +## E2E Coverage and CI + +Self-healing E2E now covers: + +1. Happy path: request -> verify -> commit (hash and persistence assertions) +2. Observer tampered-hash rejection +3. Stale epoch rejection (`RequestSelfHealing`, `VerifySelfHealing`, `CommitSelfHealing`) +4. Duplicate challenge replay behavior +5. Recipient-down request failure + +CI integration: + +- workflow job: `self-healing-e2e-tests` (`.github/workflows/tests.yml`) +- command: `make test-self-healing` +- make target runs all self-healing E2E tests with: + - `go test -tags=system_test -v -run '^TestSelfHealingE2E' .` + +## Failure Semantics + +Terminal/non-retry examples: + +- stale window/epoch +- invalid payload + +Retry examples: + +- transient RPC/connectivity failure +- observer quorum failure +- commit/store failure + +Retry policy uses bounded exponential backoff and max attempts. + +## Performance and Scaling Notes + +Current controls: + +- bounded generation cap per window (`MaxChallenges`) +- bounded claim batch and worker pool (`MaxEventsPerTick`, `EventWorkers`) +- deterministic assignment avoids storms +- local cache for action target listing (`ActionTargetsTTL`) + +Practical behavior for large volumes: + +- generation remains window-capped +- processing remains lease-coordinated and worker-bounded +- duplicate deliveries collapse via unique event keys and idempotent metrics inserts + +## Why `action_id` is carried in RPCs + +`action_id` reduces ambiguity and chain scans during request/verify/commit: + +- direct action resolution when provided +- fallback to file-key index only when absent + +This improves correctness and reduces expensive lookup paths. 
diff --git a/supernode/self_healing/service.go b/supernode/self_healing/service.go new file mode 100644 index 00000000..de7d7005 --- /dev/null +++ b/supernode/self_healing/service.go @@ -0,0 +1,1227 @@ +package self_healing + +import ( + "context" + "database/sql" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "hash/fnv" + "net" + "net/url" + "sort" + "strconv" + "strings" + "sync" + "time" + + actiontypes "github.com/LumeraProtocol/lumera/x/action/v1/types" + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/lumera/x/lumeraid/securekeyx" + sntypes "github.com/LumeraProtocol/lumera/x/supernode/v1/types" + "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/p2p" + "github.com/LumeraProtocol/supernode/v2/pkg/cascadekit" + "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/net/credentials" + grpcclient "github.com/LumeraProtocol/supernode/v2/pkg/net/grpc/client" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" + "github.com/LumeraProtocol/supernode/v2/pkg/storagechallenge/deterministic" + "github.com/LumeraProtocol/supernode/v2/pkg/types" + "github.com/cosmos/cosmos-sdk/crypto/keyring" + query "github.com/cosmos/cosmos-sdk/types/query" + "lukechampine.com/blake3" +) + +const ( + defaultPollInterval = 10 * time.Second + + defaultPingInterval = 7 * time.Minute + defaultWatchlistInterval = 8 * time.Minute + defaultGenerationInterval = 60 * time.Minute + defaultProcessEventsPeriod = 2 * time.Minute + + defaultWatchlistThreshold = 6 + defaultWatchlistStaleFor = 20 * time.Minute + defaultWatchlistFreshFor = 60 * time.Minute + defaultClosestNodes = 6 + defaultReplicaCount = 5 + defaultObserverCount = 2 + defaultObserverThreshold = 2 + defaultActionPageLimit = 200 + defaultActionTargetsCache = 15 * time.Minute + defaultMaxChallenges = 2048 + defaultEventLeaseDuration 
= 2 * time.Minute + defaultEventMaxAttempts = 3 + defaultEventRetryBase = 30 * time.Second + defaultEventRetryMax = 15 * time.Minute + defaultMaxEventsPerTick = 64 + defaultEventWorkers = 8 + + requestTimeout = 20 * time.Second + verificationTimout = 20 * time.Second + commitTimeout = 20 * time.Second +) + +type Config struct { + Enabled bool + PollInterval time.Duration + KeyName string + + PingInterval time.Duration + WatchlistInterval time.Duration + GenerationInterval time.Duration + ProcessInterval time.Duration + + WatchlistThreshold int + WatchlistStaleFor time.Duration + WatchlistFreshFor time.Duration + ClosestNodes int + ObserverCount int + ObserverThreshold int + ActionPageLimit int + ActionTargetsTTL time.Duration + MaxChallenges int + EventLeaseDuration time.Duration + EventRetryBase time.Duration + EventRetryMax time.Duration + MaxEventAttempts int + MaxEventsPerTick int + EventWorkers int + + // Max age for a generation window before we mark event stale/terminal. + MaxWindowAge time.Duration +} + +type Service struct { + cfg Config + identity string + grpcPort uint16 + + lumera lumera.Client + p2p p2p.Client + kr keyring.Keyring + store queries.LocalStoreInterface + + grpcClient *grpcclient.Client + grpcOpts *grpcclient.ClientOptions + + requestSelfHealingFn func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) + verifySelfHealingFn func(ctx context.Context, remoteIdentity string, address string, req *supernode.VerifySelfHealingRequest, timeout time.Duration) (*supernode.VerifySelfHealingResponse, error) + commitSelfHealingFn func(ctx context.Context, remoteIdentity string, address string, req *supernode.CommitSelfHealingRequest, timeout time.Duration) (*supernode.CommitSelfHealingResponse, error) + + targetsCacheMu sync.RWMutex + targetsCache []actionHealingTarget + targetsCached time.Time + + lastPingAt time.Time + 
lastWatchlistAt time.Time + lastGenerateAt time.Time + lastProcessEvtAt time.Time +} + +type challengeEventPayload struct { + EpochID uint64 `json:"epoch_id,omitempty"` + WindowID int64 `json:"window_id"` + ActionID string `json:"action_id,omitempty"` + FileKey string `json:"file_key"` + Recipient string `json:"recipient"` + Observers []string `json:"observers"` + WatchHash string `json:"watch_hash"` + ActiveHash string `json:"active_hash"` +} + +type eventProcessError struct { + Reason string + Retryable bool + Err error +} + +func (e *eventProcessError) Error() string { + if e == nil { + return "" + } + if e.Err != nil { + return fmt.Sprintf("%s: %v", e.Reason, e.Err) + } + return e.Reason +} + +func (e *eventProcessError) Unwrap() error { + if e == nil { + return nil + } + return e.Err +} + +func NewService(identity string, grpcPort uint16, lumeraClient lumera.Client, p2pClient p2p.Client, kr keyring.Keyring, store queries.LocalStoreInterface, cfg Config) (*Service, error) { + identity = strings.TrimSpace(identity) + if identity == "" { + return nil, fmt.Errorf("identity is empty") + } + if lumeraClient == nil || lumeraClient.SuperNode() == nil || lumeraClient.Audit() == nil || lumeraClient.Action() == nil { + return nil, fmt.Errorf("lumera client is missing required modules") + } + if p2pClient == nil { + return nil, fmt.Errorf("p2p client is nil") + } + if kr == nil { + return nil, fmt.Errorf("keyring is nil") + } + if store == nil { + return nil, fmt.Errorf("history store is nil") + } + if strings.TrimSpace(cfg.KeyName) == "" { + return nil, fmt.Errorf("key name is empty") + } + key, err := kr.Key(cfg.KeyName) + if err != nil { + return nil, fmt.Errorf("keyring key not found: %w", err) + } + addr, err := key.GetAddress() + if err != nil { + return nil, fmt.Errorf("get key address: %w", err) + } + if got := addr.String(); got != identity { + return nil, fmt.Errorf("identity mismatch: config.identity=%s key(%s)=%s", identity, cfg.KeyName, got) + } + + if 
cfg.PollInterval <= 0 { + cfg.PollInterval = defaultPollInterval + } + if cfg.PingInterval <= 0 { + cfg.PingInterval = defaultPingInterval + } + if cfg.WatchlistInterval <= 0 { + cfg.WatchlistInterval = defaultWatchlistInterval + } + if cfg.GenerationInterval <= 0 { + cfg.GenerationInterval = defaultGenerationInterval + } + if cfg.ProcessInterval <= 0 { + cfg.ProcessInterval = defaultProcessEventsPeriod + } + if cfg.WatchlistThreshold <= 0 { + cfg.WatchlistThreshold = defaultWatchlistThreshold + } + if cfg.WatchlistStaleFor <= 0 { + cfg.WatchlistStaleFor = defaultWatchlistStaleFor + } + if cfg.WatchlistFreshFor <= 0 { + cfg.WatchlistFreshFor = defaultWatchlistFreshFor + } + if cfg.ClosestNodes <= 0 { + cfg.ClosestNodes = defaultClosestNodes + } + if cfg.ObserverCount <= 0 { + cfg.ObserverCount = defaultObserverCount + } + if cfg.ObserverThreshold <= 0 { + cfg.ObserverThreshold = defaultObserverThreshold + } + if cfg.ActionPageLimit <= 0 { + cfg.ActionPageLimit = defaultActionPageLimit + } + if cfg.ActionTargetsTTL <= 0 { + cfg.ActionTargetsTTL = defaultActionTargetsCache + } + if cfg.MaxChallenges <= 0 { + cfg.MaxChallenges = defaultMaxChallenges + } + if cfg.EventLeaseDuration <= 0 { + cfg.EventLeaseDuration = defaultEventLeaseDuration + } + if cfg.EventRetryBase <= 0 { + cfg.EventRetryBase = defaultEventRetryBase + } + if cfg.EventRetryMax <= 0 { + cfg.EventRetryMax = defaultEventRetryMax + } + if cfg.MaxEventAttempts <= 0 { + cfg.MaxEventAttempts = defaultEventMaxAttempts + } + if cfg.MaxEventsPerTick <= 0 { + cfg.MaxEventsPerTick = defaultMaxEventsPerTick + } + if cfg.EventWorkers <= 0 { + cfg.EventWorkers = defaultEventWorkers + } + if cfg.MaxWindowAge <= 0 { + cfg.MaxWindowAge = 2 * cfg.GenerationInterval + } + + return &Service{cfg: cfg, identity: identity, grpcPort: grpcPort, lumera: lumeraClient, p2p: p2pClient, kr: kr, store: store}, nil +} + +func (s *Service) Run(ctx context.Context) error { + if !s.cfg.Enabled { + <-ctx.Done() + return nil + } + if err 
:= s.initClients(); err != nil { + return err + } + + ticker := time.NewTicker(s.cfg.PollInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + now := time.Now().UTC() + if now.Sub(s.lastPingAt) >= s.cfg.PingInterval { + s.refreshPingInfo(ctx) + s.lastPingAt = now + } + if now.Sub(s.lastWatchlistAt) >= s.cfg.WatchlistInterval { + s.updateWatchlist(ctx) + s.lastWatchlistAt = now + } + if now.Sub(s.lastGenerateAt) >= s.cfg.GenerationInterval { + s.generateChallenges(ctx, now) + s.lastGenerateAt = now + } + if now.Sub(s.lastProcessEvtAt) >= s.cfg.ProcessInterval { + s.processEvents(ctx) + s.lastProcessEvtAt = now + } + } + } +} + +func (s *Service) initClients() error { + validator := lumera.NewSecureKeyExchangeValidator(s.lumera) + grpcCreds, err := credentials.NewClientCreds(&credentials.ClientOptions{ + CommonOptions: credentials.CommonOptions{ + Keyring: s.kr, + LocalIdentity: s.identity, + PeerType: securekeyx.Supernode, + Validator: validator, + }, + }) + if err != nil { + return fmt.Errorf("create gRPC client creds: %w", err) + } + s.grpcClient = grpcclient.NewClient(grpcCreds) + s.grpcOpts = grpcclient.DefaultClientOptions() + s.grpcOpts.EnableRetries = true + return nil +} + +func (s *Service) refreshPingInfo(ctx context.Context) { + active, _, addrMap, err := s.networkSnapshot(ctx) + if err != nil { + logtrace.Warn(ctx, "self-healing ping: snapshot failed", logtrace.Fields{"error": err.Error()}) + return + } + for _, id := range active { + if id == s.identity { + continue + } + addr := addrMap[id] + online := s.probe(addr) + s.upsertPing(ctx, id, addr, online) + } +} + +func (s *Service) probe(address string) bool { + if strings.TrimSpace(address) == "" { + return false + } + host, port, ok := parseHostAndPort(address, int(s.grpcPort)) + if !ok { + return false + } + conn, err := net.DialTimeout("tcp", net.JoinHostPort(host, strconv.Itoa(port)), 2*time.Second) + if err != nil { + return false + } + _ 
= conn.Close()
	return true
}
// NOTE(review): the lines above are the tail of a connectivity-probe helper
// whose opening (including the "_ " of the assignment) falls outside this
// chunk; reproduced unchanged.

// upsertPing folds the outcome of one ping probe into the node's PingInfo row,
// creating a zero-valued row first if the node has never been seen.
// Best-effort: storage errors are deliberately swallowed.
func (s *Service) upsertPing(ctx context.Context, nodeID, ip string, online bool) {
	existing, err := s.store.GetPingInfoBySupernodeID(nodeID)
	if err != nil && err != sql.ErrNoRows {
		// Unexpected storage failure: drop this observation rather than abort the tick.
		return
	}
	if existing == nil {
		existing = &types.PingInfo{SupernodeID: nodeID, IPAddress: ip}
	}
	now := time.Now().UTC()
	// Carry forward the prior counters; only success-dependent fields change below.
	pi := types.PingInfo{
		SupernodeID:            nodeID,
		IPAddress:              ip,
		TotalPings:             existing.TotalPings + 1,
		TotalSuccessfulPings:   existing.TotalSuccessfulPings,
		AvgPingResponseTime:    existing.AvgPingResponseTime,
		IsOnline:               online,
		IsOnWatchlist:          existing.IsOnWatchlist,
		IsAdjusted:             existing.IsAdjusted,
		CumulativeResponseTime: existing.CumulativeResponseTime,
		LastSeen:               existing.LastSeen,
	}
	if online {
		// A successful ping clears the watchlist/adjusted flags and refreshes LastSeen.
		pi.TotalSuccessfulPings = existing.TotalSuccessfulPings + 1
		pi.IsOnWatchlist = false
		pi.IsAdjusted = false
		pi.LastSeen = sql.NullTime{Time: now, Valid: true}
	}
	if pi.TotalSuccessfulPings > 0 {
		// NOTE(review): CumulativeResponseTime is never increased in this function,
		// so the average can only shrink as successes accumulate — confirm the
		// response time is accumulated elsewhere.
		pi.AvgPingResponseTime = pi.CumulativeResponseTime / float64(pi.TotalSuccessfulPings)
	}
	_ = s.store.UpsertPingHistory(pi) // best-effort persist
	_ = ctx                           // ctx unused today; kept for signature symmetry
}

// updateWatchlist flags nodes whose last successful ping is older than
// cfg.WatchlistStaleFor. Nodes already flagged, adjusted, or never seen are
// left untouched.
func (s *Service) updateWatchlist(ctx context.Context) {
	infos, err := s.store.GetAllPingInfos()
	if err != nil {
		return
	}
	cut := time.Now().UTC().Add(-s.cfg.WatchlistStaleFor)
	for _, info := range infos {
		if info.IsOnWatchlist || info.IsAdjusted || !info.LastSeen.Valid {
			continue
		}
		if info.LastSeen.Time.Before(cut) {
			_ = s.store.UpdatePingInfo(info.SupernodeID, true, false) // best-effort flag
		}
	}
	_ = ctx // ctx unused today
}

// generateChallenges runs one generation tick: if this node is the
// deterministically elected leader for the current window, it selects cascade
// targets whose replica holders are all on the audit-weighted watchlist and
// persists one challenge event per target for later processing.
func (s *Service) generateChallenges(ctx context.Context, now time.Time) {
	active, online, _, err := s.networkSnapshot(ctx)
	if err != nil || len(active) == 0 {
		return
	}

	watch, err := s.auditWeightedWatchlist(ctx, active)
	if err != nil {
		logtrace.Warn(ctx, "self-healing weighted watchlist unavailable", logtrace.Fields{"error": err.Error()})
		return
	}
	if len(watch) < s.cfg.WatchlistThreshold {
		logtrace.Debug(ctx, "self-healing trigger skipped: watchlist below threshold", logtrace.Fields{"watchlist_count": len(watch), "threshold": s.cfg.WatchlistThreshold})
		return
	}

	// Windows are aligned to GenerationInterval; every node computes the same
	// windowID and leader, so only one node generates per window.
	windowID := now.Truncate(s.cfg.GenerationInterval).Unix()
	leader := electLeader(active, windowID, hashList(watch))
	if leader != s.identity {
		return
	}

	targets, err := s.listCascadeHealingTargets(ctx)
	if err != nil {
		logtrace.Warn(ctx, "self-healing action target listing failed", logtrace.Fields{"error": err.Error()})
		return
	}
	if len(targets) == 0 {
		return
	}

	watchSet := toSet(watch)
	// Recipients must be online, not self, and not themselves on the watchlist.
	eligibleRecipients := filterEligibleRecipients(online, watchSet, s.identity)
	if len(eligibleRecipients) == 0 {
		return
	}

	triggerID := fmt.Sprintf("window:%d", windowID)
	epochID := uint64(0)
	if s.lumera != nil && s.lumera.Audit() != nil {
		// Epoch is informational on the event payload; absence is tolerated.
		if ep, err := s.lumera.Audit().GetCurrentEpoch(ctx); err == nil && ep != nil {
			epochID = ep.EpochId
		}
	}

	// Cap per-window work; the window-offset strategy rotates through the full
	// target list across successive windows.
	selectedTargets := selectWindowTargets(targets, windowID, s.cfg.MaxChallenges)
	if len(selectedTargets) < len(targets) {
		logtrace.Info(ctx, "self-healing target set capped for window", logtrace.Fields{
			"window_id":          windowID,
			"total_targets":      len(targets),
			"selected_targets":   len(selectedTargets),
			"max_challenges":     s.cfg.MaxChallenges,
			"selection_strategy": "window_offset",
		})
	}

	events := make([]types.SelfHealingChallengeEvent, 0, len(selectedTargets))
	for _, target := range selectedTargets {
		holders, err := deterministic.SelectReplicaSet(active, target.FileKey, uint32(maxInt(1, s.cfg.ClosestNodes)))
		if err != nil || len(holders) == 0 {
			continue
		}
		// Only heal when EVERY deterministic holder is on the watchlist —
		// otherwise a healthy holder still serves the file.
		if !allInSet(holders, watchSet) {
			continue
		}
		recipient := pickDeterministicNode(eligibleRecipients, target.FileKey)
		if recipient == "" {
			continue
		}
		// Observers are drawn from the remaining eligible pool, keyed with a
		// ":obs" suffix so they differ from the recipient selection.
		obsPool := removeOne(eligibleRecipients, recipient)
		observers := pickTopNDeterministic(obsPool, target.FileKey+":obs", s.cfg.ObserverCount)
		challengeID := deriveWindowChallengeID(windowID, target.ActionID)

		payload := challengeEventPayload{
			EpochID:    epochID,
			WindowID:   windowID,
			ActionID:   target.ActionID,
			FileKey:    target.FileKey,
			Recipient:  recipient,
			Observers:  observers,
			WatchHash:  hashList(watch),
			ActiveHash: hashList(active),
		}
		bz, _ := json.Marshal(payload)
		events = append(events, types.SelfHealingChallengeEvent{
			TriggerID:   triggerID,
			TicketID:    target.ActionID,
			ChallengeID: challengeID,
			Data:        bz,
			SenderID:    s.identity,
			ExecMetric: types.SelfHealingExecutionMetric{
				TriggerID:   triggerID,
				ChallengeID: challengeID,
				MessageType: int(types.SelfHealingChallengeMessage),
				Data:        bz,
				SenderID:    s.identity,
			},
		})
	}

	if len(events) == 0 {
		return
	}
	if err := s.store.BatchInsertSelfHealingChallengeEvents(ctx, events); err != nil {
		logtrace.Warn(ctx, "self-healing generation insert failed", logtrace.Fields{"error": err.Error()})
		return
	}
	logtrace.Info(ctx, "self-healing generated challenges", logtrace.Fields{"window_id": windowID, "count": len(events)})
}

// processEvents claims a batch of pending challenge events (lease-based, so
// crashed owners are eventually reclaimed) and processes them, fanning out to
// cfg.EventWorkers goroutines when the batch is large enough to warrant it.
func (s *Service) processEvents(ctx context.Context) {
	owner := s.identity
	events, err := s.store.ClaimPendingSelfHealingChallengeEvents(ctx, owner, s.cfg.EventLeaseDuration, s.cfg.MaxEventsPerTick)
	if err != nil || len(events) == 0 {
		return
	}
	workers := s.cfg.EventWorkers
	if workers <= 1 || len(events) == 1 {
		// Serial fast path: no goroutine/channel overhead.
		for _, event := range events {
			s.handleClaimedEvent(ctx, owner, event)
		}
		return
	}
	if workers > len(events) {
		workers = len(events)
	}

	jobs := make(chan types.SelfHealingChallengeEvent, len(events))
	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for event := range jobs {
				s.handleClaimedEvent(ctx, owner, event)
			}
		}()
	}
	// The buffer equals len(events), so sends cannot block; the select still
	// honors cancellation mid-enqueue.
enqueue:
	for _, event := range events {
		select {
		case <-ctx.Done():
			break enqueue
		case jobs <- event:
		}
	}
	close(jobs)
	wg.Wait()
}

// handleClaimedEvent processes one claimed event and records the outcome:
// completed, retry (with backoff), or terminal (non-retryable or attempts
// exhausted).
func (s *Service) handleClaimedEvent(ctx context.Context, owner string, event
types.SelfHealingChallengeEvent) {
	if err := s.processEvent(ctx, event); err != nil {
		reason := err.Error()
		retryable := true
		var processErr *eventProcessError
		if errors.As(err, &processErr) {
			// Structured process errors carry a short reason code and an
			// explicit retryability flag; plain errors default to retryable.
			reason = processErr.Reason
			retryable = processErr.Retryable
		}

		attempted := event.AttemptCount
		maxed := attempted >= s.cfg.MaxEventAttempts
		if !retryable || maxed {
			if terr := s.store.MarkSelfHealingChallengeEventTerminal(event.ChallengeID, owner, reason); terr != nil {
				logtrace.Warn(ctx, "self-healing event terminal update failed", logtrace.Fields{"challenge_id": event.ChallengeID, "error": terr.Error()})
			}
			logtrace.Warn(ctx, "self-healing event terminal failure", logtrace.Fields{"challenge_id": event.ChallengeID, "reason": reason, "attempt_count": attempted})
			return
		}

		// Retryable and attempts remain: schedule with exponential backoff.
		delay := s.retryDelayForAttempt(attempted)
		if rerr := s.store.MarkSelfHealingChallengeEventRetry(event.ChallengeID, owner, reason, delay); rerr != nil {
			logtrace.Warn(ctx, "self-healing event retry update failed", logtrace.Fields{"challenge_id": event.ChallengeID, "error": rerr.Error()})
		}
		logtrace.Warn(ctx, "self-healing event process failed", logtrace.Fields{"challenge_id": event.ChallengeID, "reason": reason, "attempt_count": attempted, "retry_after_ms": delay.Milliseconds()})
		return
	}
	if err := s.store.MarkSelfHealingChallengeEventCompleted(event.ChallengeID, owner); err != nil {
		logtrace.Warn(ctx, "self-healing event complete update failed", logtrace.Fields{"challenge_id": event.ChallengeID, "error": err.Error()})
	}
}

// processEvent drives one challenge end-to-end: request reconstruction from
// the recipient, collect observer verifications, and (if quorum is met) ask
// the recipient to commit. Returns nil on success or an *eventProcessError
// describing why and whether to retry. Each phase is mirrored into the
// execution-metrics table via persistExecution.
func (s *Service) processEvent(ctx context.Context, event types.SelfHealingChallengeEvent) error {
	var pl challengeEventPayload
	if err := json.Unmarshal(event.Data, &pl); err != nil {
		return &eventProcessError{Reason: "invalid_payload", Retryable: false, Err: err}
	}
	if strings.TrimSpace(pl.FileKey) == "" || strings.TrimSpace(pl.Recipient) == "" {
		return &eventProcessError{Reason: "invalid_payload_fields", Retryable: false}
	}
	if pl.WindowID > 0 {
		// Refuse to act on events older than MaxWindowAge — the network state
		// they were generated against is no longer trustworthy.
		windowStart := time.Unix(pl.WindowID, 0).UTC()
		if time.Now().UTC().After(windowStart.Add(s.cfg.MaxWindowAge)) {
			s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "failed", "reason": "stale_window"})
			return &eventProcessError{Reason: "stale_window", Retryable: false}
		}
	}
	recipientAddr, err := s.supernodeGRPCAddr(ctx, pl.Recipient)
	if err != nil {
		s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "failed", "reason": "recipient_resolve_failed"})
		return &eventProcessError{Reason: "recipient_resolve_failed", Retryable: true, Err: err}
	}

	// Phase 1: ask the recipient to attempt reconstruction.
	req := &supernode.RequestSelfHealingRequest{
		ChallengeId:  event.ChallengeID,
		EpochId:      pl.EpochID,
		FileKey:      pl.FileKey,
		ChallengerId: s.identity,
		RecipientId:  pl.Recipient,
		ObserverIds:  pl.Observers,
		ActionId:     pl.ActionID,
	}
	resp, err := s.requestSelfHealing(ctx, pl.Recipient, recipientAddr, req, requestTimeout)
	if err != nil || resp == nil || !resp.Accepted {
		reason := "request_failed"
		if resp != nil && strings.TrimSpace(resp.Error) != "" {
			reason = resp.Error
		}
		s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "failed", "reason": reason})
		if err != nil {
			return &eventProcessError{Reason: "request_rpc_error", Retryable: true, Err: err}
		}
		// Remote-reported "stale" rejections are final; everything else retries.
		lowerReason := strings.ToLower(strings.TrimSpace(reason))
		if strings.Contains(lowerReason, "stale") {
			return &eventProcessError{Reason: lowerReason, Retryable: false}
		}
		return &eventProcessError{Reason: lowerReason, Retryable: true}
	}
	s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingResponseMessage), map[string]any{"event": "response", "reconstruction_required": resp.ReconstructionRequired, "reconstructed_hash_hex": resp.ReconstructedHashHex})

	if !resp.ReconstructionRequired {
		// Recipient found the data intact: nothing to heal.
		s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "completed", "result": "not_required"})
		return nil
	}
	// Phase 2: every observer independently verifies the reconstructed hash.
	okCount := 0
	for _, ob := range pl.Observers {
		addr, err := s.supernodeGRPCAddr(ctx, ob)
		if err != nil {
			continue // unreachable observer simply contributes no vote
		}
		// NOTE(review): "verificationTimout" keeps the (misspelled) identifier
		// declared elsewhere in this file.
		vr, verr := s.verifySelfHealing(ctx, ob, addr, &supernode.VerifySelfHealingRequest{
			ChallengeId:          event.ChallengeID,
			EpochId:              pl.EpochID,
			FileKey:              pl.FileKey,
			RecipientId:          pl.Recipient,
			ReconstructedHashHex: resp.ReconstructedHashHex,
			ObserverId:           ob,
			ActionId:             pl.ActionID,
		}, verificationTimout)
		ok := verr == nil && vr != nil && vr.Ok
		if ok {
			okCount++
		}
		s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingVerificationMessage), map[string]any{"event": "observer_result", "observer_id": ob, "ok": ok})
	}
	if okCount < s.cfg.ObserverThreshold {
		s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "failed", "reason": "observer_quorum_failed", "ok_count": okCount})
		return &eventProcessError{Reason: "observer_quorum_failed", Retryable: true}
	}
	// Phase 3: quorum met — instruct the recipient to store the reconstruction.
	commitReq := &supernode.CommitSelfHealingRequest{
		ChallengeId:  event.ChallengeID,
		EpochId:      pl.EpochID,
		FileKey:      pl.FileKey,
		ActionId:     pl.ActionID,
		ChallengerId: s.identity,
		RecipientId:  pl.Recipient,
	}
	commitResp, cerr := s.commitSelfHealing(ctx, pl.Recipient, recipientAddr, commitReq, commitTimeout)
	if cerr != nil || commitResp == nil || !commitResp.Stored {
		reason := "commit_failed"
		if commitResp != nil && strings.TrimSpace(commitResp.Error) != "" {
			reason = strings.ToLower(strings.TrimSpace(commitResp.Error))
		}
		if cerr != nil {
			s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "failed", "reason": reason, "error": cerr.Error()})
			return &eventProcessError{Reason: "commit_rpc_error", Retryable: true, Err: cerr}
		}
		s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "failed", "reason": reason})
		if strings.Contains(reason, "stale") {
			return &eventProcessError{Reason: reason, Retryable: false}
		}
		return &eventProcessError{Reason: reason, Retryable: true}
	}
	s.persistExecution(event.TriggerID, event.ChallengeID, int(types.SelfHealingCompletionMessage), map[string]any{"event": "completed", "result": "healed", "ok_count": okCount})
	return nil
}

// networkSnapshot lists on-chain supernodes and splits them into the sorted
// set of chain-active accounts and the subset our local ping table considers
// online; addrMap carries each active node's latest announced address.
func (s *Service) networkSnapshot(ctx context.Context) (active []string, online []string, addrMap map[string]string, err error) {
	resp, err := s.lumera.SuperNode().ListSuperNodes(ctx)
	if err != nil || resp == nil {
		// NOTE(review): when resp == nil with err == nil this wraps a nil error
		// via %w — confirm callers only check err != nil (they do here).
		return nil, nil, nil, fmt.Errorf("list supernodes: %w", err)
	}
	addrMap = map[string]string{}
	for _, sn := range resp.Supernodes {
		if sn == nil || strings.TrimSpace(sn.SupernodeAccount) == "" {
			continue
		}
		if !isActiveSN(sn) {
			continue
		}
		id := strings.TrimSpace(sn.SupernodeAccount)
		active = append(active, id)
		// Address and ping lookups are best-effort; errors just omit the entry.
		latest, _ := s.lumera.SuperNode().GetSupernodeWithLatestAddress(ctx, id)
		if latest != nil {
			addrMap[id] = latest.LatestAddress
		}
		pi, _ := s.store.GetPingInfoBySupernodeID(id)
		if pi != nil && pi.IsOnline {
			online = append(online, id)
		}
	}
	// Sorted output keeps leader election and hashing deterministic across nodes.
	sort.Strings(active)
	sort.Strings(online)
	return active, online, addrMap, nil
}

// auditWeightedWatchlist derives the watchlist from on-chain audit reports:
// a node is listed when enough distinct reporters observed its required ports
// closed, per the chain's audit parameters.
func (s *Service) auditWeightedWatchlist(ctx context.Context, active []string) ([]string, error) {
	if s.lumera == nil || s.lumera.Audit() == nil {
		return nil, fmt.Errorf("audit module unavailable")
	}

	paramsResp, err := s.lumera.Audit().GetParams(ctx)
	if err != nil || paramsResp == nil {
		return nil, fmt.Errorf("get audit params: %w", err)
	}
	params := paramsResp.Params.WithDefaults()
	if err := params.Validate(); err != nil {
		return nil, fmt.Errorf("invalid audit params: %w", err)
	}

	epochResp, err :=
s.lumera.Audit().GetCurrentEpoch(ctx)
	if err != nil || epochResp == nil {
		return nil, fmt.Errorf("get current epoch: %w", err)
	}
	epochID := epochResp.EpochId

	requiredPorts := len(params.RequiredOpenPorts)
	if requiredPorts == 0 {
		// No required ports configured: nothing can be judged closed.
		return nil, nil
	}

	// Clamp chain parameters to sane minimums so a zero value cannot disable
	// quorum entirely.
	minReporters := int(params.PeerQuorumReports)
	if minReporters <= 0 {
		minReporters = 1
	}
	thresholdPercent := int(params.PeerPortPostponeThresholdPercent)
	if thresholdPercent <= 0 {
		thresholdPercent = 100
	}

	watch := make([]string, 0)
	for _, target := range active {
		if target == s.identity {
			continue // never self-watchlist
		}

		reportsResp, err := s.lumera.Audit().GetStorageChallengeReports(ctx, target, epochID)
		if err != nil || reportsResp == nil {
			continue // missing reports => no vote either way
		}

		total, closed := countWeightedClosedVotes(reportsResp.Reports, requiredPorts)
		if total < minReporters {
			continue
		}
		// Integer form of closed/total >= thresholdPercent/100 (no float drift).
		if closed*100 >= thresholdPercent*total {
			watch = append(watch, target)
		}
	}

	sort.Strings(watch)
	logtrace.Debug(ctx, "self-healing weighted watchlist computed", logtrace.Fields{
		"epoch_id":        epochID,
		"watchlist_count": len(watch),
		"active_count":    len(active),
		"min_reporters":   minReporters,
		"threshold_pct":   thresholdPercent,
	})
	return watch, nil
}

// countWeightedClosedVotes collapses reports to one vote per distinct reporter
// (first report wins) and returns how many reporters produced a classifiable
// observation and how many of those judged the node's ports closed.
func countWeightedClosedVotes(reports []audittypes.StorageChallengeReport, requiredPortsLen int) (total int, closed int) {
	if requiredPortsLen <= 0 {
		return 0, 0
	}
	reporterVotes := make(map[string]bool, len(reports))
	for _, report := range reports {
		reporter := strings.TrimSpace(report.ReporterSupernodeAccount)
		if reporter == "" {
			continue
		}
		if _, exists := reporterVotes[reporter]; exists {
			continue // deduplicate: keep only the reporter's first report
		}
		known, isClosed := classifyReporterObservation(report.PortStates, requiredPortsLen)
		if !known {
			continue
		}
		reporterVotes[reporter] = isClosed
	}

	for _, isClosed := range reporterVotes {
		total++
		if isClosed {
			closed++
		}
	}
	return total, closed
}

// classifyReporterObservation maps one reporter's port states onto a tri-state
// verdict: (true,true) any port closed, (true,false) all open, (false,_) the
// observation is unusable (wrong port count, or a mix of open/unknown).
func classifyReporterObservation(states []audittypes.PortState, requiredPortsLen int) (known bool, isClosed bool) {
	if len(states) != requiredPortsLen {
		return false, false
	}
	allOpen := true
	hasClosed := false
	for _, st := range states {
		switch st {
		case audittypes.PortState_PORT_STATE_OPEN:
			continue
		case audittypes.PortState_PORT_STATE_CLOSED:
			hasClosed = true
			allOpen = false
		default:
			// Unknown observations are intentionally excluded from weighting.
			allOpen = false
		}
	}
	if hasClosed {
		return true, true
	}
	if allOpen {
		return true, false
	}
	return false, false
}

// actionHealingTarget pairs an on-chain cascade action with the anchor file
// key used for deterministic replica/recipient selection.
type actionHealingTarget struct {
	ActionID string
	FileKey  string
}

// listCascadeHealingTargets pages through DONE and APPROVED cascade actions
// on-chain and returns one (actionID, anchor key) target per action, sorted
// deterministically. Results are cached under targetsCacheMu for
// cfg.ActionTargetsTTL.
func (s *Service) listCascadeHealingTargets(ctx context.Context) ([]actionHealingTarget, error) {
	if cached, ok := s.cachedTargets(); ok {
		return cached, nil
	}

	s.targetsCacheMu.Lock()
	defer s.targetsCacheMu.Unlock()
	// Re-check under write lock in case another caller refreshed already.
	if cached, ok := s.cachedTargetsLocked(); ok {
		return cached, nil
	}

	if s.lumera == nil || s.lumera.Action() == nil {
		return nil, fmt.Errorf("action module unavailable")
	}

	states := []actiontypes.ActionState{
		actiontypes.ActionStateDone,
		actiontypes.ActionStateApproved,
	}

	targets := make([]actionHealingTarget, 0)
	seenByAction := make(map[string]struct{}) // dedupe across the two state queries
	for _, state := range states {
		var nextKey []byte
		for {
			resp, err := s.lumera.Action().ListActions(ctx, &actiontypes.QueryListActionsRequest{
				ActionType:  actiontypes.ActionTypeCascade,
				ActionState: state,
				Pagination: &query.PageRequest{
					Key:   nextKey,
					Limit: uint64(s.cfg.ActionPageLimit),
				},
			})
			if err != nil {
				return nil, fmt.Errorf("list cascade actions (state=%s): %w", state.String(), err)
			}
			if resp == nil {
				break
			}

			for _, act := range resp.Actions {
				if act == nil {
					continue
				}
				actionID := strings.TrimSpace(act.ActionID)
				if actionID == "" {
					continue
				}
				if _, seen := seenByAction[actionID]; seen {
					continue
				}
				metadata := act.Metadata
				if len(metadata) == 0 {
					continue
				}
				// Skip actions whose metadata does not parse or that carry no
				// usable RQ-ID key — they cannot be healed deterministically.
				cascadeMeta, err := cascadekit.UnmarshalCascadeMetadata(metadata)
				if err != nil {
					continue
				}
				fileKey := pickActionAnchorKey(cascadeMeta.RqIdsIds)
				if fileKey == "" {
					continue
				}
				seenByAction[actionID] = struct{}{}
				targets = append(targets, actionHealingTarget{
					ActionID: actionID,
					FileKey:  fileKey,
				})
			}

			if resp.Pagination == nil || len(resp.Pagination.NextKey) == 0 {
				break
			}
			// Reuse nextKey's backing array while copying the new cursor.
			nextKey = append(nextKey[:0], resp.Pagination.NextKey...)
		}
	}

	// Deterministic order so every node derives the same per-window selection.
	sort.Slice(targets, func(i, j int) bool {
		if targets[i].ActionID == targets[j].ActionID {
			return targets[i].FileKey < targets[j].FileKey
		}
		return targets[i].ActionID < targets[j].ActionID
	})
	logtrace.Debug(ctx, "self-healing on-chain action targets listed", logtrace.Fields{
		"targets_count": len(targets),
	})
	s.targetsCache = append(s.targetsCache[:0], targets...)
	s.targetsCached = time.Now().UTC()
	return targets, nil
}

// cachedTargets returns a copy of the target cache under the read lock.
func (s *Service) cachedTargets() ([]actionHealingTarget, bool) {
	s.targetsCacheMu.RLock()
	defer s.targetsCacheMu.RUnlock()
	return s.cachedTargetsLocked()
}

// cachedTargetsLocked returns a defensive copy of the cache if it is
// non-empty and within ActionTargetsTTL; the caller must hold targetsCacheMu
// (read or write).
func (s *Service) cachedTargetsLocked() ([]actionHealingTarget, bool) {
	if len(s.targetsCache) == 0 {
		return nil, false
	}
	if s.cfg.ActionTargetsTTL > 0 && time.Since(s.targetsCached) > s.cfg.ActionTargetsTTL {
		return nil, false
	}
	out := make([]actionHealingTarget, len(s.targetsCache))
	copy(out, s.targetsCache)
	return out, true
}

// pickActionAnchorKey returns the lexicographically smallest non-blank key,
// giving every node the same stable anchor for an action.
func pickActionAnchorKey(keys []string) string {
	anchor := ""
	for _, raw := range keys {
		key := strings.TrimSpace(raw)
		if key == "" {
			continue
		}
		if anchor == "" || key < anchor {
			anchor = key
		}
	}
	return anchor
}

// selectWindowTargets caps the per-window target set to limit entries,
// starting at a window-derived offset so successive windows rotate through
// the full list; the input is never mutated.
func selectWindowTargets(targets []actionHealingTarget, windowID int64, limit int) []actionHealingTarget {
	n := len(targets)
	if n == 0 || limit <= 0 || n <= limit {
		out := make([]actionHealingTarget, n)
		copy(out, targets)
		return out
	}

	// Rotate: start at windowID mod n (normalized non-negative) and wrap.
	start := int(windowID % int64(n))
	if start < 0 {
		start += n
	}
	out := make([]actionHealingTarget, 0, limit)
	for i := 0; i < limit; i++ {
		idx := (start + i) % n
		out = append(out, targets[idx])
	}
	return out
}

// isActiveSN reports whether the supernode's highest-height state record is
// ACTIVE; a node with no state records is treated as inactive.
func isActiveSN(sn *sntypes.SuperNode) bool {
	var latest *sntypes.SuperNodeStateRecord
	for _, st := range sn.States {
		if st == nil {
			continue
		}
		if latest == nil || st.Height > latest.Height {
			latest = st
		}
	}
	if latest == nil {
		return false
	}
	return latest.State == sntypes.SuperNodeStateActive
}

// persistExecution best-effort records one execution-metric row for the
// challenge; marshal or insert failures are silently dropped.
func (s *Service) persistExecution(triggerID, challengeID string, msgType int, payload map[string]any) {
	bz, err := json.Marshal(payload)
	if err != nil {
		return
	}
	_ = s.store.InsertSelfHealingExecutionMetrics(types.SelfHealingExecutionMetric{TriggerID: triggerID, ChallengeID: challengeID, MessageType: msgType, Data: bz, SenderID: s.identity})
}

// supernodeGRPCAddr resolves a supernode account to a dialable host:port,
// falling back to the service's configured gRPC port when the announced
// address carries none.
func (s *Service) supernodeGRPCAddr(ctx context.Context, supernodeAccount string) (string, error) {
	info, err := s.lumera.SuperNode().GetSupernodeWithLatestAddress(ctx, supernodeAccount)
	if err != nil || info == nil {
		return "", fmt.Errorf("resolve supernode address: %w", err)
	}
	raw := strings.TrimSpace(info.LatestAddress)
	if raw == "" {
		return "", fmt.Errorf("no ip address for supernode %s", supernodeAccount)
	}
	host, port, ok := parseHostAndPort(raw, int(s.grpcPort))
	if !ok || strings.TrimSpace(host) == "" {
		return "", fmt.Errorf("invalid supernode address for %s: %q", supernodeAccount, raw)
	}
	return net.JoinHostPort(strings.TrimSpace(host), strconv.Itoa(port)), nil
}

// retryDelayForAttempt returns EventRetryBase doubled per prior attempt,
// clamped to EventRetryMax.
func (s *Service) retryDelayForAttempt(attempt int) time.Duration {
	if attempt <= 0 {
		return s.cfg.EventRetryBase
	}
	delay := s.cfg.EventRetryBase
	for i := 1; i < attempt; i++ {
		delay *= 2
		if delay >= s.cfg.EventRetryMax {
			return s.cfg.EventRetryMax
		}
	}
	if delay > s.cfg.EventRetryMax {
		return s.cfg.EventRetryMax
	}
	return delay
}

// requestSelfHealing dispatches through the test-seam override when set,
// otherwise performs the real gRPC call.
func (s *Service) requestSelfHealing(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
	if s.requestSelfHealingFn != nil {
		return s.requestSelfHealingFn(ctx, remoteIdentity, address, req, timeout)
	}
	return s.callRequestSelfHealing(ctx, remoteIdentity, address, req, timeout)
}

// verifySelfHealing dispatches through the test-seam override when set,
// otherwise performs the real gRPC call.
func (s *Service) verifySelfHealing(ctx context.Context, remoteIdentity string, address string, req *supernode.VerifySelfHealingRequest, timeout time.Duration) (*supernode.VerifySelfHealingResponse, error) {
	if s.verifySelfHealingFn != nil {
		return s.verifySelfHealingFn(ctx, remoteIdentity, address, req, timeout)
	}
	return s.callVerifySelfHealing(ctx, remoteIdentity, address, req, timeout)
}

// commitSelfHealing dispatches through the test-seam override when set,
// otherwise performs the real gRPC call.
func (s *Service) commitSelfHealing(ctx context.Context, remoteIdentity string, address string, req *supernode.CommitSelfHealingRequest, timeout time.Duration) (*supernode.CommitSelfHealingResponse, error) {
	if s.commitSelfHealingFn != nil {
		return s.commitSelfHealingFn(ctx, remoteIdentity, address, req, timeout)
	}
	return s.callCommitSelfHealing(ctx, remoteIdentity, address, req, timeout)
}

// callRequestSelfHealing dials "identity@address" with the service's gRPC
// options under the given timeout and issues RequestSelfHealing; the
// connection is closed before returning.
func (s *Service) callRequestSelfHealing(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
	cctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	conn, err := s.grpcClient.Connect(cctx, fmt.Sprintf("%s@%s", strings.TrimSpace(remoteIdentity), address), s.grpcOpts)
	if err != nil {
		return nil, err
	}
	defer conn.Close()
	client := supernode.NewSelfHealingServiceClient(conn)
	return client.RequestSelfHealing(cctx, req)
}

// callVerifySelfHealing dials the observer and issues VerifySelfHealing.
func (s *Service) callVerifySelfHealing(ctx context.Context, remoteIdentity string, address string, req *supernode.VerifySelfHealingRequest, timeout time.Duration) (*supernode.VerifySelfHealingResponse, error) {
	cctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	conn, err := s.grpcClient.Connect(cctx, fmt.Sprintf("%s@%s", strings.TrimSpace(remoteIdentity), address), s.grpcOpts)
	if err != nil {
		return nil, err
	}
	defer conn.Close()
	client := supernode.NewSelfHealingServiceClient(conn)
	return client.VerifySelfHealing(cctx, req)
}

// callCommitSelfHealing dials the recipient and issues CommitSelfHealing.
func (s *Service) callCommitSelfHealing(ctx context.Context, remoteIdentity string, address string, req *supernode.CommitSelfHealingRequest, timeout time.Duration) (*supernode.CommitSelfHealingResponse, error) {
	cctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	conn, err := s.grpcClient.Connect(cctx, fmt.Sprintf("%s@%s", strings.TrimSpace(remoteIdentity), address), s.grpcOpts)
	if err != nil {
		return nil, err
	}
	defer conn.Close()
	client := supernode.NewSelfHealingServiceClient(conn)
	return client.CommitSelfHealing(cctx, req)
}

// parseHostAndPort extracts host and port from a loose address string
// (optionally URL-formed, optionally portless — defaultPort then applies).
// ok is false for blank input or a bare host containing separator/meta
// characters.
func parseHostAndPort(address string, defaultPort int) (host string, port int, ok bool) {
	address = strings.TrimSpace(address)
	if address == "" {
		return "", 0, false
	}
	// If the string parses as a URL with a host component, work on that.
	if u, err := url.Parse(address); err == nil && u.Host != "" {
		address = u.Host
	}
	if h, p, err := net.SplitHostPort(address); err == nil {
		h = strings.TrimSpace(h)
		if h == "" {
			return "", 0, false
		}
		if n, err := strconv.Atoi(p); err == nil && n > 0 && n <= 65535 {
			return h, n, true
		}
		// Unparsable/out-of-range port: keep the host, substitute the default.
		return h, defaultPort, true
	}
	host = strings.TrimSpace(address)
	if host == "" {
		return "", 0, false
	}
	if strings.ContainsAny(host, " \t\r\n/\\?#@[]") {
		return "", 0, false
	}
	return host, defaultPort, true
}

// deriveWindowChallengeID derives a stable hex challenge ID from the window
// and key so all nodes (and duplicate deliveries) agree on the same ID.
// NOTE(review): callers pass target.ActionID for the fileKey parameter —
// naming mismatch only; confirm intent.
func deriveWindowChallengeID(windowID int64, fileKey string) string {
	msg := []byte(fmt.Sprintf("sh:v1:%d:%s", windowID, fileKey))
	sum := blake3.Sum256(msg)
	return hex.EncodeToString(sum[:])
}

// electLeader picks the node whose FNV-1a hash of (window, salt, node) is
// smallest — deterministic given identical inputs on every node.
func electLeader(nodes []string, windowID int64, salt string) string {
	best := ""
	var bestScore uint64
	for i, n := range nodes {
		h := fnv.New64a()
		_, _ = h.Write([]byte(fmt.Sprintf("%d:%s:%s", windowID, salt, n)))
		s := h.Sum64()
		if i == 0 || s < bestScore {
			bestScore = s
			best = n
		}
	}
	return best
}

// hashList returns the blake3 hex digest of the sorted, comma-joined items;
// the input slice is not modified.
func hashList(items []string) string {
	cp := append([]string(nil), items...)
	sort.Strings(cp)
	msg := strings.Join(cp, ",")
	sum := blake3.Sum256([]byte(msg))
	return hex.EncodeToString(sum[:])
}

// toSet converts a slice to a membership set.
func toSet(items []string) map[string]struct{} {
	m := map[string]struct{}{}
	for _, it := range items {
		m[it] = struct{}{}
	}
	return m
}

// allInSet reports whether every item is in set; an empty items slice is
// deliberately false (no holders means nothing to judge).
func allInSet(items []string, set map[string]struct{}) bool {
	if len(items) == 0 {
		return false
	}
	for _, it := range items {
		if _, ok := set[it]; !ok {
			return false
		}
	}
	return true
}

// filterEligibleRecipients keeps online nodes that are neither self nor on
// the watchlist.
func filterEligibleRecipients(online []string, watch map[string]struct{}, self string) []string {
	out := make([]string, 0)
	for _, n := range online {
		if n == self {
			continue
		}
		if _, bad := watch[n]; bad {
			continue
		}
		out = append(out, n)
	}
	return out
}

// pickDeterministicNode returns the single deterministic replica choice for
// key, or "" when none can be selected.
func pickDeterministicNode(nodes []string, key string) string {
	if len(nodes) == 0 {
		return ""
	}
	sel, err := deterministic.SelectReplicaSet(nodes, key, 1)
	if err != nil || len(sel) == 0 {
		return ""
	}
	return sel[0]
}

// pickTopNDeterministic returns up to n deterministic selections for key,
// or nil on error/empty input.
func pickTopNDeterministic(nodes []string, key string, n int) []string {
	if len(nodes) == 0 || n <= 0 {
		return nil
	}
	sel, err := deterministic.SelectReplicaSet(nodes, key, uint32(n))
	if err != nil {
		return nil
	}
	return sel
}

// removeOne returns a copy of items with every occurrence of target removed.
func removeOne(items []string, target string) []string {
	out := make([]string, 0, len(items))
	for _, it := range items {
		if it == target {
			continue
		}
		out = append(out, it)
	}
	return out
}

// maxInt returns the larger of a and b.
// NOTE(review): replaceable by the built-in max() if the module targets
// Go 1.21+ — cannot confirm from this chunk.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}
diff --git a/supernode/self_healing/service_test.go b/supernode/self_healing/service_test.go
new file mode 100644
index 00000000..d012c1e8
--- /dev/null
+++
b/supernode/self_healing/service_test.go @@ -0,0 +1,609 @@
package self_healing

import (
	"context"
	"encoding/json"
	"errors"
	"testing"
	"time"

	actiontypes "github.com/LumeraProtocol/lumera/x/action/v1/types"
	audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types"
	"github.com/LumeraProtocol/supernode/v2/gen/supernode"
	lumeraclient "github.com/LumeraProtocol/supernode/v2/pkg/lumera"
	actionmodule "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/action"
	lumerasn "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/supernode"
	"github.com/LumeraProtocol/supernode/v2/pkg/storage/queries"
	"github.com/LumeraProtocol/supernode/v2/pkg/types"
	// NOTE(review): github.com/golang/protobuf is deprecated in favor of
	// google.golang.org/protobuf — confirm this import is still needed.
	"github.com/golang/protobuf/proto"
	"go.uber.org/mock/gomock"
)

// TestProcessEventsRecipientDownTerminal: a request RPC failure with
// MaxEventAttempts=1 must mark the event terminal and processed after
// exactly one attempt.
func TestProcessEventsRecipientDownTerminal(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold: 2,
		MaxEventAttempts:  1,
		MaxWindowAge:      time.Hour,
	})
	defer cleanup()

	if err := insertEvent(t, store, "ch-recipient-down", challengeEventPayload{
		WindowID:  time.Now().UTC().Unix(),
		FileKey:   "file-1",
		Recipient: "recipient-1",
		Observers: []string{"observer-1", "observer-2"},
	}); err != nil {
		t.Fatalf("insert event: %v", err)
	}

	// Simulate an unreachable recipient.
	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		return nil, errors.New("dial failed")
	}

	svc.processEvents(context.Background())

	ev, err := store.GetSelfHealingChallengeEvent("ch-recipient-down")
	if err != nil {
		t.Fatalf("get event: %v", err)
	}
	if got, want := ev.Status, "terminal"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
	if !ev.IsProcessed {
		t.Fatalf("expected event marked processed")
	}
	if ev.AttemptCount != 1 {
		t.Fatalf("attempt_count=%d want=1", ev.AttemptCount)
	}
}

// TestProcessEventsQuorumFailureTerminalAtMaxAttempts: only one of two
// observers verifies OK (< ObserverThreshold=2), so with MaxEventAttempts=1
// the event goes terminal.
func TestProcessEventsQuorumFailureTerminalAtMaxAttempts(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold: 2,
		MaxEventAttempts:  1,
		MaxWindowAge:      time.Hour,
	})
	defer cleanup()

	if err := insertEvent(t, store, "ch-quorum-fail", challengeEventPayload{
		WindowID:  time.Now().UTC().Unix(),
		FileKey:   "file-2",
		Recipient: "recipient-1",
		Observers: []string{"observer-1", "observer-2"},
	}); err != nil {
		t.Fatalf("insert event: %v", err)
	}

	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		return &supernode.RequestSelfHealingResponse{
			ChallengeId:            req.ChallengeId,
			RecipientId:            req.RecipientId,
			Accepted:               true,
			ReconstructionRequired: true,
			ReconstructedHashHex:   "abcd",
		}, nil
	}
	svc.verifySelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.VerifySelfHealingRequest, timeout time.Duration) (*supernode.VerifySelfHealingResponse, error) {
		if remoteIdentity == "observer-1" {
			return &supernode.VerifySelfHealingResponse{ChallengeId: req.ChallengeId, ObserverId: remoteIdentity, Ok: true}, nil
		}
		return &supernode.VerifySelfHealingResponse{ChallengeId: req.ChallengeId, ObserverId: remoteIdentity, Ok: false, Error: "mismatch"}, nil
	}

	svc.processEvents(context.Background())

	ev, err := store.GetSelfHealingChallengeEvent("ch-quorum-fail")
	if err != nil {
		t.Fatalf("get event: %v", err)
	}
	if got, want := ev.Status, "terminal"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
	if !ev.IsProcessed {
		t.Fatalf("expected event marked processed")
	}
}

// TestProcessEventsCommitAfterObserverQuorum: full happy path — request,
// unanimous observer quorum, then the commit RPC must fire with the right
// action ID and the event completes.
func TestProcessEventsCommitAfterObserverQuorum(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold: 2,
		MaxEventAttempts:  3,
		MaxWindowAge:      time.Hour,
	})
	defer cleanup()

	if err := insertEvent(t, store, "ch-commit-after-quorum", challengeEventPayload{
		EpochID:   21,
		WindowID:  time.Now().UTC().Unix(),
		ActionID:  "action-commit-1",
		FileKey:   "file-commit-1",
		Recipient: "recipient-1",
		Observers: []string{"observer-1", "observer-2"},
	}); err != nil {
		t.Fatalf("insert event: %v", err)
	}

	commitCalled := false
	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		return &supernode.RequestSelfHealingResponse{
			ChallengeId:            req.ChallengeId,
			RecipientId:            req.RecipientId,
			Accepted:               true,
			ReconstructionRequired: true,
			ReconstructedHashHex:   "abcd",
		}, nil
	}
	svc.verifySelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.VerifySelfHealingRequest, timeout time.Duration) (*supernode.VerifySelfHealingResponse, error) {
		return &supernode.VerifySelfHealingResponse{
			ChallengeId: req.ChallengeId,
			ObserverId:  remoteIdentity,
			Ok:          true,
		}, nil
	}
	svc.commitSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.CommitSelfHealingRequest, timeout time.Duration) (*supernode.CommitSelfHealingResponse, error) {
		commitCalled = true
		if req.ActionId != "action-commit-1" {
			t.Fatalf("unexpected action_id in commit request: %s", req.ActionId)
		}
		return &supernode.CommitSelfHealingResponse{
			ChallengeId: req.ChallengeId,
			RecipientId: remoteIdentity,
			Stored:      true,
		}, nil
	}

	svc.processEvents(context.Background())

	if !commitCalled {
		t.Fatalf("expected commit RPC to be called after observer quorum")
	}
	ev, err := store.GetSelfHealingChallengeEvent("ch-commit-after-quorum")
	if err != nil {
		t.Fatalf("get event: %v", err)
	}
	if got, want := ev.Status, "completed"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
	if !ev.IsProcessed {
		t.Fatalf("expected event marked processed")
	}
}

// TestProcessEventsStaleWindowTerminalWithoutRPC: an event older than
// MaxWindowAge must go terminal before any RPC is attempted.
func TestProcessEventsStaleWindowTerminalWithoutRPC(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold: 2,
		MaxEventAttempts:  3,
		MaxWindowAge:      5 * time.Minute,
	})
	defer cleanup()

	if err := insertEvent(t, store, "ch-stale-window", challengeEventPayload{
		WindowID:  time.Now().UTC().Add(-2 * time.Hour).Unix(),
		FileKey:   "file-3",
		Recipient: "recipient-1",
		Observers: []string{"observer-1", "observer-2"},
	}); err != nil {
		t.Fatalf("insert event: %v", err)
	}

	rpcCalled := false
	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		rpcCalled = true
		return nil, errors.New("unexpected call")
	}

	svc.processEvents(context.Background())

	if rpcCalled {
		t.Fatalf("request RPC should not be called for stale window")
	}
	ev, err := store.GetSelfHealingChallengeEvent("ch-stale-window")
	if err != nil {
		t.Fatalf("get event: %v", err)
	}
	if got, want := ev.Status, "terminal"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
}

// TestProcessEventsDuplicateReplayProcessedOnce: a second insert of the same
// challenge ID must be absorbed by the store's unique key, and repeated
// processing ticks must not re-run a completed event.
func TestProcessEventsDuplicateReplayProcessedOnce(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold: 1,
		MaxEventAttempts:  3,
		MaxWindowAge:      time.Hour,
	})
	defer cleanup()

	payload := challengeEventPayload{
		WindowID:  time.Now().UTC().Unix(),
		FileKey:   "file-4",
		Recipient: "recipient-1",
		Observers: []string{"observer-1"},
	}
	if err := insertEvent(t, store, "ch-dup", payload); err != nil {
		t.Fatalf("insert event: %v", err)
	}
	// Duplicate delivery should be ignored by deterministic unique key.
	if err := insertEvent(t, store, "ch-dup", payload); err != nil {
		t.Fatalf("insert duplicate event: %v", err)
	}

	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		return &supernode.RequestSelfHealingResponse{
			ChallengeId:            req.ChallengeId,
			RecipientId:            req.RecipientId,
			Accepted:               true,
			ReconstructionRequired: false,
		}, nil
	}

	svc.processEvents(context.Background())
	svc.processEvents(context.Background())

	ev, err := store.GetSelfHealingChallengeEvent("ch-dup")
	if err != nil {
		t.Fatalf("get event: %v", err)
	}
	if got, want := ev.Status, "completed"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
	if ev.AttemptCount != 1 {
		t.Fatalf("attempt_count=%d want=1", ev.AttemptCount)
	}
}

// TestProcessEventsRetryThenComplete: first attempt fails transiently (status
// "retry", attempt 1); after the backoff elapses the second attempt succeeds
// (status "completed", attempt 2).
func TestProcessEventsRetryThenComplete(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold: 1,
		MaxEventAttempts:  3,
		EventRetryBase:    10 * time.Millisecond,
		EventRetryMax:     20 * time.Millisecond,
		MaxWindowAge:      time.Hour,
	})
	defer cleanup()

	if err := insertEvent(t, store, "ch-retry-then-complete", challengeEventPayload{
		WindowID:  time.Now().UTC().Unix(),
		FileKey:   "file-5",
		Recipient: "recipient-1",
		Observers: []string{"observer-1"},
	}); err != nil {
		t.Fatalf("insert event: %v", err)
	}

	calls := 0
	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		calls++
		if calls == 1 {
			return nil, errors.New("temporary network failure")
		}
		return &supernode.RequestSelfHealingResponse{
			ChallengeId:            req.ChallengeId,
			RecipientId:            req.RecipientId,
			Accepted:               true,
			ReconstructionRequired: false,
		}, nil
	}

	svc.processEvents(context.Background())

	ev, err := store.GetSelfHealingChallengeEvent("ch-retry-then-complete")
	if err != nil {
		t.Fatalf("get event after first attempt: %v", err)
	}
	if got, want := ev.Status, "retry"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
	if ev.AttemptCount != 1 {
		t.Fatalf("attempt_count=%d want=1", ev.AttemptCount)
	}

	time.Sleep(20 * time.Millisecond)
	svc.processEvents(context.Background())

	ev, err = store.GetSelfHealingChallengeEvent("ch-retry-then-complete")
	if err != nil {
		t.Fatalf("get event after second attempt: %v", err)
	}
	if got, want := ev.Status, "completed"; got != want {
		t.Fatalf("status=%s want=%s", got, want)
	}
	if ev.AttemptCount != 2 {
		t.Fatalf("attempt_count=%d want=2", ev.AttemptCount)
	}
}

// TestProcessEventsReclaimsExpiredLease: another owner claims the event with
// a short lease; once it expires, this service must be able to reclaim and
// process it. (Test body continues beyond this chunk.)
func TestProcessEventsReclaimsExpiredLease(t *testing.T) {
	svc, store, cleanup := newServiceForEventTests(t, Config{
		ObserverThreshold:  1,
		MaxEventAttempts:   3,
		EventLeaseDuration: 20 * time.Millisecond,
		MaxWindowAge:       time.Hour,
	})
	defer cleanup()

	if err := insertEvent(t, store, "ch-expired-lease", challengeEventPayload{
		WindowID:  time.Now().UTC().Unix(),
		FileKey:   "file-6",
		Recipient: "recipient-1",
		Observers: []string{"observer-1"},
	}); err != nil {
		t.Fatalf("insert event: %v", err)
	}

	claimed, err := store.ClaimPendingSelfHealingChallengeEvents(context.Background(), "other-node", 10*time.Millisecond, 1)
	if err != nil {
		t.Fatalf("claim event as other-node: %v", err)
	}
	if len(claimed) != 1 {
		t.Fatalf("expected 1 claimed event, got %d", len(claimed))
	}

	time.Sleep(20 * time.Millisecond)

	svc.requestSelfHealingFn = func(ctx context.Context, remoteIdentity string, address string, req *supernode.RequestSelfHealingRequest, timeout time.Duration) (*supernode.RequestSelfHealingResponse, error) {
		return &supernode.RequestSelfHealingResponse{
			ChallengeId:            req.ChallengeId,
			RecipientId:            req.RecipientId,
			Accepted:               true,
			ReconstructionRequired: false,
		}, nil
	}

svc.processEvents(context.Background()) + + ev, err := store.GetSelfHealingChallengeEvent("ch-expired-lease") + if err != nil { + t.Fatalf("get event: %v", err) + } + if got, want := ev.Status, "completed"; got != want { + t.Fatalf("status=%s want=%s", got, want) + } + if ev.AttemptCount != 2 { + t.Fatalf("attempt_count=%d want=2", ev.AttemptCount) + } +} + +func TestCountWeightedClosedVotes(t *testing.T) { + reports := []audittypes.StorageChallengeReport{ + {ReporterSupernodeAccount: "rep-1", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_OPEN, audittypes.PortState_PORT_STATE_CLOSED}}, + {ReporterSupernodeAccount: "rep-2", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_OPEN, audittypes.PortState_PORT_STATE_OPEN}}, + {ReporterSupernodeAccount: "rep-3", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_OPEN, audittypes.PortState_PORT_STATE_CLOSED}}, + } + total, closed := countWeightedClosedVotes(reports, 2) + if total != 3 || closed != 2 { + t.Fatalf("got total=%d closed=%d, want total=3 closed=2", total, closed) + } +} + +func TestDeriveWindowChallengeIDIgnoresRecipientSelection(t *testing.T) { + first := deriveWindowChallengeID(1710000000, "file-key-1") + second := deriveWindowChallengeID(1710000000, "file-key-1") + if first != second { + t.Fatalf("challenge id should be stable for same window/file; got %q vs %q", first, second) + } +} + +func TestAuditWeightedWatchlistUsesQuorumAndThreshold(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + lumeraClient := lumeraclient.NewMockClient(ctrl) + auditModule := &stubAuditModule{ + paramsResp: &audittypes.QueryParamsResponse{ + Params: audittypes.Params{ + PeerQuorumReports: 3, + PeerPortPostponeThresholdPercent: 66, + RequiredOpenPorts: []uint32{4444, 4445}, + }, + }, + currentEpochResp: &audittypes.QueryCurrentEpochResponse{EpochId: 55}, + reportsByTarget: map[string][]audittypes.StorageChallengeReport{ + "target-a": { + 
{ReporterSupernodeAccount: "rep-1", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_CLOSED, audittypes.PortState_PORT_STATE_OPEN}}, + {ReporterSupernodeAccount: "rep-2", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_CLOSED, audittypes.PortState_PORT_STATE_OPEN}}, + {ReporterSupernodeAccount: "rep-3", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_OPEN, audittypes.PortState_PORT_STATE_OPEN}}, + }, + "target-b": { + {ReporterSupernodeAccount: "rep-1", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_OPEN, audittypes.PortState_PORT_STATE_OPEN}}, + {ReporterSupernodeAccount: "rep-2", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_CLOSED, audittypes.PortState_PORT_STATE_OPEN}}, + {ReporterSupernodeAccount: "rep-3", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_OPEN, audittypes.PortState_PORT_STATE_OPEN}}, + }, + "target-c": { + {ReporterSupernodeAccount: "rep-1", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_CLOSED, audittypes.PortState_PORT_STATE_OPEN}}, + {ReporterSupernodeAccount: "rep-2", PortStates: []audittypes.PortState{audittypes.PortState_PORT_STATE_CLOSED, audittypes.PortState_PORT_STATE_OPEN}}, + }, + }, + } + lumeraClient.EXPECT().Audit().AnyTimes().Return(auditModule) + + svc := &Service{ + cfg: Config{ + WatchlistThreshold: 2, + }, + identity: "self-node", + lumera: lumeraClient, + } + + watch, err := svc.auditWeightedWatchlist(context.Background(), []string{"target-b", "target-a", "target-c", "self-node"}) + if err != nil { + t.Fatalf("auditWeightedWatchlist error: %v", err) + } + if len(watch) != 1 || watch[0] != "target-a" { + t.Fatalf("unexpected watchlist: %v", watch) + } +} + +func TestListCascadeHealingTargetsUsesOnChainActions(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + lumeraClient := lumeraclient.NewMockClient(ctrl) + actionMod := actionmodule.NewMockModule(ctrl) + 
lumeraClient.EXPECT().Action().AnyTimes().Return(actionMod) + + metaA, err := proto.Marshal(&actiontypes.CascadeMetadata{RqIdsIds: []string{"rq-z", "rq-a"}}) + if err != nil { + t.Fatalf("marshal metaA: %v", err) + } + metaB, err := proto.Marshal(&actiontypes.CascadeMetadata{RqIdsIds: []string{"rq-m"}}) + if err != nil { + t.Fatalf("marshal metaB: %v", err) + } + + actionMod.EXPECT().ListActions(gomock.Any(), gomock.Any()).AnyTimes().DoAndReturn( + func(ctx context.Context, req *actiontypes.QueryListActionsRequest) (*actiontypes.QueryListActionsResponse, error) { + switch req.ActionState { + case actiontypes.ActionStateDone: + return &actiontypes.QueryListActionsResponse{ + Actions: []*actiontypes.Action{ + {ActionID: "action-2", Metadata: metaB, State: actiontypes.ActionStateDone}, + {ActionID: "action-1", Metadata: metaA, State: actiontypes.ActionStateDone}, + }, + }, nil + case actiontypes.ActionStateApproved: + return &actiontypes.QueryListActionsResponse{ + Actions: []*actiontypes.Action{ + // Duplicate action id across states should be deduped. 
+ {ActionID: "action-1", Metadata: metaA, State: actiontypes.ActionStateApproved}, + }, + }, nil + default: + return &actiontypes.QueryListActionsResponse{}, nil + } + }, + ) + + svc := &Service{lumera: lumeraClient} + targets, err := svc.listCascadeHealingTargets(context.Background()) + if err != nil { + t.Fatalf("listCascadeHealingTargets error: %v", err) + } + if len(targets) != 2 { + t.Fatalf("targets length=%d want=2", len(targets)) + } + if targets[0].ActionID != "action-1" || targets[0].FileKey != "rq-a" { + t.Fatalf("unexpected first target: %+v", targets[0]) + } + if targets[1].ActionID != "action-2" || targets[1].FileKey != "rq-m" { + t.Fatalf("unexpected second target: %+v", targets[1]) + } +} + +func newServiceForEventTests(t *testing.T, cfg Config) (*Service, queries.LocalStoreInterface, func()) { + t.Helper() + t.Setenv("HOME", t.TempDir()) + + store, err := queries.OpenHistoryDB() + if err != nil { + t.Fatalf("open history db: %v", err) + } + + ctrl := gomock.NewController(t) + lumeraClient := lumeraclient.NewMockClient(ctrl) + supernodeModule := lumerasn.NewMockModule(ctrl) + + lumeraClient.EXPECT().SuperNode().AnyTimes().Return(supernodeModule) + supernodeModule.EXPECT().GetSupernodeWithLatestAddress(gomock.Any(), gomock.Any()).AnyTimes().DoAndReturn( + func(ctx context.Context, address string) (*lumerasn.SuperNodeInfo, error) { + return &lumerasn.SuperNodeInfo{ + SupernodeAccount: address, + LatestAddress: "127.0.0.1:4444", + }, nil + }, + ) + + if cfg.EventLeaseDuration <= 0 { + cfg.EventLeaseDuration = 30 * time.Second + } + if cfg.EventRetryBase <= 0 { + cfg.EventRetryBase = 100 * time.Millisecond + } + if cfg.EventRetryMax <= 0 { + cfg.EventRetryMax = time.Second + } + if cfg.MaxEventsPerTick <= 0 { + cfg.MaxEventsPerTick = 16 + } + + svc := &Service{ + cfg: cfg, + identity: "challenger-1", + lumera: lumeraClient, + store: store, + } + + cleanup := func() { + store.CloseHistoryDB(context.Background()) + ctrl.Finish() + } + return svc, store, 
cleanup +} + +func insertEvent(t *testing.T, store queries.LocalStoreInterface, challengeID string, payload challengeEventPayload) error { + t.Helper() + bz, err := json.Marshal(payload) + if err != nil { + return err + } + now := time.Now().UTC().Unix() + return store.BatchInsertSelfHealingChallengeEvents(context.Background(), []types.SelfHealingChallengeEvent{ + { + TriggerID: "window:test", + TicketID: payload.FileKey, + ChallengeID: challengeID, + Data: bz, + SenderID: "challenger-1", + ExecMetric: types.SelfHealingExecutionMetric{ + TriggerID: "window:test", + ChallengeID: challengeID, + MessageType: int(types.SelfHealingChallengeMessage), + Data: []byte(`{"event":"challenge"}`), + SenderID: "challenger-1", + CreatedAt: time.Unix(now, 0).UTC(), + UpdatedAt: time.Unix(now, 0).UTC(), + }, + }, + }) +} + +type stubAuditModule struct { + paramsResp *audittypes.QueryParamsResponse + currentEpochResp *audittypes.QueryCurrentEpochResponse + reportsByTarget map[string][]audittypes.StorageChallengeReport +} + +func (s *stubAuditModule) GetParams(ctx context.Context) (*audittypes.QueryParamsResponse, error) { + if s.paramsResp == nil { + return nil, errors.New("params unavailable") + } + return s.paramsResp, nil +} + +func (s *stubAuditModule) GetCurrentEpoch(ctx context.Context) (*audittypes.QueryCurrentEpochResponse, error) { + if s.currentEpochResp == nil { + return nil, errors.New("epoch unavailable") + } + return s.currentEpochResp, nil +} + +func (s *stubAuditModule) GetEpochAnchor(ctx context.Context, epochID uint64) (*audittypes.QueryEpochAnchorResponse, error) { + return &audittypes.QueryEpochAnchorResponse{}, nil +} + +func (s *stubAuditModule) GetCurrentEpochAnchor(ctx context.Context) (*audittypes.QueryCurrentEpochAnchorResponse, error) { + return &audittypes.QueryCurrentEpochAnchorResponse{}, nil +} + +func (s *stubAuditModule) GetAssignedTargets(ctx context.Context, supernodeAccount string, epochID uint64) (*audittypes.QueryAssignedTargetsResponse, error) 
{ + return &audittypes.QueryAssignedTargetsResponse{}, nil +} + +func (s *stubAuditModule) GetEpochReport(ctx context.Context, epochID uint64, supernodeAccount string) (*audittypes.QueryEpochReportResponse, error) { + return &audittypes.QueryEpochReportResponse{}, nil +} + +func (s *stubAuditModule) GetStorageChallengeReports(ctx context.Context, supernodeAccount string, epochID uint64) (*audittypes.QueryStorageChallengeReportsResponse, error) { + reports := s.reportsByTarget[supernodeAccount] + return &audittypes.QueryStorageChallengeReportsResponse{Reports: reports}, nil +} diff --git a/supernode/transport/grpc/self_healing/handler.go b/supernode/transport/grpc/self_healing/handler.go new file mode 100644 index 00000000..b974269f --- /dev/null +++ b/supernode/transport/grpc/self_healing/handler.go @@ -0,0 +1,680 @@ +package self_healing + +import ( + "context" + "encoding/base64" + "encoding/hex" + "encoding/json" + "fmt" + "strings" + "sync" + "time" + + actiontypes "github.com/LumeraProtocol/lumera/x/action/v1/types" + "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/p2p" + "github.com/LumeraProtocol/supernode/v2/pkg/cascadekit" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" + "github.com/LumeraProtocol/supernode/v2/pkg/storagechallenge/deterministic" + "github.com/LumeraProtocol/supernode/v2/pkg/types" + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" + query "github.com/cosmos/cosmos-sdk/types/query" + "golang.org/x/sync/singleflight" + "lukechampine.com/blake3" +) + +const epochSkewTolerance = uint64(2) +const defaultActionPageLimit = 200 +const defaultActionIndexTTL = 5 * time.Minute + +type Server struct { + supernode.UnimplementedSelfHealingServiceServer + + identity string + p2p p2p.Client + lumera lumera.Client + store queries.LocalStoreInterface + cascadeFactory cascadeService.CascadeServiceFactory + + 
actionIndexMu sync.RWMutex + actionIndexByFile map[string]string + actionIndexLoadedAt time.Time + actionIndexTTL time.Duration + actionIndexRefresh sync.Mutex + reseedInFlight singleflight.Group +} + +func NewServer(identity string, p2pClient p2p.Client, lumeraClient lumera.Client, store queries.LocalStoreInterface, cascadeFactory ...cascadeService.CascadeServiceFactory) *Server { + var factory cascadeService.CascadeServiceFactory + if len(cascadeFactory) > 0 { + factory = cascadeFactory[0] + } + return &Server{ + identity: identity, + p2p: p2pClient, + lumera: lumeraClient, + store: store, + cascadeFactory: factory, + actionIndexByFile: make(map[string]string), + actionIndexTTL: defaultActionIndexTTL, + } +} + +func (s *Server) RequestSelfHealing(ctx context.Context, req *supernode.RequestSelfHealingRequest) (*supernode.RequestSelfHealingResponse, error) { + if req == nil { + return nil, fmt.Errorf("nil request") + } + if strings.TrimSpace(req.ChallengeId) == "" || strings.TrimSpace(req.FileKey) == "" { + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "challenge_id and file_key are required", + }, nil + } + if rid := strings.TrimSpace(req.RecipientId); rid != "" && rid != s.identity { + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "recipient mismatch", + }, nil + } + if req.EpochId > 0 && s.isStaleEpoch(ctx, req.EpochId) { + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "stale epoch", + }, nil + } + + actionID, expectedHashHex, aerr := s.resolveActionAndExpectedHash(ctx, req.ActionId, req.FileKey) + if aerr != nil { + s.persistExecution(req.ChallengeId, int(types.SelfHealingResponseMessage), map[string]any{ + "event": "response", + 
"accepted": false, + "reason": "action_context_resolution_failed", + "error": aerr.Error(), + }) + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "action resolution failed", + }, nil + } + + hashHex := "" + reconstructionRequired := true + data, err := s.p2p.Retrieve(ctx, req.FileKey, true) + if err == nil && len(data) > 0 { + sum := blake3.Sum256(data) + localHashHex := hex.EncodeToString(sum[:]) + if strings.EqualFold(localHashHex, expectedHashHex) { + hashHex = strings.ToLower(localHashHex) + reconstructionRequired = false + } + } + if reconstructionRequired { + reseedRes, rerr := s.runRecoveryReseed(ctx, actionID, false) + if rerr != nil { + s.persistExecution(req.ChallengeId, int(types.SelfHealingResponseMessage), map[string]any{ + "event": "response", + "accepted": false, + "reason": "reconstruction_failed", + "action_id": actionID, + "error": rerr.Error(), + }) + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "reconstruction failed", + }, nil + } + if reseedRes != nil { + hashHex = strings.ToLower(strings.TrimSpace(reseedRes.ReconstructedHashHex)) + } + if hashHex == "" { + s.persistExecution(req.ChallengeId, int(types.SelfHealingResponseMessage), map[string]any{ + "event": "response", + "accepted": false, + "reason": "reconstructed_hash_missing", + "action_id": actionID, + }) + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "reconstructed hash missing", + }, nil + } + reconstructionRequired = true + } + if !strings.EqualFold(hashHex, expectedHashHex) { + s.persistExecution(req.ChallengeId, int(types.SelfHealingResponseMessage), map[string]any{ + "event": "response", + "accepted": false, + "reason": 
"reconstructed_hash_mismatch_action", + "action_id": actionID, + "expected_hash_hex": expectedHashHex, + "reconstructed_hash": hashHex, + }) + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: false, + Error: "reconstructed hash mismatch action hash", + }, nil + } + s.persistExecution(req.ChallengeId, int(types.SelfHealingResponseMessage), map[string]any{"event": "response", "accepted": true, "reconstruction_required": reconstructionRequired, "reconstructed_hash_hex": hashHex}) + + return &supernode.RequestSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Accepted: true, + ReconstructionRequired: reconstructionRequired, + ReconstructedHashHex: hashHex, + }, nil +} + +func (s *Server) VerifySelfHealing(ctx context.Context, req *supernode.VerifySelfHealingRequest) (*supernode.VerifySelfHealingResponse, error) { + if req == nil { + return nil, fmt.Errorf("nil request") + } + if strings.TrimSpace(req.ChallengeId) == "" || strings.TrimSpace(req.FileKey) == "" || strings.TrimSpace(req.ReconstructedHashHex) == "" { + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "challenge_id, file_key and reconstructed_hash_hex are required", + }, nil + } + if oid := strings.TrimSpace(req.ObserverId); oid != "" && oid != s.identity { + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "observer mismatch", + }, nil + } + if req.EpochId > 0 && s.isStaleEpoch(ctx, req.EpochId) { + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "stale epoch", + }, nil + } + + actionID, expectedHashHex, aerr := s.resolveActionAndExpectedHash(ctx, req.ActionId, 
req.FileKey) + if aerr != nil { + s.persistExecution(req.ChallengeId, int(types.SelfHealingVerificationMessage), map[string]any{ + "event": "verification", + "ok": false, + "reason": "action_context_resolution_failed", + "error": aerr.Error(), + "file_key": req.FileKey, + }) + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "observer action resolution failed", + }, nil + } + if !strings.EqualFold(req.ReconstructedHashHex, expectedHashHex) { + s.persistExecution(req.ChallengeId, int(types.SelfHealingVerificationMessage), map[string]any{ + "event": "verification", + "ok": false, + "reason": "recipient_hash_mismatch_action", + "expected_hash_hex": expectedHashHex, + "recipient_hash_hex": strings.ToLower(strings.TrimSpace(req.ReconstructedHashHex)), + "action_id": actionID, + }) + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "reconstructed hash does not match action hash", + }, nil + } + + got := "" + needReconstruct := true + data, err := s.p2p.Retrieve(ctx, req.FileKey, true) + if err == nil && len(data) > 0 { + sum := blake3.Sum256(data) + localHashHex := hex.EncodeToString(sum[:]) + if strings.EqualFold(localHashHex, expectedHashHex) { + got = strings.ToLower(localHashHex) + needReconstruct = false + } + } + if needReconstruct { + reseedRes, rerr := s.runRecoveryReseed(ctx, actionID, false) + if rerr != nil { + s.persistExecution(req.ChallengeId, int(types.SelfHealingVerificationMessage), map[string]any{ + "event": "verification", + "ok": false, + "reason": "observer_reconstruction_failed", + "action_id": actionID, + "error": rerr.Error(), + }) + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "observer reconstruction failed", + }, nil + } + if reseedRes != nil { + 
got = strings.ToLower(strings.TrimSpace(reseedRes.ReconstructedHashHex)) + } + if got == "" { + s.persistExecution(req.ChallengeId, int(types.SelfHealingVerificationMessage), map[string]any{ + "event": "verification", + "ok": false, + "reason": "observer_reconstructed_hash_missing", + "action_id": actionID, + }) + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: false, + Error: "observer reconstructed hash missing", + }, nil + } + } + + ok := strings.EqualFold(got, req.ReconstructedHashHex) && strings.EqualFold(got, expectedHashHex) + errMsg := "" + if !ok { + errMsg = "reconstructed hash mismatch" + } + s.persistExecution(req.ChallengeId, int(types.SelfHealingVerificationMessage), map[string]any{ + "event": "verification", + "ok": ok, + "expected": strings.ToLower(req.ReconstructedHashHex), + "expected_action": expectedHashHex, + "got": got, + "used_reconstruction": needReconstruct, + }) + + return &supernode.VerifySelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + ObserverId: s.identity, + Ok: ok, + Error: errMsg, + }, nil +} + +func (s *Server) CommitSelfHealing(ctx context.Context, req *supernode.CommitSelfHealingRequest) (*supernode.CommitSelfHealingResponse, error) { + if req == nil { + return nil, fmt.Errorf("nil request") + } + if strings.TrimSpace(req.ChallengeId) == "" || strings.TrimSpace(req.FileKey) == "" { + return &supernode.CommitSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Stored: false, + Error: "challenge_id and file_key are required", + }, nil + } + if rid := strings.TrimSpace(req.RecipientId); rid != "" && rid != s.identity { + return &supernode.CommitSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Stored: false, + Error: "recipient mismatch", + }, nil + } + if req.EpochId > 0 && s.isStaleEpoch(ctx, req.EpochId) { + 
return &supernode.CommitSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Stored: false, + Error: "stale epoch", + }, nil + } + + actionID, err := s.resolveActionID(ctx, req.ActionId, req.FileKey) + if err != nil { + s.persistExecution(req.ChallengeId, int(types.SelfHealingCompletionMessage), map[string]any{ + "event": "commit", + "stored": false, + "reason": "action_resolution_failed", + "error": err.Error(), + }) + return &supernode.CommitSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Stored: false, + Error: "action resolution failed", + }, nil + } + if _, err := s.runRecoveryReseed(ctx, actionID, true); err != nil { + s.persistExecution(req.ChallengeId, int(types.SelfHealingCompletionMessage), map[string]any{ + "event": "commit", + "stored": false, + "reason": "store_failed", + "action_id": actionID, + "error": err.Error(), + }) + return &supernode.CommitSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Stored: false, + Error: "artifact store failed", + }, nil + } + + s.persistExecution(req.ChallengeId, int(types.SelfHealingCompletionMessage), map[string]any{ + "event": "commit", + "stored": true, + "action_id": actionID, + }) + return &supernode.CommitSelfHealingResponse{ + ChallengeId: req.ChallengeId, + EpochId: req.EpochId, + RecipientId: s.identity, + Stored: true, + }, nil +} + +func (s *Server) persistExecution(challengeID string, msgType int, payload map[string]any) { + if s.store == nil { + return + } + bz, err := json.Marshal(payload) + if err != nil { + return + } + now := time.Now().UTC() + _ = s.store.InsertSelfHealingExecutionMetrics(types.SelfHealingExecutionMetric{ + TriggerID: challengeID, + ChallengeID: challengeID, + MessageType: msgType, + Data: bz, + SenderID: s.identity, + SenderSignature: []byte{}, + CreatedAt: now, + UpdatedAt: now, + }) +} + +func (s *Server) 
isStaleEpoch(ctx context.Context, reqEpoch uint64) bool { + if s.lumera == nil || s.lumera.Node() == nil || s.lumera.Audit() == nil { + return false + } + latest, err := s.lumera.Node().GetLatestBlock(ctx) + if err != nil || latest == nil { + return false + } + var height int64 + if sdkBlk := latest.GetSdkBlock(); sdkBlk != nil { + height = sdkBlk.Header.Height + } else if blk := latest.GetBlock(); blk != nil { + height = blk.Header.Height + } + if height <= 0 { + return false + } + paramsResp, err := s.lumera.Audit().GetParams(ctx) + if err != nil || paramsResp == nil { + return false + } + p := paramsResp.Params.WithDefaults() + if err := p.Validate(); err != nil { + return false + } + current, ok := deterministic.EpochID(height, p.EpochZeroHeight, p.EpochLengthBlocks) + if !ok { + return false + } + if reqEpoch > current+epochSkewTolerance { + return true + } + if current > reqEpoch+epochSkewTolerance { + return true + } + return false +} + +func (s *Server) resolveActionIDByFileKey(ctx context.Context, fileKey string) (string, error) { + key := strings.TrimSpace(fileKey) + if key == "" { + return "", fmt.Errorf("empty file key") + } + + if actionID, ok := s.getActionIDFromIndex(key); ok { + return actionID, nil + } + + if err := s.refreshActionIndexIfNeeded(ctx, false); err != nil { + return "", err + } + if actionID, ok := s.getActionIDFromIndex(key); ok { + return actionID, nil + } + + // Force one fresh on-chain pull to avoid stale-cache misses. 
+ if err := s.refreshActionIndexIfNeeded(ctx, true); err != nil { + return "", err + } + if actionID, ok := s.getActionIDFromIndex(key); ok { + return actionID, nil + } + + return "", fmt.Errorf("cascade action not found for key") +} + +func (s *Server) resolveActionAndExpectedHash(ctx context.Context, actionID string, fileKey string) (string, string, error) { + resolvedActionID, err := s.resolveActionID(ctx, actionID, fileKey) + if err != nil { + return "", "", err + } + expectedHashHex, err := s.resolveActionDataHashHex(ctx, resolvedActionID) + if err != nil { + return "", "", err + } + return resolvedActionID, expectedHashHex, nil +} + +func (s *Server) resolveActionDataHashHex(ctx context.Context, actionID string) (string, error) { + actionID = strings.TrimSpace(actionID) + if actionID == "" { + return "", fmt.Errorf("missing action_id") + } + if s.lumera == nil || s.lumera.Action() == nil { + return "", fmt.Errorf("action module unavailable") + } + resp, err := s.lumera.Action().GetAction(ctx, actionID) + if err != nil { + return "", fmt.Errorf("get action: %w", err) + } + if resp == nil || resp.Action == nil || len(resp.Action.Metadata) == 0 { + return "", fmt.Errorf("cascade action metadata unavailable") + } + meta, err := cascadekit.UnmarshalCascadeMetadata(resp.Action.Metadata) + if err != nil { + return "", err + } + dataHashB64 := strings.TrimSpace(meta.DataHash) + if dataHashB64 == "" { + return "", fmt.Errorf("action metadata data hash missing") + } + raw, err := base64.StdEncoding.DecodeString(dataHashB64) + if err != nil { + return "", fmt.Errorf("decode action data hash: %w", err) + } + if len(raw) == 0 { + return "", fmt.Errorf("action data hash empty") + } + return strings.ToLower(hex.EncodeToString(raw)), nil +} + +func (s *Server) resolveActionID(ctx context.Context, actionID string, fileKey string) (string, error) { + actionID = strings.TrimSpace(actionID) + if actionID != "" { + return actionID, nil + } + return s.resolveActionIDByFileKey(ctx, 
fileKey) +} + +func (s *Server) getActionIDFromIndex(fileKey string) (string, bool) { + s.actionIndexMu.RLock() + defer s.actionIndexMu.RUnlock() + actionID, ok := s.actionIndexByFile[fileKey] + return actionID, ok +} + +func (s *Server) refreshActionIndexIfNeeded(ctx context.Context, force bool) error { + s.actionIndexRefresh.Lock() + defer s.actionIndexRefresh.Unlock() + + if !force { + s.actionIndexMu.RLock() + fresh := time.Since(s.actionIndexLoadedAt) <= s.actionIndexTTL && len(s.actionIndexByFile) > 0 + s.actionIndexMu.RUnlock() + if fresh { + return nil + } + } + + if s.lumera == nil || s.lumera.Action() == nil { + return fmt.Errorf("action module unavailable") + } + + states := []actiontypes.ActionState{ + actiontypes.ActionStateDone, + actiontypes.ActionStateApproved, + } + newIndex := make(map[string]string) + for _, state := range states { + var nextKey []byte + for { + resp, err := s.lumera.Action().ListActions(ctx, &actiontypes.QueryListActionsRequest{ + ActionType: actiontypes.ActionTypeCascade, + ActionState: state, + Pagination: &query.PageRequest{ + Key: nextKey, + Limit: defaultActionPageLimit, + }, + }) + if err != nil { + return fmt.Errorf("list cascade actions (state=%s): %w", state.String(), err) + } + if resp == nil { + break + } + for _, action := range resp.Actions { + if action == nil { + continue + } + actionID := strings.TrimSpace(action.ActionID) + if actionID == "" || len(action.Metadata) == 0 { + continue + } + meta, err := cascadekit.UnmarshalCascadeMetadata(action.Metadata) + if err != nil { + continue + } + anchor := pickActionAnchorKey(meta.RqIdsIds) + if anchor == "" { + continue + } + newIndex[anchor] = actionID + } + if resp.Pagination == nil || len(resp.Pagination.NextKey) == 0 { + break + } + nextKey = append(nextKey[:0], resp.Pagination.NextKey...) 
+ } + } + + s.actionIndexMu.Lock() + s.actionIndexByFile = newIndex + s.actionIndexLoadedAt = time.Now().UTC() + s.actionIndexMu.Unlock() + return nil +} + +func (s *Server) runRecoveryReseed(ctx context.Context, actionID string, persistArtifacts bool) (*cascadeService.RecoveryReseedResult, error) { + actionID = strings.TrimSpace(actionID) + if actionID == "" { + return nil, fmt.Errorf("missing action_id") + } + callKey := fmt.Sprintf("%s:%t", actionID, persistArtifacts) + result, err, _ := s.reseedInFlight.Do(callKey, func() (any, error) { + if s.cascadeFactory == nil { + return nil, fmt.Errorf("recovery reseed unavailable") + } + task := s.cascadeFactory.NewCascadeRegistrationTask() + if task == nil { + return nil, fmt.Errorf("failed to build cascade task") + } + recoveryTask, ok := any(task).(interface { + RecoveryReseed(context.Context, *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) + }) + if !ok { + return nil, fmt.Errorf("cascade task does not support recovery reseed") + } + reseedRes, err := recoveryTask.RecoveryReseed(ctx, &cascadeService.RecoveryReseedRequest{ + ActionID: actionID, + PersistArtifacts: boolPtr(persistArtifacts), + }) + if err != nil { + return nil, err + } + return reseedRes, nil + }) + if err != nil { + return nil, err + } + if typed, ok := result.(*cascadeService.RecoveryReseedResult); ok { + return typed, nil + } + return nil, fmt.Errorf("unexpected recovery result type") +} + +func boolPtr(v bool) *bool { + return &v +} + +func pickActionAnchorKey(keys []string) string { + anchor := "" + for _, raw := range keys { + key := strings.TrimSpace(raw) + if key == "" { + continue + } + if anchor == "" || key < anchor { + anchor = key + } + } + return anchor +} diff --git a/supernode/transport/grpc/self_healing/handler_e2e_test.go b/supernode/transport/grpc/self_healing/handler_e2e_test.go new file mode 100644 index 00000000..b6497e89 --- /dev/null +++ 
b/supernode/transport/grpc/self_healing/handler_e2e_test.go @@ -0,0 +1,482 @@ +package self_healing + +import ( + "context" + "encoding/base64" + "encoding/hex" + "fmt" + "net" + "testing" + "time" + + actiontypes "github.com/LumeraProtocol/lumera/x/action/v1/types" + "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/p2p" + lumeraclient "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + actionmod "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/action" + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" + "github.com/golang/protobuf/proto" + "go.uber.org/mock/gomock" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" + "lukechampine.com/blake3" +) + +type fakeP2P struct { + local map[string][]byte + network map[string][]byte +} + +func newFakeP2P() *fakeP2P { + return &fakeP2P{local: map[string][]byte{}, network: map[string][]byte{}} +} + +func (f *fakeP2P) Retrieve(ctx context.Context, key string, localOnly ...bool) ([]byte, error) { + if len(localOnly) > 0 && localOnly[0] { + if v, ok := f.local[key]; ok { + return append([]byte(nil), v...), nil + } + return nil, nil + } + if v, ok := f.local[key]; ok { + return append([]byte(nil), v...), nil + } + if v, ok := f.network[key]; ok { + return append([]byte(nil), v...), nil + } + return nil, nil +} + +func (f *fakeP2P) BatchRetrieve(ctx context.Context, keys []string, reqCount int, txID string, localOnly ...bool) (map[string][]byte, error) { + out := make(map[string][]byte) + for _, k := range keys { + if v, _ := f.Retrieve(ctx, k, localOnly...); len(v) > 0 { + out[k] = v + } + } + return out, nil +} + +func (f *fakeP2P) BatchRetrieveStream(ctx context.Context, keys []string, required int32, txID string, onSymbol func(base58Key string, data []byte) error, localOnly ...bool) (int32, error) { + var n int32 + for _, k := range keys { + v, _ := f.Retrieve(ctx, k, 
localOnly...) + if len(v) == 0 { + continue + } + if err := onSymbol(k, v); err != nil { + return n, err + } + n++ + } + return n, nil +} + +func (f *fakeP2P) Store(ctx context.Context, data []byte, typ int) (string, error) { return "", nil } +func (f *fakeP2P) StoreBatch(ctx context.Context, values [][]byte, typ int, taskID string) error { + return nil +} +func (f *fakeP2P) Delete(ctx context.Context, key string) error { return nil } +func (f *fakeP2P) Stats(ctx context.Context) (*p2p.StatsSnapshot, error) { return nil, nil } +func (f *fakeP2P) NClosestNodes(ctx context.Context, n int, key string, ignores ...string) []string { + return nil +} +func (f *fakeP2P) NClosestNodesWithIncludingNodeList(ctx context.Context, n int, key string, ignores, nodesToInclude []string) []string { + return nil +} +func (f *fakeP2P) LocalStore(ctx context.Context, key string, data []byte) (string, error) { + f.local[key] = append([]byte(nil), data...) + return key, nil +} +func (f *fakeP2P) DisableKey(ctx context.Context, b58EncodedHash string) error { return nil } +func (f *fakeP2P) EnableKey(ctx context.Context, b58EncodedHash string) error { return nil } +func (f *fakeP2P) GetLocalKeys(ctx context.Context, from *time.Time, to time.Time) ([]string, error) { + out := make([]string, 0, len(f.local)) + for k := range f.local { + out = append(out, k) + } + return out, nil +} + +type fakeCascadeTask struct { + recoveryFn func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) +} + +func (f *fakeCascadeTask) Register(ctx context.Context, req *cascadeService.RegisterRequest, send func(resp *cascadeService.RegisterResponse) error) error { + return nil +} + +func (f *fakeCascadeTask) Download(ctx context.Context, req *cascadeService.DownloadRequest, send func(resp *cascadeService.DownloadResponse) error) error { + return nil +} + +func (f *fakeCascadeTask) CleanupDownload(ctx context.Context, tmpDir string) error { + return nil +} + 
+func (f *fakeCascadeTask) RecoveryReseed(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + if f.recoveryFn != nil { + return f.recoveryFn(ctx, req) + } + return &cascadeService.RecoveryReseedResult{ActionID: req.ActionID}, nil +} + +type fakeCascadeFactory struct { + task *fakeCascadeTask +} + +func (f *fakeCascadeFactory) NewCascadeRegistrationTask() cascadeService.CascadeTask { + return f.task +} + +func mockLumeraActionLookup(t *testing.T, fileKey, actionID, dataHashHex string) (lumeraclient.Client, func()) { + t.Helper() + ctrl := gomock.NewController(t) + lumeraClient := lumeraclient.NewMockClient(ctrl) + actionModule := actionmod.NewMockModule(ctrl) + + metaPayload := &actiontypes.CascadeMetadata{ + RqIdsIds: []string{fileKey}, + } + if dataHashHex != "" { + raw, derr := hex.DecodeString(dataHashHex) + if derr != nil { + t.Fatalf("decode data hash hex: %v", derr) + } + metaPayload.DataHash = base64.StdEncoding.EncodeToString(raw) + } + meta, err := proto.Marshal(metaPayload) + if err != nil { + t.Fatalf("marshal cascade metadata: %v", err) + } + + lumeraClient.EXPECT().Action().AnyTimes().Return(actionModule) + lumeraClient.EXPECT().Node().AnyTimes().Return(nil) + lumeraClient.EXPECT().Audit().AnyTimes().Return(nil) + actionModule.EXPECT().GetAction(gomock.Any(), actionID).AnyTimes().Return(&actiontypes.QueryGetActionResponse{ + Action: &actiontypes.Action{ + ActionID: actionID, + Metadata: meta, + State: actiontypes.ActionStateDone, + }, + }, nil) + actionModule.EXPECT().ListActions(gomock.Any(), gomock.Any()).AnyTimes().DoAndReturn( + func(ctx context.Context, req *actiontypes.QueryListActionsRequest) (*actiontypes.QueryListActionsResponse, error) { + if req == nil { + return nil, fmt.Errorf("nil request") + } + return &actiontypes.QueryListActionsResponse{ + Actions: []*actiontypes.Action{ + { + ActionID: actionID, + Metadata: meta, + State: actiontypes.ActionStateDone, + }, + }, + }, nil + 
}, + ) + + return lumeraClient, ctrl.Finish +} + +func startSelfHealingTestServer(t *testing.T, identity string, p2p *fakeP2P, lumeraClient lumeraclient.Client, cascadeFactory cascadeService.CascadeServiceFactory) (*grpc.ClientConn, func()) { + t.Helper() + lis := bufconn.Listen(1024 * 1024) + s := grpc.NewServer() + supernode.RegisterSelfHealingServiceServer(s, NewServer(identity, p2p, lumeraClient, nil, cascadeFactory)) + go func() { _ = s.Serve(lis) }() + + dialer := func(context.Context, string) (net.Conn, error) { return lis.Dial() } + conn, err := grpc.DialContext(context.Background(), "bufnet", grpc.WithContextDialer(dialer), grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + t.Fatalf("dial: %v", err) + } + cleanup := func() { + _ = conn.Close() + s.Stop() + _ = lis.Close() + } + return conn, cleanup +} + +func TestSelfHealingE2E_RequestThenVerify(t *testing.T) { + const fileKey = "key-1" + const actionID = "action-1" + payload := []byte("hello-self-healing") + payloadHashHex := blake3Hex(payload) + + recipientP2P := newFakeP2P() + recipientP2P.network[fileKey] = payload + + observerP2P := newFakeP2P() + observerP2P.local[fileKey] = payload // observer has local authoritative copy + + lumeraClient, lumeraCleanup := mockLumeraActionLookup(t, fileKey, actionID, payloadHashHex) + defer lumeraCleanup() + factory := &fakeCascadeFactory{ + task: &fakeCascadeTask{ + recoveryFn: func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + if req == nil || req.ActionID != actionID { + return nil, fmt.Errorf("unexpected recovery action_id") + } + if req.PersistArtifacts != nil && !*req.PersistArtifacts { + return &cascadeService.RecoveryReseedResult{ + ActionID: actionID, + ReconstructedHashHex: payloadHashHex, + }, nil + } + recipientP2P.local[fileKey] = append([]byte(nil), payload...) 
+ return &cascadeService.RecoveryReseedResult{ + ActionID: actionID, + ReconstructedHashHex: payloadHashHex, + }, nil + }, + }, + } + + recConn, recCleanup := startSelfHealingTestServer(t, "recipient-1", recipientP2P, lumeraClient, factory) + defer recCleanup() + obsConn, obsCleanup := startSelfHealingTestServer(t, "observer-1", observerP2P, lumeraClient, nil) + defer obsCleanup() + + recClient := supernode.NewSelfHealingServiceClient(recConn) + obsClient := supernode.NewSelfHealingServiceClient(obsConn) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + resp, err := recClient.RequestSelfHealing(ctx, &supernode.RequestSelfHealingRequest{ + ChallengeId: "ch-1", + EpochId: 12, + FileKey: fileKey, + ChallengerId: "challenger-1", + RecipientId: "recipient-1", + ObserverIds: []string{"observer-1"}, + ActionId: actionID, + }) + if err != nil { + t.Fatalf("request self-healing: %v", err) + } + if !resp.Accepted { + t.Fatalf("expected accepted=true, got false err=%s", resp.Error) + } + if !resp.ReconstructionRequired { + t.Fatalf("expected reconstruction_required=true") + } + if got := recipientP2P.local[fileKey]; len(got) > 0 { + t.Fatalf("expected no recipient local store before commit") + } + + ver, err := obsClient.VerifySelfHealing(ctx, &supernode.VerifySelfHealingRequest{ + ChallengeId: "ch-1", + EpochId: 12, + FileKey: fileKey, + RecipientId: "recipient-1", + ReconstructedHashHex: resp.ReconstructedHashHex, + ObserverId: "observer-1", + ActionId: actionID, + }) + if err != nil { + t.Fatalf("verify self-healing: %v", err) + } + if !ver.Ok { + t.Fatalf("expected verify ok=true, got false err=%s", ver.Error) + } + commitResp, err := recClient.CommitSelfHealing(ctx, &supernode.CommitSelfHealingRequest{ + ChallengeId: "ch-1", + EpochId: 12, + FileKey: fileKey, + ActionId: actionID, + ChallengerId: "challenger-1", + RecipientId: "recipient-1", + }) + if err != nil { + t.Fatalf("commit self-healing: %v", err) + } + if 
!commitResp.Stored { + t.Fatalf("expected commit stored=true, got false err=%s", commitResp.Error) + } + if got := recipientP2P.local[fileKey]; string(got) != string(payload) { + t.Fatalf("recipient local store not repaired after commit") + } +} + +func TestSelfHealingE2E_RequestFileNotRetrievable(t *testing.T) { + const fileKey = "missing-key" + + recipientP2P := newFakeP2P() // file is absent both locally and on network + recConn, recCleanup := startSelfHealingTestServer(t, "recipient-1", recipientP2P, nil, nil) + defer recCleanup() + + recClient := supernode.NewSelfHealingServiceClient(recConn) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + resp, err := recClient.RequestSelfHealing(ctx, &supernode.RequestSelfHealingRequest{ + ChallengeId: "ch-missing", + EpochId: 12, + FileKey: fileKey, + ChallengerId: "challenger-1", + RecipientId: "recipient-1", + ObserverIds: []string{"observer-1"}, + }) + if err != nil { + t.Fatalf("request self-healing: %v", err) + } + if resp.Accepted { + t.Fatalf("expected accepted=false for non-retrievable file") + } +} + +func TestSelfHealingE2E_RequestRecipientMismatch(t *testing.T) { + recipientP2P := newFakeP2P() + recConn, recCleanup := startSelfHealingTestServer(t, "recipient-1", recipientP2P, nil, nil) + defer recCleanup() + + recClient := supernode.NewSelfHealingServiceClient(recConn) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + resp, err := recClient.RequestSelfHealing(ctx, &supernode.RequestSelfHealingRequest{ + ChallengeId: "ch-recipient-mismatch", + EpochId: 12, + FileKey: "key-1", + ChallengerId: "challenger-1", + RecipientId: "recipient-2", + ObserverIds: []string{"observer-1"}, + }) + if err != nil { + t.Fatalf("request self-healing: %v", err) + } + if resp.Accepted { + t.Fatalf("expected accepted=false for recipient mismatch") + } +} + +func TestSelfHealingE2E_VerifyHashMismatch(t *testing.T) { + const fileKey = 
"key-verify-mismatch" + payload := []byte("hello-self-healing") + + observerP2P := newFakeP2P() + observerP2P.local[fileKey] = payload + + obsConn, obsCleanup := startSelfHealingTestServer(t, "observer-1", observerP2P, nil, nil) + defer obsCleanup() + + obsClient := supernode.NewSelfHealingServiceClient(obsConn) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + ver, err := obsClient.VerifySelfHealing(ctx, &supernode.VerifySelfHealingRequest{ + ChallengeId: "ch-mismatch", + EpochId: 12, + FileKey: fileKey, + RecipientId: "recipient-1", + ReconstructedHashHex: "deadbeef", + ObserverId: "observer-1", + }) + if err != nil { + t.Fatalf("verify self-healing: %v", err) + } + if ver.Ok { + t.Fatalf("expected verify ok=false on hash mismatch") + } +} + +func TestSelfHealingE2E_VerifyFallbackReconstructWithoutPersist(t *testing.T) { + const fileKey = "key-verify-fallback" + const actionID = "action-verify-fallback" + payload := []byte("fallback-self-healing") + payloadHashHex := blake3Hex(payload) + + observerP2P := newFakeP2P() + lumeraClient, lumeraCleanup := mockLumeraActionLookup(t, fileKey, actionID, payloadHashHex) + defer lumeraCleanup() + + fallbackReconstructCalled := false + factory := &fakeCascadeFactory{ + task: &fakeCascadeTask{ + recoveryFn: func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + if req == nil || req.ActionID != actionID { + return nil, fmt.Errorf("unexpected recovery action_id") + } + if req.PersistArtifacts == nil || *req.PersistArtifacts { + return nil, fmt.Errorf("expected non-persist recovery path") + } + fallbackReconstructCalled = true + return &cascadeService.RecoveryReseedResult{ + ActionID: actionID, + ReconstructedHashHex: payloadHashHex, + }, nil + }, + }, + } + + obsConn, obsCleanup := startSelfHealingTestServer(t, "observer-1", observerP2P, lumeraClient, factory) + defer obsCleanup() + obsClient := 
supernode.NewSelfHealingServiceClient(obsConn) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + ver, err := obsClient.VerifySelfHealing(ctx, &supernode.VerifySelfHealingRequest{ + ChallengeId: "ch-fallback", + EpochId: 12, + FileKey: fileKey, + RecipientId: "recipient-1", + ReconstructedHashHex: payloadHashHex, + ObserverId: "observer-1", + ActionId: actionID, + }) + if err != nil { + t.Fatalf("verify self-healing: %v", err) + } + if !ver.Ok { + t.Fatalf("expected verify ok=true, got false err=%s", ver.Error) + } + if !fallbackReconstructCalled { + t.Fatalf("expected fallback reconstruction to be called") + } + if got := observerP2P.local[fileKey]; len(got) > 0 { + t.Fatalf("fallback verify must not persist local artifacts") + } +} + +func TestSelfHealingE2E_VerifyObserverMismatch(t *testing.T) { + const fileKey = "key-verify-observer-mismatch" + payload := []byte("hello-self-healing") + + observerP2P := newFakeP2P() + observerP2P.local[fileKey] = payload + + obsConn, obsCleanup := startSelfHealingTestServer(t, "observer-1", observerP2P, nil, nil) + defer obsCleanup() + + obsClient := supernode.NewSelfHealingServiceClient(obsConn) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + ver, err := obsClient.VerifySelfHealing(ctx, &supernode.VerifySelfHealingRequest{ + ChallengeId: "ch-observer-mismatch", + EpochId: 12, + FileKey: fileKey, + RecipientId: "recipient-1", + ReconstructedHashHex: "deadbeef", + ObserverId: "observer-2", + }) + if err != nil { + t.Fatalf("verify self-healing: %v", err) + } + if ver.Ok { + t.Fatalf("expected verify ok=false for observer mismatch") + } +} + +func blake3Hex(data []byte) string { + sum := blake3.Sum256(data) + return hex.EncodeToString(sum[:]) +} diff --git a/tests/system/README.md b/tests/system/README.md index 9212b7cf..7ed9df59 100644 --- a/tests/system/README.md +++ b/tests/system/README.md @@ -13,6 +13,9 @@ This suite brings up a 
local Lumera chain and three local supernodes, then runs The tests manage `INTEGRATION_TEST` internally so local loopback addresses (127.0.0.1/localhost) are accepted by the P2P layer during test runs. +By default, system tests also perform a hard pre/post runtime cleanup for stale `lumerad`/`supernode` listeners on known test ports to reduce flaky reruns. +Set `SYSTEM_TEST_HARD_CLEANUP=0` to disable this behavior. + ## Layout - `config.test-1.yml`, `config.test-2.yml`, `config.test-3.yml` — Supernode configs (hosts on 0.0.0.0, P2P ports 4445/4447/4449; gRPC 4444/4446/4448) @@ -20,4 +23,3 @@ The tests manage `INTEGRATION_TEST` internally so local loopback addresses (127. - `supernode-utils.go` — Helpers to start/stop the supernode processes for tests See `docs/TESTING.md` for deeper details and troubleshooting. - diff --git a/tests/system/config.test-1.yml b/tests/system/config.test-1.yml index 191c6b37..f6c95bda 100644 --- a/tests/system/config.test-1.yml +++ b/tests/system/config.test-1.yml @@ -18,6 +18,7 @@ keyring: p2p: port: 4445 data_dir: "data/p2p" # Relative to base_dir + bootstrap_nodes: "lumera1cf0ms9ttgdvz6zwlqfty4tjcawhuaq69p40w0c@localhost:4447,lumera1cjyc4ruq739e2lakuhargejjkr0q5vg6x3d7kp@localhost:4449" # Lumera Chain Configuration lumera: diff --git a/tests/system/config.test-2.yml b/tests/system/config.test-2.yml index 5cc934e3..248ed68d 100644 --- a/tests/system/config.test-2.yml +++ b/tests/system/config.test-2.yml @@ -19,6 +19,7 @@ keyring: p2p: port: 4447 data_dir: "data/p2p" + bootstrap_nodes: "lumera1em87kgrvgttrkvuamtetyaagjrhnu3vjy44at4@localhost:4445,lumera1cjyc4ruq739e2lakuhargejjkr0q5vg6x3d7kp@localhost:4449" # Lumera Chain Configuration lumera: diff --git a/tests/system/config.test-3.yml b/tests/system/config.test-3.yml index 06beaf9b..edb60dc2 100644 --- a/tests/system/config.test-3.yml +++ b/tests/system/config.test-3.yml @@ -19,6 +19,7 @@ keyring: p2p: port: 4449 data_dir: "data/p2p" + bootstrap_nodes: 
"lumera1em87kgrvgttrkvuamtetyaagjrhnu3vjy44at4@localhost:4445,lumera1cf0ms9ttgdvz6zwlqfty4tjcawhuaq69p40w0c@localhost:4447" # Lumera Chain Configuration lumera: diff --git a/tests/system/e2e_self_healing_test.go b/tests/system/e2e_self_healing_test.go new file mode 100644 index 00000000..4c7bbd62 --- /dev/null +++ b/tests/system/e2e_self_healing_test.go @@ -0,0 +1,610 @@ +package system + +import ( + "context" + "encoding/base64" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "testing" + "time" + + actiontypes "github.com/LumeraProtocol/lumera/x/action/v1/types" + "github.com/LumeraProtocol/lumera/x/lumeraid/securekeyx" + pb "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/pkg/cascadekit" + snkeyring "github.com/LumeraProtocol/supernode/v2/pkg/keyring" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/net/credentials" + grpcclient "github.com/LumeraProtocol/supernode/v2/pkg/net/grpc/client" + "github.com/LumeraProtocol/supernode/v2/sdk/action" + sdkconfig "github.com/LumeraProtocol/supernode/v2/sdk/config" + snconfig "github.com/LumeraProtocol/supernode/v2/supernode/config" + "github.com/cosmos/cosmos-sdk/crypto/keyring" + "github.com/stretchr/testify/require" + "github.com/tidwall/gjson" +) + +const ( + shNode0KeyName = "testkey1" + shNode1KeyName = "testkey2" + shNode2KeyName = "testkey3" + + shNode0Identity = "lumera1em87kgrvgttrkvuamtetyaagjrhnu3vjy44at4" + shNode1Identity = "lumera1cf0ms9ttgdvz6zwlqfty4tjcawhuaq69p40w0c" + shNode2Identity = "lumera1cjyc4ruq739e2lakuhargejjkr0q5vg6x3d7kp" + + shUserKeyName = "user-sh" + shUserMnemonic = "little tone alley oval festival gloom sting asthma crime select swap auto when trip luxury pact risk sister pencil about crisp upon opera timber" +) + +type selfHealingNode struct { + Identity string + GRPCAddr string +} + +type 
selfHealingRPCClient struct { + client *grpcclient.Client + opts *grpcclient.ClientOptions +} + +type selfHealingFixture struct { + cli *LumeradCli + userLumera lumera.Client + shClient *selfHealingRPCClient + nodes []selfHealingNode + recipient selfHealingNode + observer selfHealingNode + actionID string + fileKey string + registeredHashHex string + cmds []*exec.Cmd +} + +func TestSelfHealingE2EHappyPath(t *testing.T) { + fixture := setupSelfHealingFixture(t) + + ctx := context.Background() + challengeID := newSelfHealingChallengeID("sh-e2e-happy") + + req := buildSelfHealingRequest(challengeID, 0, fixture) + reqCtx, cancelReq := context.WithTimeout(ctx, 30*time.Second) + defer cancelReq() + + t.Logf("self-healing request start challenge_id=%s recipient=%s observers=%v", challengeID, fixture.recipient.Identity, req.ObserverIds) + reqResp, err := fixture.shClient.Request(reqCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, req) + require.NoError(t, err) + require.True(t, reqResp.Accepted, "self-healing request was rejected: %s", reqResp.Error) + require.NotEmpty(t, reqResp.ReconstructedHashHex) + t.Logf("self-healing request response accepted=%t reconstruction_required=%t reconstructed_hash=%s", reqResp.Accepted, reqResp.ReconstructionRequired, reqResp.ReconstructedHashHex) + require.True(t, strings.EqualFold(reqResp.ReconstructedHashHex, fixture.registeredHashHex), "recipient reconstructed hash mismatch: got=%s want=%s", reqResp.ReconstructedHashHex, fixture.registeredHashHex) + t.Logf("self-healing hash assertion passed reconstructed_hash=%s registered_action_hash=%s", reqResp.ReconstructedHashHex, fixture.registeredHashHex) + + verifyReq := buildSelfHealingVerifyRequest(challengeID, 0, reqResp.ReconstructedHashHex, fixture) + verifyCtx, cancelVerify := context.WithTimeout(ctx, 30*time.Second) + defer cancelVerify() + + t.Logf("self-healing verify start challenge_id=%s observer=%s recipient=%s", challengeID, fixture.observer.Identity, 
fixture.recipient.Identity) + verifyResp, err := fixture.shClient.Verify(verifyCtx, fixture.observer.Identity, fixture.observer.GRPCAddr, verifyReq) + require.NoError(t, err) + require.True(t, verifyResp.Ok, "observer verification failed: %s", verifyResp.Error) + t.Logf("self-healing verify response observer=%s ok=%t error=%q", verifyResp.ObserverId, verifyResp.Ok, verifyResp.Error) + + commitReq := buildSelfHealingCommitRequest(challengeID, 0, fixture) + commitCtx, cancelCommit := context.WithTimeout(ctx, 30*time.Second) + defer cancelCommit() + + t.Logf("self-healing commit start challenge_id=%s recipient=%s", challengeID, fixture.recipient.Identity) + commitResp, err := fixture.shClient.Commit(commitCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, commitReq) + require.NoError(t, err) + require.True(t, commitResp.Stored, "self-healing commit failed: %s", commitResp.Error) + t.Logf("self-healing commit response stored=%t error=%q", commitResp.Stored, commitResp.Error) +} + +func TestSelfHealingE2EFailureScenarios(t *testing.T) { + fixture := setupSelfHealingFixture(t) + ctx := context.Background() + + t.Run("ObserverRejectsTamperedHash", func(t *testing.T) { + challengeID := newSelfHealingChallengeID("sh-e2e-verify-mismatch") + req := buildSelfHealingRequest(challengeID, 0, fixture) + + reqCtx, cancelReq := context.WithTimeout(ctx, 30*time.Second) + defer cancelReq() + + reqResp, err := fixture.shClient.Request(reqCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, req) + require.NoError(t, err) + require.True(t, reqResp.Accepted, "self-healing request was rejected: %s", reqResp.Error) + require.NotEmpty(t, reqResp.ReconstructedHashHex) + + tampered := tamperHexChar(reqResp.ReconstructedHashHex) + verifyReq := buildSelfHealingVerifyRequest(challengeID, 0, tampered, fixture) + + verifyCtx, cancelVerify := context.WithTimeout(ctx, 30*time.Second) + defer cancelVerify() + + verifyResp, err := fixture.shClient.Verify(verifyCtx, 
fixture.observer.Identity, fixture.observer.GRPCAddr, verifyReq) + require.NoError(t, err) + require.False(t, verifyResp.Ok, "tampered hash must fail observer verification") + require.NotEmpty(t, verifyResp.Error) + t.Logf("self-healing tampered verify response observer=%s ok=%t error=%q", verifyResp.ObserverId, verifyResp.Ok, verifyResp.Error) + }) + + t.Run("StaleEpochRejected", func(t *testing.T) { + epochResp, err := fixture.userLumera.Audit().GetCurrentEpoch(ctx) + require.NoError(t, err) + require.NotNil(t, epochResp) + + staleEpoch := epochResp.GetEpochId() + 100 + challengeID := newSelfHealingChallengeID("sh-e2e-stale") + + req := buildSelfHealingRequest(challengeID, staleEpoch, fixture) + reqCtx, cancelReq := context.WithTimeout(ctx, 30*time.Second) + defer cancelReq() + + reqResp, err := fixture.shClient.Request(reqCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, req) + require.NoError(t, err) + require.False(t, reqResp.Accepted) + require.Contains(t, strings.ToLower(reqResp.Error), "stale epoch") + + verifyReq := buildSelfHealingVerifyRequest(challengeID, staleEpoch, fixture.registeredHashHex, fixture) + verifyCtx, cancelVerify := context.WithTimeout(ctx, 30*time.Second) + defer cancelVerify() + + verifyResp, err := fixture.shClient.Verify(verifyCtx, fixture.observer.Identity, fixture.observer.GRPCAddr, verifyReq) + require.NoError(t, err) + require.False(t, verifyResp.Ok) + require.Contains(t, strings.ToLower(verifyResp.Error), "stale epoch") + + commitReq := buildSelfHealingCommitRequest(challengeID, staleEpoch, fixture) + commitCtx, cancelCommit := context.WithTimeout(ctx, 30*time.Second) + defer cancelCommit() + + commitResp, err := fixture.shClient.Commit(commitCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, commitReq) + require.NoError(t, err) + require.False(t, commitResp.Stored) + require.Contains(t, strings.ToLower(commitResp.Error), "stale epoch") + }) + + t.Run("DuplicateChallengeReplay", func(t *testing.T) { + 
challengeID := newSelfHealingChallengeID("sh-e2e-duplicate") + req := buildSelfHealingRequest(challengeID, 0, fixture) + + reqCtx, cancelReq := context.WithTimeout(ctx, 30*time.Second) + defer cancelReq() + + firstResp, err := fixture.shClient.Request(reqCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, req) + require.NoError(t, err) + require.True(t, firstResp.Accepted, "initial request rejected: %s", firstResp.Error) + require.True(t, strings.EqualFold(firstResp.ReconstructedHashHex, fixture.registeredHashHex), "first request reconstructed hash mismatch") + + replayResp, err := fixture.shClient.Request(reqCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, req) + require.NoError(t, err) + if replayResp.Accepted { + require.True(t, strings.EqualFold(replayResp.ReconstructedHashHex, firstResp.ReconstructedHashHex), "replay reconstructed hash diverged") + require.True(t, strings.EqualFold(replayResp.ReconstructedHashHex, fixture.registeredHashHex), "replay reconstructed hash mismatch with action hash") + } else { + require.NotEmpty(t, replayResp.Error, "replay rejection must include a reason") + } + }) + + t.Run("RecipientDownRequestFails", func(t *testing.T) { + stopProcessWithTimeout(fixture.cmds[1], 3*time.Second) + require.NoError(t, waitForTCPUnavailable(fixture.recipient.GRPCAddr, 10*time.Second), "recipient gRPC endpoint should be down before request") + + challengeID := newSelfHealingChallengeID("sh-e2e-recipient-down") + req := buildSelfHealingRequest(challengeID, 0, fixture) + reqCtx, cancelReq := context.WithTimeout(ctx, 10*time.Second) + defer cancelReq() + + _, err := fixture.shClient.Request(reqCtx, fixture.recipient.Identity, fixture.recipient.GRPCAddr, req) + require.Error(t, err, "request should fail while recipient is down") + t.Logf("self-healing recipient down request error=%v", err) + }) +} + +func setupSelfHealingFixture(t *testing.T) *selfHealingFixture { + t.Helper() + + os.Setenv("INTEGRATION_TEST", "true") + 
os.Setenv("INTEGRATION_TEST_ENV", "true") + t.Cleanup(func() { + os.Unsetenv("INTEGRATION_TEST") + os.Unsetenv("INTEGRATION_TEST_ENV") + }) + + sut.ModifyGenesisJSON(t, SetStakingBondDenomUlume(t), SetActionParams(t), SetSupernodeMetricsParams(t)) + sut.StartChain(t) + + cli := NewLumeradCLI(t, sut, true) + registerSelfHealingSupernodes(t, cli) + + // Keep supernodes above eligibility floor so CASCADE registration can finalize deterministically. + cli.FundAddress(shNode0Identity, "2000000ulume") + cli.FundAddressWithNode(shNode1Identity, "2000000ulume", "node1") + cli.FundAddressWithNode(shNode2Identity, "2000000ulume", "node2") + + cmds := StartAllSupernodes(t) + t.Cleanup(func() { + StopAllSupernodes(cmds) + }) + + require.NoError(t, waitForSupernodeGatewaysReady(30*time.Second), "supernode gateways did not become ready") + + userAddress := cli.AddKeyFromSeed(shUserKeyName, shUserMnemonic) + cli.FundAddress(userAddress, "1000000ulume") + sut.AwaitNextBlock(t) + + memKR, err := snkeyring.InitKeyring(snconfig.KeyringConfig{Backend: "memory"}) + require.NoError(t, err) + _, err = snkeyring.RecoverAccountFromMnemonic(memKR, shUserKeyName, shUserMnemonic) + require.NoError(t, err) + + const lumeraGRPCAddr = "localhost:9090" + const lumeraChainID = "testing" + + userLumeraCfg, err := lumera.NewConfig(lumeraGRPCAddr, lumeraChainID, shUserKeyName, memKR) + require.NoError(t, err) + userLumera, err := lumera.NewClient(context.Background(), userLumeraCfg) + require.NoError(t, err) + t.Cleanup(func() { + _ = userLumera.Close() + }) + + actionClient, err := action.NewClient( + context.Background(), + sdkconfig.Config{ + Account: sdkconfig.AccountConfig{ + KeyName: shUserKeyName, + Keyring: memKR, + }, + Lumera: sdkconfig.LumeraConfig{ + GRPCAddr: lumeraGRPCAddr, + ChainID: lumeraChainID, + }, + }, + nil, + ) + require.NoError(t, err) + + ctx := context.Background() + testFilePath := filepath.Join(WorkDir, "test.txt") + + cascadeMeta, price, expiration, err := 
actionClient.BuildCascadeMetadataFromFile(ctx, testFilePath, false, "") + require.NoError(t, err) + startSig, err := actionClient.GenerateStartCascadeSignatureFromFile(ctx, testFilePath) + require.NoError(t, err) + + metadataBz, err := json.Marshal(cascadeMeta) + require.NoError(t, err) + + fixtureStat, err := os.Stat(testFilePath) + require.NoError(t, err) + fileSizeKbs := int64(0) + if fixtureStat.Size() > 0 { + fileSizeKbs = (fixtureStat.Size() + 1023) / 1024 + } + + reqResp, err := userLumera.ActionMsg().RequestAction( + ctx, + "CASCADE", + string(metadataBz), + price, + expiration, + strconv.FormatInt(fileSizeKbs, 10), + ) + require.NoError(t, err) + require.NotNil(t, reqResp) + require.NotNil(t, reqResp.TxResponse) + require.Zero(t, reqResp.TxResponse.Code) + + txHash := reqResp.TxResponse.TxHash + require.NotEmpty(t, txHash) + sut.AwaitNextBlock(t) + + txResp := cli.CustomQuery("q", "tx", txHash) + actionID := extractActionIDFromTxQuery(txResp) + require.NotEmpty(t, actionID) + + _, err = actionClient.StartCascade(ctx, testFilePath, actionID, startSig) + require.NoError(t, err) + require.NoError(t, waitForActionFinalizedStateWithClient(ctx, userLumera, actionID)) + + actionResp, err := userLumera.Action().GetAction(ctx, actionID) + require.NoError(t, err) + require.NotNil(t, actionResp) + require.NotNil(t, actionResp.Action) + + cmeta, err := cascadekit.UnmarshalCascadeMetadata(actionResp.Action.Metadata) + require.NoError(t, err) + registeredHashRaw, err := base64.StdEncoding.DecodeString(strings.TrimSpace(cmeta.DataHash)) + require.NoError(t, err) + require.NotEmpty(t, registeredHashRaw) + + registeredHashHex := hex.EncodeToString(registeredHashRaw) + fileKey := pickAnchorKey(cmeta.RqIdsIds) + require.NotEmpty(t, fileKey) + t.Logf("self-healing fixture prepared action_id=%s file_key=%s registered_action_hash=%s", actionID, fileKey, registeredHashHex) + + node0DiskKR, err := snkeyring.InitKeyring(snconfig.KeyringConfig{ + Backend: "test", + Dir: 
filepath.Join(WorkDir, "supernode-data1", "keys"), + }) + require.NoError(t, err) + + node0LumeraCfg, err := lumera.NewConfig(lumeraGRPCAddr, lumeraChainID, shNode0KeyName, node0DiskKR) + require.NoError(t, err) + node0Lumera, err := lumera.NewClient(context.Background(), node0LumeraCfg) + require.NoError(t, err) + t.Cleanup(func() { + _ = node0Lumera.Close() + }) + + shClient, err := newSelfHealingRPCClient(node0Lumera, node0DiskKR, shNode0Identity) + require.NoError(t, err) + + nodes := []selfHealingNode{ + {Identity: shNode0Identity}, + {Identity: shNode1Identity}, + {Identity: shNode2Identity}, + } + for i := range nodes { + nodes[i].GRPCAddr = mustGetSupernodeLatestAddr(t, userLumera, nodes[i].Identity) + } + + // Keep roles deterministic and distinct in system tests. + recipient := nodes[1] + observer := nodes[2] + t.Logf("self-healing role selection recipient=%s observer=%s", recipient.Identity, observer.Identity) + + return &selfHealingFixture{ + cli: cli, + userLumera: userLumera, + shClient: shClient, + nodes: nodes, + recipient: recipient, + observer: observer, + actionID: actionID, + fileKey: fileKey, + registeredHashHex: registeredHashHex, + cmds: cmds, + } +} + +func registerSelfHealingSupernodes(t *testing.T, cli *LumeradCli) { + t.Helper() + type reg struct { + nodeKey string + grpcPort string + address string + p2pPort string + } + + nodes := []reg{ + {nodeKey: "node0", grpcPort: "4444", address: shNode0Identity, p2pPort: "4445"}, + {nodeKey: "node1", grpcPort: "4446", address: shNode1Identity, p2pPort: "4447"}, + {nodeKey: "node2", grpcPort: "4448", address: shNode2Identity, p2pPort: "4449"}, + } + + for _, n := range nodes { + valAddr := strings.TrimSpace(cli.Keys("keys", "show", n.nodeKey, "--bech", "val", "-a")) + require.NotEmpty(t, valAddr) + resp := cli.CustomCommand( + "tx", "supernode", "register-supernode", + valAddr, + "localhost:"+n.grpcPort, + n.address, + "--p2p-port", n.p2pPort, + "--from", n.nodeKey, + ) + RequireTxSuccess(t, resp) 
+ sut.AwaitNextBlock(t) + } +} + +func waitForActionFinalizedStateWithClient(ctx context.Context, client lumera.Client, actionID string) error { + for i := 0; i < actionStateRetries; i++ { + resp, err := client.Action().GetAction(ctx, actionID) + if err == nil && resp != nil && resp.Action != nil { + if resp.Action.State == actiontypes.ActionStateDone || resp.Action.State == actiontypes.ActionStateApproved { + return nil + } + } + time.Sleep(actionStateDelay) + } + return fmt.Errorf("action %s did not reach a finalized state (%s/%s)", actionID, actiontypes.ActionStateDone.String(), actiontypes.ActionStateApproved.String()) +} + +func extractActionIDFromTxQuery(txResp string) string { + events := gjson.Get(txResp, "events").Array() + for _, event := range events { + if event.Get("type").String() != "action_registered" { + continue + } + attrs := event.Get("attributes").Array() + for _, attr := range attrs { + if attr.Get("key").String() == "action_id" { + return attr.Get("value").String() + } + } + } + return "" +} + +func mustGetSupernodeLatestAddr(t *testing.T, client lumera.Client, identity string) string { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + defer cancel() + info, err := client.SuperNode().GetSupernodeWithLatestAddress(ctx, identity) + require.NoError(t, err) + require.NotNil(t, info) + addr := strings.TrimSpace(info.LatestAddress) + require.NotEmpty(t, addr) + return addr +} + +func pickAnchorKey(keys []string) string { + anchor := "" + for _, raw := range keys { + key := strings.TrimSpace(raw) + if key == "" { + continue + } + if anchor == "" || key < anchor { + anchor = key + } + } + return anchor +} + +func waitForSupernodeGatewaysReady(timeout time.Duration) error { + deadline := time.Now().Add(timeout) + urls := []string{ + "http://localhost:8002/api/v1/status", + "http://localhost:8003/api/v1/status", + "http://localhost:8004/api/v1/status", + } + + for time.Now().Before(deadline) { + allReady := true 
+ for _, u := range urls { + resp, err := http.Get(u) //nolint:gosec + if err != nil { + allReady = false + break + } + body, rerr := io.ReadAll(resp.Body) + _ = resp.Body.Close() + if rerr != nil || resp.StatusCode >= 400 || len(body) == 0 { + allReady = false + break + } + if !gjson.ValidBytes(body) { + allReady = false + break + } + } + if allReady { + return nil + } + time.Sleep(1 * time.Second) + } + + return fmt.Errorf("timed out waiting for supernode gateways readiness") +} + +func newSelfHealingChallengeID(prefix string) string { + return fmt.Sprintf("%s-%d", prefix, time.Now().UnixNano()) +} + +func buildSelfHealingRequest(challengeID string, epochID uint64, f *selfHealingFixture) *pb.RequestSelfHealingRequest { + return &pb.RequestSelfHealingRequest{ + ChallengeId: challengeID, + EpochId: epochID, + FileKey: f.fileKey, + ChallengerId: shNode0Identity, + RecipientId: f.recipient.Identity, + ObserverIds: []string{f.observer.Identity}, + ActionId: f.actionID, + } +} + +func buildSelfHealingVerifyRequest(challengeID string, epochID uint64, reconstructedHashHex string, f *selfHealingFixture) *pb.VerifySelfHealingRequest { + return &pb.VerifySelfHealingRequest{ + ChallengeId: challengeID, + EpochId: epochID, + FileKey: f.fileKey, + RecipientId: f.recipient.Identity, + ReconstructedHashHex: reconstructedHashHex, + ObserverId: f.observer.Identity, + ActionId: f.actionID, + } +} + +func buildSelfHealingCommitRequest(challengeID string, epochID uint64, f *selfHealingFixture) *pb.CommitSelfHealingRequest { + return &pb.CommitSelfHealingRequest{ + ChallengeId: challengeID, + EpochId: epochID, + FileKey: f.fileKey, + ActionId: f.actionID, + ChallengerId: shNode0Identity, + RecipientId: f.recipient.Identity, + } +} + +func newSelfHealingRPCClient(lumeraClient lumera.Client, kr keyring.Keyring, localIdentity string) (*selfHealingRPCClient, error) { + validator := lumera.NewSecureKeyExchangeValidator(lumeraClient) + grpcCreds, err := 
credentials.NewClientCreds(&credentials.ClientOptions{ + CommonOptions: credentials.CommonOptions{ + Keyring: kr, + LocalIdentity: localIdentity, + PeerType: securekeyx.Supernode, + Validator: validator, + }, + }) + if err != nil { + return nil, err + } + opts := grpcclient.DefaultClientOptions() + opts.EnableRetries = true + return &selfHealingRPCClient{client: grpcclient.NewClient(grpcCreds), opts: opts}, nil +} + +func (c *selfHealingRPCClient) Request(ctx context.Context, remoteIdentity string, address string, req *pb.RequestSelfHealingRequest) (*pb.RequestSelfHealingResponse, error) { + conn, err := c.client.Connect(ctx, fmt.Sprintf("%s@%s", strings.TrimSpace(remoteIdentity), address), c.opts) + if err != nil { + return nil, err + } + defer conn.Close() + return pb.NewSelfHealingServiceClient(conn).RequestSelfHealing(ctx, req) +} + +func (c *selfHealingRPCClient) Verify(ctx context.Context, remoteIdentity string, address string, req *pb.VerifySelfHealingRequest) (*pb.VerifySelfHealingResponse, error) { + conn, err := c.client.Connect(ctx, fmt.Sprintf("%s@%s", strings.TrimSpace(remoteIdentity), address), c.opts) + if err != nil { + return nil, err + } + defer conn.Close() + return pb.NewSelfHealingServiceClient(conn).VerifySelfHealing(ctx, req) +} + +func (c *selfHealingRPCClient) Commit(ctx context.Context, remoteIdentity string, address string, req *pb.CommitSelfHealingRequest) (*pb.CommitSelfHealingResponse, error) { + conn, err := c.client.Connect(ctx, fmt.Sprintf("%s@%s", strings.TrimSpace(remoteIdentity), address), c.opts) + if err != nil { + return nil, err + } + defer conn.Close() + return pb.NewSelfHealingServiceClient(conn).CommitSelfHealing(ctx, req) +} + +func waitForTCPUnavailable(address string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + conn, err := net.DialTimeout("tcp", address, 500*time.Millisecond) + if err != nil { + return nil + } + _ = conn.Close() + time.Sleep(250 * 
time.Millisecond) + } + return fmt.Errorf("tcp endpoint still reachable: %s", address) +} + +func tamperHexChar(hashHex string) string { + trimmed := strings.TrimSpace(strings.ToLower(hashHex)) + if trimmed == "" { + return "0" + } + if trimmed[0] == '0' { + return "1" + trimmed[1:] + } + return "0" + trimmed[1:] +} diff --git a/tests/system/main_test.go b/tests/system/main_test.go index 67c97efa..884a16de 100644 --- a/tests/system/main_test.go +++ b/tests/system/main_test.go @@ -54,6 +54,11 @@ func TestMain(m *testing.M) { if *rebuild { sut.BuildNewBinary() } + + if err := performSystemTestRuntimeCleanup(*nodesCount, verbose); err != nil { + panic(fmt.Sprintf("system test preflight cleanup failed: %v", err)) + } + // setup chain and keyring sut.SetupChain() @@ -62,6 +67,9 @@ func TestMain(m *testing.M) { // postprocess sut.StopChain() + if err := performSystemTestRuntimeCleanup(*nodesCount, verbose); err != nil { + fmt.Fprintf(os.Stderr, "system test post-run cleanup warning: %v\n", err) + } if verbose || exitCode != 0 { sut.PrintBuffer() printResultFlag(exitCode == 0) diff --git a/tests/system/runtime_cleanup.go b/tests/system/runtime_cleanup.go new file mode 100644 index 00000000..2c78be29 --- /dev/null +++ b/tests/system/runtime_cleanup.go @@ -0,0 +1,247 @@ +package system + +import ( + "bufio" + "bytes" + "fmt" + "os" + "os/exec" + "path/filepath" + "sort" + "strconv" + "strings" + "syscall" + "time" +) + +const systemTestHardCleanupEnv = "SYSTEM_TEST_HARD_CLEANUP" + +type listeningProcess struct { + PID int + Command string + Port int +} + +type processPortSet struct { + Command string + Ports map[int]struct{} +} + +func performSystemTestRuntimeCleanup(nodesCount int, verbose bool) error { + if !isSystemTestHardCleanupEnabled() { + return nil + } + + lsofPath, err := exec.LookPath("lsof") + if err != nil { + if verbose { + fmt.Printf("system tests: skipping hard cleanup (%s not found)\n", "lsof") + } + return nil + } + + ports := 
systemTestReservedPorts(nodesCount) + targetsByPID := make(map[int]*processPortSet) + blockers := make([]string, 0) + + for _, port := range ports { + listeners, err := listListeningProcesses(lsofPath, port) + if err != nil { + return fmt.Errorf("list listeners on port %d: %w", port, err) + } + for _, p := range listeners { + cmd := normalizeCommandName(p.Command) + if !isSystemTestProcess(cmd) { + blockers = append(blockers, fmt.Sprintf("port %d occupied by non-test process %q (pid=%d)", port, cmd, p.PID)) + continue + } + + meta, ok := targetsByPID[p.PID] + if !ok { + meta = &processPortSet{Command: cmd, Ports: make(map[int]struct{})} + targetsByPID[p.PID] = meta + } + meta.Ports[port] = struct{}{} + } + } + + if len(blockers) > 0 { + sort.Strings(blockers) + return fmt.Errorf("%s", strings.Join(blockers, "; ")) + } + + pids := make([]int, 0, len(targetsByPID)) + for pid := range targetsByPID { + pids = append(pids, pid) + } + sort.Ints(pids) + + for _, pid := range pids { + meta := targetsByPID[pid] + if verbose { + fmt.Printf("system tests: cleaning stale process pid=%d cmd=%s ports=%v\n", pid, meta.Command, sortedPorts(meta.Ports)) + } + if err := terminateProcess(pid, 2*time.Second); err != nil { + return fmt.Errorf("terminate stale process pid=%d cmd=%s: %w", pid, meta.Command, err) + } + } + + return nil +} + +func isSystemTestHardCleanupEnabled() bool { + v := strings.TrimSpace(strings.ToLower(os.Getenv(systemTestHardCleanupEnv))) + switch v { + case "", "1", "true", "yes", "on": + return true + default: + return false + } +} + +func systemTestReservedPorts(nodesCount int) []int { + if nodesCount <= 0 { + nodesCount = 4 + } + + ports := make(map[int]struct{}) + addRange := func(start, end int) { + for p := start; p <= end; p++ { + ports[p] = struct{}{} + } + } + + // lumerad testnet ports (safety ranges include small offsets for prior failed runs) + addRange(16656, 16656+nodesCount+2) // CometBFT P2P + addRange(26656, 26656+nodesCount+2) // CometBFT 
P2P/RPC defaults + addRange(26657, 26657+nodesCount+2) // CometBFT RPC + addRange(9090, 9090+nodesCount+2) // gRPC + addRange(1317, 1317+nodesCount+2) // API + addRange(6060, 6060+nodesCount+2) // pprof + addRange(7180, 7180+nodesCount+2) // telemetry + + // supernode system-test ports + addRange(4444, 4449) + addRange(8002, 8004) + ports[50051] = struct{}{} // optional RaptorQ service + + out := make([]int, 0, len(ports)) + for p := range ports { + out = append(out, p) + } + sort.Ints(out) + return out +} + +func listListeningProcesses(lsofPath string, port int) ([]listeningProcess, error) { + cmd := exec.Command(lsofPath, "-nP", fmt.Sprintf("-iTCP:%d", port), "-sTCP:LISTEN", "-Fpc") + out, err := cmd.Output() + if err != nil { + // lsof exits non-zero when no matches are found. + if _, ok := err.(*exec.ExitError); ok { + return nil, nil + } + return nil, err + } + + byPID := make(map[int]listeningProcess) + currentPID := 0 + scanner := bufio.NewScanner(bytes.NewReader(out)) + for scanner.Scan() { + line := scanner.Text() + if len(line) < 2 { + continue + } + switch line[0] { + case 'p': + pid, convErr := strconv.Atoi(line[1:]) + if convErr != nil { + currentPID = 0 + continue + } + currentPID = pid + if _, exists := byPID[pid]; !exists { + byPID[pid] = listeningProcess{PID: pid, Port: port} + } + case 'c': + if currentPID == 0 { + continue + } + p := byPID[currentPID] + p.Command = line[1:] + byPID[currentPID] = p + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + + listeners := make([]listeningProcess, 0, len(byPID)) + for _, p := range byPID { + listeners = append(listeners, p) + } + sort.Slice(listeners, func(i, j int) bool { + return listeners[i].PID < listeners[j].PID + }) + return listeners, nil +} + +func normalizeCommandName(cmd string) string { + cmd = strings.TrimSpace(cmd) + if cmd == "" { + return "" + } + cmd = strings.Fields(cmd)[0] + return filepath.Base(cmd) +} + +func isSystemTestProcess(command string) bool { + return 
strings.HasPrefix(command, "lumerad") || strings.HasPrefix(command, "supernode") +} + +func terminateProcess(pid int, timeout time.Duration) error { + proc, err := os.FindProcess(pid) + if err != nil { + return err + } + + _ = proc.Signal(syscall.SIGTERM) + if waitForProcessExit(pid, timeout) { + return nil + } + + _ = proc.Signal(syscall.SIGKILL) + if waitForProcessExit(pid, timeout) { + return nil + } + + return fmt.Errorf("process %d did not exit", pid) +} + +func waitForProcessExit(pid int, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if !processExists(pid) { + return true + } + time.Sleep(100 * time.Millisecond) + } + return !processExists(pid) +} + +func processExists(pid int) bool { + err := syscall.Kill(pid, 0) + if err == nil || err == syscall.EPERM { + return true + } + return false +} + +func sortedPorts(portSet map[int]struct{}) []int { + ports := make([]int, 0, len(portSet)) + for p := range portSet { + ports = append(ports, p) + } + sort.Ints(ports) + return ports +} diff --git a/tests/system/supernode-utils.go b/tests/system/supernode-utils.go index 4d74d372..5ea33223 100644 --- a/tests/system/supernode-utils.go +++ b/tests/system/supernode-utils.go @@ -6,6 +6,7 @@ import ( "os" "os/exec" "path/filepath" + "syscall" "testing" "time" ) @@ -34,6 +35,10 @@ func StartAllSupernodes(t *testing.T) []*exec.Cmd { // Start each supernode for i, dataDir := range dataDirs { + if err := resetSupernodeRuntimeState(dataDir); err != nil { + t.Fatalf("failed to clean runtime state for %s: %v", dataDir, err) + } + binPath := filepath.Join(dataDir, "supernode") // Ensure the binary exists @@ -85,8 +90,49 @@ func StartAllSupernodes(t *testing.T) []*exec.Cmd { func StopAllSupernodes(cmds []*exec.Cmd) { for _, cmd := range cmds { if cmd != nil && cmd.Process != nil { - _ = cmd.Process.Kill() - _, _ = cmd.Process.Wait() + stopProcessWithTimeout(cmd, 3*time.Second) } } } + +func resetSupernodeRuntimeState(dataDir 
string) error { + volatileDirs := []string{ + "data", + "raptorq_files", + "raptorq_files_test", + } + + for _, rel := range volatileDirs { + if err := os.RemoveAll(filepath.Join(dataDir, rel)); err != nil { + return err + } + } + + return nil +} + +func stopProcessWithTimeout(cmd *exec.Cmd, timeout time.Duration) { + if cmd == nil || cmd.Process == nil { + return + } + + done := make(chan struct{}) + go func() { + _, _ = cmd.Process.Wait() + close(done) + }() + + _ = cmd.Process.Signal(syscall.SIGTERM) + + select { + case <-done: + return + case <-time.After(timeout): + } + + _ = cmd.Process.Kill() + select { + case <-done: + case <-time.After(2 * time.Second): + } +}