diff --git a/core/consumer/registry.go b/core/consumer/registry.go index ce817293..0c232b3e 100644 --- a/core/consumer/registry.go +++ b/core/consumer/registry.go @@ -30,6 +30,8 @@ type TopicKey string const ( // TopicKeyStart is the pipeline stage where new requests arrive from the gateway. TopicKeyStart TopicKey = "start" + // TopicKeyCancel is the pipeline stage where cancellation requests arrive from the gateway. + TopicKeyCancel TopicKey = "cancel" // TopicKeyValidate is the pipeline stage where requests are published for validation. TopicKeyValidate TopicKey = "validate" // TopicKeyBatch is the pipeline stage where validated requests are published for batching. diff --git a/core/request/log.go b/core/request/log.go index 4a163edd..e84891d1 100644 --- a/core/request/log.go +++ b/core/request/log.go @@ -25,13 +25,20 @@ import ( // PublishLog publishes a single request log entry to the log topic for async persistence. // The partitionKey ensures ordering of log entries for the same request; typically set to the request ID. +// +// The message ID is scoped to (requestID, status) so that the queue's +// (topic, partition_key, id) unique index dedupes retries of the same logical +// log event (same delivery re-processed) without rejecting distinct statuses +// for the same request (e.g. "started" emitted by the start controller and +// "cancelled" emitted later by the cancel controller). func PublishLog(ctx context.Context, registry consumer.TopicRegistry, logEntry entity.RequestLog, partitionKey string) error { payload, err := logEntry.ToBytes() if err != nil { return fmt.Errorf("failed to serialize request log: %w", err) } - msg := entityqueue.NewMessage(logEntry.RequestID, payload, partitionKey, nil) + msgID := fmt.Sprintf("%s/%s", logEntry.RequestID, logEntry.Status) + msg := entityqueue.NewMessage(msgID, payload, partitionKey, nil) q, ok := registry.Queue(consumer.TopicKeyLog) if !ok { diff --git a/core/request/log_test.go b/core/request/log_test.go index ac53e499..9e9ff1db 100644 --- a/core/request/log_test.go +++ b/core/request/log_test.go @@ -113,3 +113,51 @@ func TestPublishBatchLogs_Empty(t *testing.T) { err := PublishBatchLogs(context.Background(), registry, nil, entity.RequestStatusScored, nil) require.NoError(t, err) } + +// TestPublishLog_MessageIDScopedByStatus locks in the queue-id scheme: +// distinct statuses for the same request must produce distinct message IDs so +// the queue's (topic, partition_key, id) uniqueness check does not reject the +// second publish, while the same status emitted twice (retry of the same +// delivery) must produce the same message ID so the queue dedupes it. +// +// Regression test for the duplicate-key crash where the orchestrator cancel +// controller could not emit a `cancelled` log entry because the start +// controller had already published `started` for the same request. +func TestPublishLog_MessageIDScopedByStatus(t *testing.T) { + ctrl := gomock.NewController(t) + + var ids []string + mockPub := queuemock.NewMockPublisher(ctrl) + mockPub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ string, msg queue.Message) error { + ids = append(ids, msg.ID) + return nil + }, + ).AnyTimes() + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{{Key: consumer.TopicKeyLog, Name: "log", Queue: mockQ}}, + ) + require.NoError(t, err) + + // Three distinct statuses for the same request. + for _, st := range []entity.RequestStatus{ + entity.RequestStatusStarted, + entity.RequestStatusCancelling, + entity.RequestStatusCancelled, + } { + require.NoError(t, PublishLog(context.Background(), registry, + entity.NewRequestLog("req/1", st, 0, "", nil), "req/1")) + } + // Re-emit "started" to simulate a retry of the same delivery — must reuse the same ID. + require.NoError(t, PublishLog(context.Background(), registry, + entity.NewRequestLog("req/1", entity.RequestStatusStarted, 0, "", nil), "req/1")) + + require.Equal(t, []string{ + "req/1/started", + "req/1/cancelling", + "req/1/cancelled", + "req/1/started", + }, ids) +} diff --git a/entity/BUILD.bazel b/entity/BUILD.bazel index 5231f436..6925a16a 100644 --- a/entity/BUILD.bazel +++ b/entity/BUILD.bazel @@ -6,6 +6,7 @@ go_library( "batch.go", "batch_dependent.go", "build.go", + "cancel_request.go", "change_provider.go", "change_record.go", "land_request.go", @@ -23,6 +24,7 @@ go_test( srcs = [ "batch_test.go", "build_test.go", + "cancel_request_test.go", "land_request_test.go", "request_log_test.go", "request_test.go", diff --git a/entity/batch.go b/entity/batch.go index 7ab7bc84..6ff709de 100644 --- a/entity/batch.go +++ b/entity/batch.go @@ -34,12 +34,24 @@ const ( BatchStateFailed BatchState = "failed" // BatchStateScored is the state of a batch that has been scored for build success probability. BatchStateScored BatchState = "scored" + // BatchStateCancelling is the non-terminal intent state set when a cancel has been requested but the + // batch has not yet been transitioned to BatchStateCancelled. A batch in this state may still reach + // BatchStateSucceeded or BatchStateFailed if a concurrent merge wins the race (e.g. the push had + // already completed before the cancel CAS observed the batch); those terminal states prevail. + // Forward-progress controllers must treat this state as halted (no new work). The speculate + // controller owns the transition to the terminal BatchStateCancelled and the downstream fan-out + // (cancelling in-flight builds, respeculating dependents, publishing to conclude). + BatchStateCancelling BatchState = "cancelling" // BatchStateCancelled is the terminal state of a batch that was cancelled before completion. BatchStateCancelled BatchState = "cancelled" ) // IsTerminal returns true if the batch state is a terminal state. // Terminal states are states from which no further transitions are possible. +// BatchStateCancelling is intentionally excluded: cancellation is best-effort and a Cancelling batch +// may still transition to BatchStateSucceeded or BatchStateFailed before it reaches BatchStateCancelled. +// Callers that want to gate forward progress (and treat Cancelling as halted) should use +// IsBatchStateHalted instead. func (s BatchState) IsTerminal() bool { switch s { case BatchStateSucceeded, BatchStateFailed, BatchStateCancelled: @@ -49,6 +61,14 @@ func (s BatchState) IsTerminal() bool { } } +// IsBatchStateHalted returns true if the batch is either terminal or in the process of being cancelled. +// Forward-progress controllers (score, build, buildsignal, speculate, merge) use this to short-circuit +// work for batches that the user has asked to cancel — even though Cancelling is non-terminal, no +// further pipeline work should start (cancel will write the terminal state and fan out). +func IsBatchStateHalted(s BatchState) bool { + return s.IsTerminal() || s == BatchStateCancelling +} + // Batch represents a group of requests to land (merge into target branch of the source control repository). type Batch struct { // ID is the globally unique identifier for the batch. Format: "/batch/". diff --git a/entity/cancel_request.go b/entity/cancel_request.go new file mode 100644 index 00000000..3937982a --- /dev/null +++ b/entity/cancel_request.go @@ -0,0 +1,38 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package entity + +import "encoding/json" + +// CancelRequest represents a cancellation request sent over the queue from the gateway to the orchestrator. +// It identifies the request to cancel by its ID and carries an optional human-readable reason for observability. +type CancelRequest struct { + // ID is the globally unique identifier of the request to cancel. Format: "/". + ID string `json:"id"` + // Reason is an optional free-form explanation of why the cancellation was requested. + Reason string `json:"reason"` +} + +// ToBytes serializes the CancelRequest to JSON bytes for queue message payload. +func (r CancelRequest) ToBytes() ([]byte, error) { + return json.Marshal(r) +} + +// CancelRequestFromBytes deserializes a CancelRequest from JSON bytes. +func CancelRequestFromBytes(data []byte) (CancelRequest, error) { + var req CancelRequest + err := json.Unmarshal(data, &req) + return req, err +} diff --git a/entity/cancel_request_test.go b/entity/cancel_request_test.go new file mode 100644 index 00000000..b82b5282 --- /dev/null +++ b/entity/cancel_request_test.go @@ -0,0 +1,55 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package entity + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCancelRequest_SerializationRoundTrip(t *testing.T) { + tests := []struct { + name string + req CancelRequest + }{ + { + name: "with reason", + req: CancelRequest{ID: "queue1/100", Reason: "obsolete change"}, + }, + { + name: "without reason", + req: CancelRequest{ID: "queue2/200"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data, err := tt.req.ToBytes() + require.NoError(t, err) + + deserialized, err := CancelRequestFromBytes(data) + require.NoError(t, err) + + assert.Equal(t, tt.req, deserialized) + }) + } +} + +func TestCancelRequestFromBytes_InvalidJSON(t *testing.T) { + _, err := CancelRequestFromBytes([]byte(`{not json`)) + assert.Error(t, err) +} diff --git a/entity/request.go b/entity/request.go index 8b9e274d..683f29d6 100644 --- a/entity/request.go +++ b/entity/request.go @@ -40,17 +40,42 @@ const ( RequestStateStarted RequestState = "started" // RequestStateValidated indicates that the request has been validated (duplicate check, merge check etc.) successfully. RequestStateValidated RequestState = "validated" + // RequestStateBatched indicates that the request has been claimed by the batch controller and enrolled in a + // batch. The CAS-write of this state by the batch controller is the serialization point between batch and + // cancel: the batch controller transitions Validated → Batched immediately before persisting the new batch, + // so any concurrent cancel that has already transitioned the request to Cancelling will lose the CAS and + // abandon the batch. From this state forward, the request's terminal outcome is owned by the batch it is + // enrolled in (via conclude), not by the cancel controller's request-only fast path. + RequestStateBatched RequestState = "batched" // RequestStateProcessing is the state of a land request that is being processed. RequestStateProcessing RequestState = "processing" // RequestStateLanded is the state of a land request that has been successfully processed and landed. This is the final state. RequestStateLanded RequestState = "landed" // RequestStateError is the state of a land request that has encountered an error. This is the final state. RequestStateError RequestState = "error" + // RequestStateCancelling is the non-terminal intent state set when the user has requested cancellation but the + // request has not yet been transitioned to RequestStateCancelled. A request in this state may still reach + // RequestStateLanded or RequestStateError if a concurrent merge or failure wins the race; those terminal + // states prevail. Forward-progress controllers must treat this state the same as terminal (i.e. do not start + // any new work on the request). + RequestStateCancelling RequestState = "cancelling" + // RequestStateCancelled is the state of a land request that was cancelled by the user before it could land. This is the final state. + RequestStateCancelled RequestState = "cancelled" ) -// IsRequestStateTerminal returns true if the state represents a final, irreversible state (landed or error). +// IsRequestStateTerminal returns true if the state represents a final, irreversible state (landed, error, or cancelled). +// RequestStateCancelling is intentionally excluded: cancellation is best-effort and a Cancelling request may still +// transition to Landed or Error before it reaches Cancelled. Callers that want to gate forward progress (and treat +// Cancelling as halted) should use IsRequestStateHalted instead. func IsRequestStateTerminal(s RequestState) bool { - return s == RequestStateLanded || s == RequestStateError + return s == RequestStateLanded || s == RequestStateError || s == RequestStateCancelled +} + +// IsRequestStateHalted returns true if the request is either terminal or in the process of being cancelled. +// Forward-progress controllers (validate, batch, ...) use this to short-circuit work for requests that the +// user has asked to cancel — even though Cancelling is non-terminal, no further pipeline work should start. +func IsRequestStateHalted(s RequestState) bool { + return IsRequestStateTerminal(s) || s == RequestStateCancelling } // Change represents a code change identified by URIs from a code change provider (e.g., GitHub Pull Request, Phabricator Diff). diff --git a/entity/request_log.go b/entity/request_log.go index 35569778..6dee8839 100644 --- a/entity/request_log.go +++ b/entity/request_log.go @@ -78,6 +78,16 @@ const ( // RequestStatusError indicates that the request has encountered an error. It corresponds to the RequestStateError state. RequestStatusError RequestStatus = "error" + + // RequestStatusCancelling indicates that the user has requested cancellation but the request has not yet transitioned + // to the RequestStateCancelled state. Cancellation is best-effort: a request that has already been merged or that + // races to completion before the cancel propagates through the pipeline may still land. Observers should treat this + // as intent only and rely on RequestStatusCancelled (or RequestStatusLanded) for the terminal outcome. Emitted by + // the gateway when the Cancel RPC is received. + RequestStatusCancelling RequestStatus = "cancelling" + + // RequestStatusCancelled indicates that the request was cancelled by the user before it could land. It corresponds to the RequestStateCancelled state. + RequestStatusCancelled RequestStatus = "cancelled" ) // RequestLog is an append-only record that captures a point-in-time snapshot of a request's status diff --git a/entity/request_test.go b/entity/request_test.go index 32d3ba65..1e8c9072 100644 --- a/entity/request_test.go +++ b/entity/request_test.go @@ -94,6 +94,48 @@ func TestRequestFromBytes_EmptyData(t *testing.T) { assert.Equal(t, int32(0), req.Version) } +func TestIsRequestStateTerminal(t *testing.T) { + tests := []struct { + state RequestState + terminal bool + }{ + {RequestStateUnknown, false}, + {RequestStateStarted, false}, + {RequestStateValidated, false}, + {RequestStateProcessing, false}, + {RequestStateCancelling, false}, // intent only — not terminal + {RequestStateLanded, true}, + {RequestStateError, true}, + {RequestStateCancelled, true}, + } + for _, tt := range tests { + t.Run(string(tt.state), func(t *testing.T) { + assert.Equal(t, tt.terminal, IsRequestStateTerminal(tt.state)) + }) + } +} + +func TestIsRequestStateHalted(t *testing.T) { + tests := []struct { + state RequestState + halted bool + }{ + {RequestStateUnknown, false}, + {RequestStateStarted, false}, + {RequestStateValidated, false}, + {RequestStateProcessing, false}, + {RequestStateCancelling, true}, // intent halts forward progress + {RequestStateLanded, true}, + {RequestStateError, true}, + {RequestStateCancelled, true}, + } + for _, tt := range tests { + t.Run(string(tt.state), func(t *testing.T) { + assert.Equal(t, tt.halted, IsRequestStateHalted(tt.state)) + }) + } +} + func TestRequest_SerializationRoundTrip(t *testing.T) { tests := []struct { name string diff --git a/example/server/gateway/BUILD.bazel b/example/server/gateway/BUILD.bazel index 9367297a..c26007ed 100644 --- a/example/server/gateway/BUILD.bazel +++ b/example/server/gateway/BUILD.bazel @@ -21,7 +21,9 @@ go_library( "@com_github_go_sql_driver_mysql//:mysql", "@com_github_uber_go_tally_v4//:tally", "@org_golang_google_grpc//:grpc", + "@org_golang_google_grpc//codes", "@org_golang_google_grpc//reflection", + "@org_golang_google_grpc//status", "@org_uber_go_zap//:zap", ], ) diff --git a/example/server/gateway/main.go b/example/server/gateway/main.go index 52be614f..3a4850da 100644 --- a/example/server/gateway/main.go +++ b/example/server/gateway/main.go @@ -37,7 +37,9 @@ import ( pb "github.com/uber/submitqueue/gateway/protopb" "go.uber.org/zap" "google.golang.org/grpc" + "google.golang.org/grpc/codes" "google.golang.org/grpc/reflection" + "google.golang.org/grpc/status" ) // GatewayServer wraps the controller and implements the gRPC service interface @@ -45,6 +47,7 @@ type GatewayServer struct { pb.UnimplementedSubmitQueueGatewayServer pingController *controller.PingController landController *controller.LandController + cancelController *controller.CancelController statusController *controller.StatusController } @@ -58,6 +61,11 @@ func (s *GatewayServer) Land(ctx context.Context, req *pb.LandRequest) (*pb.Land return s.landController.Land(ctx, req) } +// Cancel delegates to the controller +func (s *GatewayServer) Cancel(ctx context.Context, req *pb.CancelRequest) (*pb.CancelResponse, error) { + return s.cancelController.Cancel(ctx, req) +} + // Status delegates to the controller func (s *GatewayServer) Status(ctx context.Context, req *pb.StatusRequest) (*pb.StatusResponse, error) { return s.statusController.Status(ctx, req) @@ -167,17 +175,29 @@ func run() error { ) // Build a publish-only topic registry: gateway only feeds the start of the - // orchestrator pipeline (TopicKeyStart). No subscription is configured - // because the gateway never consumes from the queue. + // orchestrator pipeline (TopicKeyStart) and the cancel topic (TopicKeyCancel). + // No subscription is configured because the gateway never consumes from the queue. registry, err := consumer.NewTopicRegistry([]consumer.TopicConfig{ {Key: consumer.TopicKeyStart, Name: "start", Queue: mysqlQueue}, + {Key: consumer.TopicKeyCancel, Name: "cancel", Queue: mysqlQueue}, }) if err != nil { return fmt.Errorf("failed to create topic registry: %w", err) } - // Create gRPC server - grpcServer := grpc.NewServer() + // Create gRPC server with a unary interceptor that translates user-input + // validation errors (anything in the chain that matches controller.ErrInvalidRequest) + // into codes.InvalidArgument so gRPC clients can distinguish bad input from + // infrastructure failures. Other errors pass through unchanged. + grpcServer := grpc.NewServer(grpc.UnaryInterceptor( + func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { + resp, err := handler(ctx, req) + if err != nil && controller.IsInvalidRequest(err) { + return nil, status.Error(codes.InvalidArgument, err.Error()) + } + return resp, err + }, + )) // Initialize request log store from shared app database connection requestLogStore := mysqlstorage.NewRequestLogStore(appDB, scope.SubScope("request_log_store")) @@ -196,10 +216,12 @@ func run() error { // Create controllers and wrap them for gRPC pingController := controller.NewPingController(logger, scope) landController := controller.NewLandController(logger.Sugar(), scope, cnt, requestLogStore, queueConfigs, registry) + cancelController := controller.NewCancelController(logger.Sugar(), scope, requestLogStore, registry) statusController := controller.NewStatusController(logger.Sugar(), scope, requestLogStore) gatewayServer := &GatewayServer{ pingController: pingController, landController: landController, + cancelController: cancelController, statusController: statusController, } diff --git a/example/server/gateway/queues.yaml b/example/server/gateway/queues.yaml index 84d9aa49..3831dc44 100644 --- a/example/server/gateway/queues.yaml +++ b/example/server/gateway/queues.yaml @@ -20,3 +20,12 @@ queues: change_provider: github merge_checker: github land_provider: github + + - name: e2e-cancel-queue + vcs_type: git + vcs_address: git@github.com:uber/submitqueue.git + target: main + build_runner: buildkite.com/uber/submitqueue-ci + change_provider: github + merge_checker: github + land_provider: github diff --git a/example/server/orchestrator/BUILD.bazel b/example/server/orchestrator/BUILD.bazel index 9912c570..144f8ad1 100644 --- a/example/server/orchestrator/BUILD.bazel +++ b/example/server/orchestrator/BUILD.bazel @@ -38,6 +38,7 @@ go_library( "//orchestrator/controller/batch", "//orchestrator/controller/build", "//orchestrator/controller/buildsignal", + "//orchestrator/controller/cancel", "//orchestrator/controller/conclude", "//orchestrator/controller/log", "//orchestrator/controller/merge", diff --git a/example/server/orchestrator/main.go b/example/server/orchestrator/main.go index 53fc315d..48c1fe2b 100644 --- a/example/server/orchestrator/main.go +++ b/example/server/orchestrator/main.go @@ -57,6 +57,7 @@ import ( "github.com/uber/submitqueue/orchestrator/controller/batch" "github.com/uber/submitqueue/orchestrator/controller/build" "github.com/uber/submitqueue/orchestrator/controller/buildsignal" + "github.com/uber/submitqueue/orchestrator/controller/cancel" "github.com/uber/submitqueue/orchestrator/controller/conclude" logctrl "github.com/uber/submitqueue/orchestrator/controller/log" "github.com/uber/submitqueue/orchestrator/controller/merge" @@ -235,7 +236,7 @@ func run() error { return err } - logger.Info("controllers registered", zap.Int("count", 9)) + logger.Info("controllers registered", zap.Int("count", 11)) // Start consumers if err := c.Start(ctx); err != nil { @@ -332,6 +333,14 @@ func newTopicRegistry(q extqueue.Queue, subscriberName string) (consumer.TopicRe subscriberName, "orchestrator-start", ), }, + { + Key: consumer.TopicKeyCancel, + Name: "cancel", + Queue: q, + Subscription: extqueue.DefaultSubscriptionConfig( + subscriberName, "orchestrator-cancel", + ), + }, { Key: consumer.TopicKeyValidate, Name: "validate", @@ -430,6 +439,18 @@ func registerControllers(c consumer.Consumer, logger *zap.SugaredLogger, scope t return fmt.Errorf("failed to register request controller: %w", err) } + cancelController := cancel.NewController( + logger, + scope, + store, + registry, + consumer.TopicKeyCancel, + "orchestrator-cancel", + ) + if err := c.Register(cancelController); err != nil { + return fmt.Errorf("failed to register cancel controller: %w", err) + } + validateController := validate.NewController( logger, scope, diff --git a/gateway/controller/BUILD.bazel b/gateway/controller/BUILD.bazel index b66d43b8..c4a3c4ce 100644 --- a/gateway/controller/BUILD.bazel +++ b/gateway/controller/BUILD.bazel @@ -3,6 +3,7 @@ load("@rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "controller", srcs = [ + "cancel.go", "land.go", "ping.go", "status.go", @@ -28,6 +29,7 @@ go_library( go_test( name = "controller_test", srcs = [ + "cancel_test.go", "land_test.go", "ping_test.go", "status_test.go", diff --git a/gateway/controller/cancel.go b/gateway/controller/cancel.go new file mode 100644 index 00000000..47b2c7a2 --- /dev/null +++ b/gateway/controller/cancel.go @@ -0,0 +1,151 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controller + +import ( + "context" + "fmt" + "time" + + "github.com/uber-go/tally/v4" + "github.com/uber/submitqueue/core/consumer" + "github.com/uber/submitqueue/core/errs" + "github.com/uber/submitqueue/entity" + "github.com/uber/submitqueue/entity/queue" + "github.com/uber/submitqueue/extension/storage" + pb "github.com/uber/submitqueue/gateway/protopb" + "go.uber.org/zap" +) + +// CancelController handles cancel business logic for the gateway. It validates the request, +// records a RequestStatusCancelling log entry (intent-only — cancellation is best-effort +// and may still race a successful merge), publishes a CancelRequest to the cancel topic, +// and returns a response. The orchestrator-side cancel controller performs the actual +// state transitions and emits the terminal RequestStatusCancelled log entry. +type CancelController struct { + logger *zap.SugaredLogger + metricsScope tally.Scope + requestLogStore storage.RequestLogStore + registry consumer.TopicRegistry +} + +// NewCancelController creates a new instance of the gateway cancel controller. +// The controller writes a RequestStatusCancelling log entry through requestLogStore and +// publishes cancel requests to the topic registered under consumer.TopicKeyCancel. +func NewCancelController(logger *zap.SugaredLogger, scope tally.Scope, requestLogStore storage.RequestLogStore, registry consumer.TopicRegistry) *CancelController { + return &CancelController{ + logger: logger, + metricsScope: scope, + requestLogStore: requestLogStore, + registry: registry, + } +} + +// Cancel handles a cancel request and returns an empty response. The actual cancellation +// is performed asynchronously by the orchestrator cancel controller. Cancel is idempotent: +// the orchestrator treats already-terminal requests as a no-op. +// +// Cancellation is best-effort: a request that has already merged or that races to +// completion before the cancel propagates may still land. The RequestStatusCancelling +// entry written here records the user's intent; the terminal outcome is reflected by a +// later RequestStatusCancelled (orchestrator side) or RequestStatusLanded entry. +func (c *CancelController) Cancel(ctx context.Context, req *pb.CancelRequest) (*pb.CancelResponse, error) { + start := time.Now() + defer func() { + c.metricsScope.Timer("cancel_request_latency").Record(time.Since(start)) + }() + + c.metricsScope.Counter("cancel_request_count").Inc(1) + + if req.Sqid == "" { + return nil, fmt.Errorf("CancelController requires the request to have a sqid specified: %w", ErrInvalidRequest) + } + + cancelRequest := entity.CancelRequest{ + ID: req.Sqid, + Reason: req.Reason, + } + + c.logger.Debugw("cancel request received", + "sqid", cancelRequest.ID, + "reason", cancelRequest.Reason, + ) + + // Verify the sqid exists before recording intent or publishing. Cancel is opt-in + // by sqid; an unknown sqid is a user error and must never leave a cancelling log + // row or a queue message behind for a request that never existed. The Land + // controller writes its "accepted" log entry synchronously to the same store, so + // a NotFound here reliably means "this sqid was never accepted by the gateway" + // rather than "in flight" — there is no false-negative race window. + if _, err := c.requestLogStore.List(ctx, cancelRequest.ID); err != nil { + if storage.IsNotFound(err) { + c.metricsScope.Counter("cancel_request_not_found").Inc(1) + return nil, errs.NewUserError(&RequestNotFoundError{Sqid: cancelRequest.ID}) + } + return nil, fmt.Errorf("CancelController failed to look up request log for sqid=%s: %w", cancelRequest.ID, err) + } + + // Record the user's intent in the request log before publishing. Writing direct to the + // store (rather than via the log topic) keeps the gateway-emitted entry consistent with + // the Land "accepted" entry and guarantees the entry is visible the moment Cancel returns. + metadata := map[string]string{} + if cancelRequest.Reason != "" { + metadata["reason"] = cancelRequest.Reason + } + logEntry := entity.NewRequestLog(cancelRequest.ID, entity.RequestStatusCancelling, 0, "", metadata) + if err := c.requestLogStore.Insert(ctx, logEntry); err != nil { + return nil, fmt.Errorf("CancelController failed to insert cancelling log for sqid=%s: %w", cancelRequest.ID, err) + } + + if err := c.publishToQueue(ctx, cancelRequest); err != nil { + return nil, fmt.Errorf("CancelController failed to publish cancel request to queue: %w", err) + } + + c.logger.Infow("cancel request published to queue", + "sqid", cancelRequest.ID, + "topic_key", consumer.TopicKeyCancel, + ) + c.metricsScope.Counter("cancel_publish_success").Inc(1) + + return &pb.CancelResponse{}, nil +} + +// publishToQueue publishes a cancel request to the cancel queue for async processing. +func (c *CancelController) publishToQueue(ctx context.Context, cancelRequest entity.CancelRequest) error { + payload, err := cancelRequest.ToBytes() + if err != nil { + return fmt.Errorf("failed to serialize cancel request: %w", err) + } + + // Partition by the sqid so retries and reorderings on the same request are serialised. + // TODO: figure best way to ID and partition the message according to new guidelines on queue usage + msg := queue.NewMessage(cancelRequest.ID, payload, cancelRequest.ID, nil) + + q, ok := c.registry.Queue(consumer.TopicKeyCancel) + if !ok { + return fmt.Errorf("no queue registered for topic key %s", consumer.TopicKeyCancel) + } + + topicName, ok := c.registry.TopicName(consumer.TopicKeyCancel) + if !ok { + return fmt.Errorf("no topic name registered for topic key %s", consumer.TopicKeyCancel) + } + + if err := q.Publisher().Publish(ctx, topicName, msg); err != nil { + return fmt.Errorf("failed to publish cancel request message: %w", err) + } + + return nil +} diff --git a/gateway/controller/cancel_test.go b/gateway/controller/cancel_test.go new file mode 100644 index 00000000..efb7e014 --- /dev/null +++ b/gateway/controller/cancel_test.go @@ -0,0 +1,248 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controller + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/uber-go/tally/v4" + "github.com/uber/submitqueue/core/consumer" + "github.com/uber/submitqueue/core/errs" + "github.com/uber/submitqueue/entity" + "github.com/uber/submitqueue/entity/queue" + queuemock "github.com/uber/submitqueue/extension/queue/mock" + "github.com/uber/submitqueue/extension/storage" + storagemock "github.com/uber/submitqueue/extension/storage/mock" + pb "github.com/uber/submitqueue/gateway/protopb" + "go.uber.org/mock/gomock" + "go.uber.org/zap" +) + +// newCancelTestRegistry builds a single-entry TopicRegistry for TopicKeyCancel wired +// to a mock Queue/Publisher and returns both the registry and the publisher mock. +func newCancelTestRegistry(t *testing.T, ctrl *gomock.Controller) (consumer.TopicRegistry, *queuemock.MockPublisher) { + t.Helper() + pub := queuemock.NewMockPublisher(ctrl) + q := queuemock.NewMockQueue(ctrl) + q.EXPECT().Publisher().Return(pub).AnyTimes() + + registry, err := consumer.NewTopicRegistry([]consumer.TopicConfig{ + {Key: consumer.TopicKeyCancel, Name: "cancel", Queue: q}, + }) + require.NoError(t, err) + return registry, pub +} + +// newCancelTestRegistryWithNoopPublisher returns a registry whose publisher silently accepts any Publish call. +func newCancelTestRegistryWithNoopPublisher(t *testing.T, ctrl *gomock.Controller) consumer.TopicRegistry { + t.Helper() + registry, pub := newCancelTestRegistry(t, ctrl) + pub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + return registry +} + +// newRequestLogStoreNoop returns a RequestLogStore mock whose List returns a single +// dummy entry (so existence check passes) and whose Insert silently succeeds for any input. +func newRequestLogStoreNoop(t *testing.T, ctrl *gomock.Controller) *storagemock.MockRequestLogStore { + t.Helper() + store := storagemock.NewMockRequestLogStore(ctrl) + store.EXPECT().List(gomock.Any(), gomock.Any()).Return([]entity.RequestLog{{}}, nil).AnyTimes() + store.EXPECT().Insert(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + return store +} + +func TestNewCancelController(t *testing.T) { + ctrl := gomock.NewController(t) + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, newRequestLogStoreNoop(t, ctrl), newCancelTestRegistryWithNoopPublisher(t, ctrl)) + require.NotNil(t, controller) +} + +func TestCancel_HappyPath(t *testing.T) { + ctrl := gomock.NewController(t) + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, newRequestLogStoreNoop(t, ctrl), newCancelTestRegistryWithNoopPublisher(t, ctrl)) + ctx := context.Background() + + req := &pb.CancelRequest{Sqid: "test-queue/42", Reason: "user changed their mind"} + resp, err := controller.Cancel(ctx, req) + + require.NoError(t, err) + require.NotNil(t, resp) +} + +func TestCancel_ReturnsErrorOnEmptySqid(t *testing.T) { + ctrl := gomock.NewController(t) + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, newRequestLogStoreNoop(t, ctrl), newCancelTestRegistryWithNoopPublisher(t, ctrl)) + ctx := context.Background() + + req := &pb.CancelRequest{Sqid: "", Reason: "anything"} + _, err := controller.Cancel(ctx, req) + + require.Error(t, err) + assert.True(t, IsInvalidRequest(err)) +} + +func TestCancel_PublishesToQueue(t *testing.T) { + var publishedTopic string + var publishedMessage queue.Message + + ctrl := gomock.NewController(t) + + registry, publisher := newCancelTestRegistry(t, ctrl) + publisher.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(ctx context.Context, topic string, msg queue.Message) error { + publishedTopic = topic + publishedMessage = msg + return nil + }, + ) + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, newRequestLogStoreNoop(t, ctrl), registry) + ctx := context.Background() + + req := &pb.CancelRequest{Sqid: "my-queue/7", Reason: "obsolete change"} + _, err := controller.Cancel(ctx, req) + require.NoError(t, err) + + assert.Equal(t, "cancel", publishedTopic) + assert.Equal(t, "my-queue/7", publishedMessage.ID) + assert.Equal(t, "my-queue/7", publishedMessage.PartitionKey) + + deserialized, err := entity.CancelRequestFromBytes(publishedMessage.Payload) + require.NoError(t, err) + assert.Equal(t, "my-queue/7", deserialized.ID) + assert.Equal(t, "obsolete change", deserialized.Reason) +} + +// TestCancel_InsertsCancellingLog asserts that Cancel records a RequestStatusCancelling +// log entry (intent) carrying the reason in metadata, and that the entry is written +// before the cancel topic publish so observers see intent the moment Cancel returns. +func TestCancel_InsertsCancellingLog(t *testing.T) { + ctrl := gomock.NewController(t) + + var insertedLog entity.RequestLog + logStore := storagemock.NewMockRequestLogStore(ctrl) + logStore.EXPECT().List(gomock.Any(), "my-queue/42").Return([]entity.RequestLog{{}}, nil) + logStore.EXPECT().Insert(gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, entry entity.RequestLog) error { + insertedLog = entry + return nil + }, + ).Times(1) + + registry, publisher := newCancelTestRegistry(t, ctrl) + insertedBeforePublish := false + publisher.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, _ string, _ queue.Message) error { + insertedBeforePublish = insertedLog.RequestID != "" + return nil + }, + ) + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, logStore, registry) + + req := &pb.CancelRequest{Sqid: "my-queue/42", Reason: "obsolete change"} + _, err := controller.Cancel(context.Background(), req) + require.NoError(t, err) + + assert.Equal(t, "my-queue/42", insertedLog.RequestID) + assert.Equal(t, entity.RequestStatusCancelling, insertedLog.Status) + assert.Equal(t, "obsolete change", insertedLog.Metadata["reason"]) + assert.True(t, insertedBeforePublish, "log entry must be inserted before publish to the cancel topic") +} + +// TestCancel_LogInsertFailure asserts that a failure to insert the Cancelling log entry +// short-circuits the RPC with an error and the cancel topic is never published to. +func TestCancel_LogInsertFailure(t *testing.T) { + ctrl := gomock.NewController(t) + + logStore := storagemock.NewMockRequestLogStore(ctrl) + logStore.EXPECT().List(gomock.Any(), "q/1").Return([]entity.RequestLog{{}}, nil) + logStore.EXPECT().Insert(gomock.Any(), gomock.Any()).Return(fmt.Errorf("db unavailable")) + + registry, publisher := newCancelTestRegistry(t, ctrl) + // No Publish expectation: log insert must fail before publish runs. + _ = publisher + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, logStore, registry) + _, err := controller.Cancel(context.Background(), &pb.CancelRequest{Sqid: "q/1"}) + require.Error(t, err) +} + +func TestCancel_ReturnsErrorOnPublishFailure(t *testing.T) { + ctrl := gomock.NewController(t) + + registry, publisher := newCancelTestRegistry(t, ctrl) + publisher.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).Return(fmt.Errorf("queue unavailable")) + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, newRequestLogStoreNoop(t, ctrl), registry) + ctx := context.Background() + + req := &pb.CancelRequest{Sqid: "test-queue/1"} + _, err := controller.Cancel(ctx, req) + + require.Error(t, err) +} + +// TestCancel_UnknownSqidIsUserError asserts that Cancel for a sqid with no +// request_log history fails fast with a RequestNotFoundError (user error) and +// never inserts a cancelling log row or publishes to the cancel topic. +func TestCancel_UnknownSqidIsUserError(t *testing.T) { + ctrl := gomock.NewController(t) + + logStore := storagemock.NewMockRequestLogStore(ctrl) + logStore.EXPECT().List(gomock.Any(), "ghost/1").Return(nil, storage.ErrNotFound) + // No Insert expectation: existence check must short-circuit before Insert. + + registry, publisher := newCancelTestRegistry(t, ctrl) + // No Publish expectation: existence check must short-circuit before Publish. + _ = publisher + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, logStore, registry) + _, err := controller.Cancel(context.Background(), &pb.CancelRequest{Sqid: "ghost/1"}) + require.Error(t, err) + assert.True(t, IsRequestNotFound(err)) + assert.True(t, errs.IsUserError(err)) + assert.False(t, errs.IsRetryable(err)) + + var typed *RequestNotFoundError + require.ErrorAs(t, err, &typed) + assert.Equal(t, "ghost/1", typed.Sqid) +} + +// TestCancel_RequestLogLookupFailure asserts that an infrastructure failure on +// the existence check propagates as a (non-user) error and skips the rest of +// the pipeline. +func TestCancel_RequestLogLookupFailure(t *testing.T) { + ctrl := gomock.NewController(t) + + logStore := storagemock.NewMockRequestLogStore(ctrl) + logStore.EXPECT().List(gomock.Any(), "q/1").Return(nil, fmt.Errorf("log backend down")) + + registry, publisher := newCancelTestRegistry(t, ctrl) + _ = publisher + + controller := NewCancelController(zap.NewNop().Sugar(), tally.NoopScope, logStore, registry) + _, err := controller.Cancel(context.Background(), &pb.CancelRequest{Sqid: "q/1"}) + require.Error(t, err) + assert.False(t, errs.IsUserError(err)) + assert.False(t, IsRequestNotFound(err)) +} diff --git a/gateway/proto/gateway.proto b/gateway/proto/gateway.proto index 0b191e90..a5e108ae 100644 --- a/gateway/proto/gateway.proto +++ b/gateway/proto/gateway.proto @@ -92,6 +92,22 @@ message LandResponse { string sqid = 1; } +// CancelRequest defines a request to cancel an in-flight land request. If the request is part of a batch, the entire batch is cancelled. +message CancelRequest { + // Globally unique identifier of the land request to cancel, returned by a prior Land call. + string sqid = 1; + // Optional human-readable reason for the cancellation. Recorded for observability. + string reason = 2; +} + +// CancelResponse defines the response to a cancel request. Empty on success. +// +// A successful response indicates only that the cancellation intent was accepted and enqueued — it does NOT confirm +// that the request was actually cancelled. The terminal outcome may still be "landed" or "error" if a race was lost. +// Errors are returned as standard status codes (e.g. InvalidArgument for an empty sqid). +message CancelResponse { +} + // StatusRequest defines a request to look up the current status of a previously submitted request. message StatusRequest { // Globally unique identifier for the request, as returned by Land in the LandResponse. @@ -149,6 +165,19 @@ service SubmitQueueGateway { // The processing is asynchronous and returns a LandResponse immediately. The land request is processed in the background. rpc Land(LandRequest) returns (LandResponse) {} + // Cancel cancels an in-flight land request identified by its sqid. If the request has already been included in a batch, + // the entire batch is cancelled. Cancel is idempotent — calling it for an already-terminal request is a no-op. + // + // Cancellation is ASYNCHRONOUS: Cancel returns as soon as the cancellation intent has been accepted; the actual + // state transition is performed in the background by the orchestrator and may not have completed by the time the + // caller receives a response. + // + // Cancellation is NOT GUARANTEED: a request that has already merged, or that races to completion before the cancel + // signal propagates through the pipeline, may still land (or end in an error). Callers must NOT assume that a + // successful Cancel response means the request was cancelled — the actual terminal outcome (cancelled, landed, or + // error) must be checked through the separate status / request-log API. + rpc Cancel(CancelRequest) returns (CancelResponse) {} + // Status returns the current status of a previously submitted request, identified by its sqid. // The status is eventually consistent with the request store and reconciled from the append-only request log. rpc Status(StatusRequest) returns (StatusResponse) {} diff --git a/gateway/protopb/gateway.pb.go b/gateway/protopb/gateway.pb.go index b457b053..6af0b856 100644 --- a/gateway/protopb/gateway.pb.go +++ b/gateway/protopb/gateway.pb.go @@ -15,7 +15,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.36.10 -// protoc v5.29.3 +// protoc v3.21.12 // source: gateway.proto package protopb @@ -384,6 +384,102 @@ func (x *LandResponse) GetSqid() string { return "" } +// CancelRequest defines a request to cancel an in-flight land request. If the request is part of a batch, the entire batch is cancelled. +type CancelRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Globally unique identifier of the land request to cancel, returned by a prior Land call. + Sqid string `protobuf:"bytes,1,opt,name=sqid,proto3" json:"sqid,omitempty"` + // Optional human-readable reason for the cancellation. Recorded for observability. + Reason string `protobuf:"bytes,2,opt,name=reason,proto3" json:"reason,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CancelRequest) Reset() { + *x = CancelRequest{} + mi := &file_gateway_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CancelRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CancelRequest) ProtoMessage() {} + +func (x *CancelRequest) ProtoReflect() protoreflect.Message { + mi := &file_gateway_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CancelRequest.ProtoReflect.Descriptor instead. +func (*CancelRequest) Descriptor() ([]byte, []int) { + return file_gateway_proto_rawDescGZIP(), []int{5} +} + +func (x *CancelRequest) GetSqid() string { + if x != nil { + return x.Sqid + } + return "" +} + +func (x *CancelRequest) GetReason() string { + if x != nil { + return x.Reason + } + return "" +} + +// CancelResponse defines the response to a cancel request. Empty on success. +// +// A successful response indicates only that the cancellation intent was accepted and enqueued — it does NOT confirm +// that the request was actually cancelled. The terminal outcome may still be "landed" or "error" if a race was lost. +// Errors are returned as standard status codes (e.g. InvalidArgument for an empty sqid). +type CancelResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CancelResponse) Reset() { + *x = CancelResponse{} + mi := &file_gateway_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CancelResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CancelResponse) ProtoMessage() {} + +func (x *CancelResponse) ProtoReflect() protoreflect.Message { + mi := &file_gateway_proto_msgTypes[6] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CancelResponse.ProtoReflect.Descriptor instead. +func (*CancelResponse) Descriptor() ([]byte, []int) { + return file_gateway_proto_rawDescGZIP(), []int{6} +} + // StatusRequest defines a request to look up the current status of a previously submitted request. type StatusRequest struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -395,7 +491,7 @@ type StatusRequest struct { func (x *StatusRequest) Reset() { *x = StatusRequest{} - mi := &file_gateway_proto_msgTypes[5] + mi := &file_gateway_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -407,7 +503,7 @@ func (x *StatusRequest) String() string { func (*StatusRequest) ProtoMessage() {} func (x *StatusRequest) ProtoReflect() protoreflect.Message { - mi := &file_gateway_proto_msgTypes[5] + mi := &file_gateway_proto_msgTypes[7] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -420,7 +516,7 @@ func (x *StatusRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use StatusRequest.ProtoReflect.Descriptor instead. func (*StatusRequest) Descriptor() ([]byte, []int) { - return file_gateway_proto_rawDescGZIP(), []int{5} + return file_gateway_proto_rawDescGZIP(), []int{7} } func (x *StatusRequest) GetSqid() string { @@ -447,7 +543,7 @@ type StatusResponse struct { func (x *StatusResponse) Reset() { *x = StatusResponse{} - mi := &file_gateway_proto_msgTypes[6] + mi := &file_gateway_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -459,7 +555,7 @@ func (x *StatusResponse) String() string { func (*StatusResponse) ProtoMessage() {} func (x *StatusResponse) ProtoReflect() protoreflect.Message { - mi := &file_gateway_proto_msgTypes[6] + mi := &file_gateway_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -472,7 +568,7 @@ func (x *StatusResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use StatusResponse.ProtoReflect.Descriptor instead. func (*StatusResponse) Descriptor() ([]byte, []int) { - return file_gateway_proto_rawDescGZIP(), []int{6} + return file_gateway_proto_rawDescGZIP(), []int{8} } func (x *StatusResponse) GetStatus() string { @@ -507,7 +603,7 @@ type Error struct { func (x *Error) Reset() { *x = Error{} - mi := &file_gateway_proto_msgTypes[7] + mi := &file_gateway_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -519,7 +615,7 @@ func (x *Error) String() string { func (*Error) ProtoMessage() {} func (x *Error) ProtoReflect() protoreflect.Message { - mi := &file_gateway_proto_msgTypes[7] + mi := &file_gateway_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -532,7 +628,7 @@ func (x *Error) ProtoReflect() protoreflect.Message { // Deprecated: Use Error.ProtoReflect.Descriptor instead. func (*Error) Descriptor() ([]byte, []int) { - return file_gateway_proto_rawDescGZIP(), []int{7} + return file_gateway_proto_rawDescGZIP(), []int{9} } func (x *Error) GetMessage() string { @@ -555,7 +651,7 @@ type UnrecognizedQueueError struct { func (x *UnrecognizedQueueError) Reset() { *x = UnrecognizedQueueError{} - mi := &file_gateway_proto_msgTypes[8] + mi := &file_gateway_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -567,7 +663,7 @@ func (x *UnrecognizedQueueError) String() string { func (*UnrecognizedQueueError) ProtoMessage() {} func (x *UnrecognizedQueueError) ProtoReflect() protoreflect.Message { - mi := &file_gateway_proto_msgTypes[8] + mi := &file_gateway_proto_msgTypes[10] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -580,7 +676,7 @@ func (x *UnrecognizedQueueError) ProtoReflect() protoreflect.Message { // Deprecated: Use UnrecognizedQueueError.ProtoReflect.Descriptor instead. func (*UnrecognizedQueueError) Descriptor() ([]byte, []int) { - return file_gateway_proto_rawDescGZIP(), []int{8} + return file_gateway_proto_rawDescGZIP(), []int{10} } func (x *UnrecognizedQueueError) GetError() *Error { @@ -610,7 +706,7 @@ type RequestNotFoundError struct { func (x *RequestNotFoundError) Reset() { *x = RequestNotFoundError{} - mi := &file_gateway_proto_msgTypes[9] + mi := &file_gateway_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -622,7 +718,7 @@ func (x *RequestNotFoundError) String() string { func (*RequestNotFoundError) ProtoMessage() {} func (x *RequestNotFoundError) ProtoReflect() protoreflect.Message { - mi := &file_gateway_proto_msgTypes[9] + mi := &file_gateway_proto_msgTypes[11] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -635,7 +731,7 @@ func (x *RequestNotFoundError) ProtoReflect() protoreflect.Message { // Deprecated: Use RequestNotFoundError.ProtoReflect.Descriptor instead. func (*RequestNotFoundError) Descriptor() ([]byte, []int) { - return file_gateway_proto_rawDescGZIP(), []int{9} + return file_gateway_proto_rawDescGZIP(), []int{11} } func (x *RequestNotFoundError) GetError() *Error { @@ -671,7 +767,11 @@ const file_gateway_proto_rawDesc = "" + "\x06change\x18\x02 \x01(\v2 .uber.submitqueue.gateway.ChangeR\x06change\x12>\n" + "\bstrategy\x18\x04 \x01(\x0e2\".uber.submitqueue.gateway.StrategyR\bstrategy\"\"\n" + "\fLandResponse\x12\x12\n" + - "\x04sqid\x18\x01 \x01(\tR\x04sqid\"#\n" + + "\x04sqid\x18\x01 \x01(\tR\x04sqid\";\n" + + "\rCancelRequest\x12\x12\n" + + "\x04sqid\x18\x01 \x01(\tR\x04sqid\x12\x16\n" + + "\x06reason\x18\x02 \x01(\tR\x06reason\"\x10\n" + + "\x0eCancelResponse\"#\n" + "\rStatusRequest\x12\x12\n" + "\x04sqid\x18\x01 \x01(\tR\x04sqid\"\xd8\x01\n" + "\x0eStatusResponse\x12\x16\n" + @@ -695,10 +795,11 @@ const file_gateway_proto_rawDesc = "" + "\n" + "\x06REBASE\x10\x01\x12\x11\n" + "\rSQUASH_REBASE\x10\x02\x12\t\n" + - "\x05MERGE\x10\x032\xa5\x02\n" + + "\x05MERGE\x10\x032\x84\x03\n" + "\x12SubmitQueueGateway\x12W\n" + "\x04Ping\x12%.uber.submitqueue.gateway.PingRequest\x1a&.uber.submitqueue.gateway.PingResponse\"\x00\x12W\n" + "\x04Land\x12%.uber.submitqueue.gateway.LandRequest\x1a&.uber.submitqueue.gateway.LandResponse\"\x00\x12]\n" + + "\x06Cancel\x12'.uber.submitqueue.gateway.CancelRequest\x1a(.uber.submitqueue.gateway.CancelResponse\"\x00\x12]\n" + "\x06Status\x12'.uber.submitqueue.gateway.StatusRequest\x1a(.uber.submitqueue.gateway.StatusResponse\"\x00B[\n" + "\x1ccom.uber.submitqueue.gatewayB\fGatewayProtoP\x01Z+github.com/uber/submitqueue/gateway/protopbb\x06proto3" @@ -715,7 +816,7 @@ func file_gateway_proto_rawDescGZIP() []byte { } var file_gateway_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_gateway_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var file_gateway_proto_msgTypes = make([]protoimpl.MessageInfo, 13) var file_gateway_proto_goTypes = []any{ (Strategy)(0), // 0: uber.submitqueue.gateway.Strategy (*PingRequest)(nil), // 1: uber.submitqueue.gateway.PingRequest @@ -723,27 +824,31 @@ var file_gateway_proto_goTypes = []any{ (*Change)(nil), // 3: uber.submitqueue.gateway.Change (*LandRequest)(nil), // 4: uber.submitqueue.gateway.LandRequest (*LandResponse)(nil), // 5: uber.submitqueue.gateway.LandResponse - (*StatusRequest)(nil), // 6: uber.submitqueue.gateway.StatusRequest - (*StatusResponse)(nil), // 7: uber.submitqueue.gateway.StatusResponse - (*Error)(nil), // 8: uber.submitqueue.gateway.Error - (*UnrecognizedQueueError)(nil), // 9: uber.submitqueue.gateway.UnrecognizedQueueError - (*RequestNotFoundError)(nil), // 10: uber.submitqueue.gateway.RequestNotFoundError - nil, // 11: uber.submitqueue.gateway.StatusResponse.MetadataEntry + (*CancelRequest)(nil), // 6: uber.submitqueue.gateway.CancelRequest + (*CancelResponse)(nil), // 7: uber.submitqueue.gateway.CancelResponse + (*StatusRequest)(nil), // 8: uber.submitqueue.gateway.StatusRequest + (*StatusResponse)(nil), // 9: uber.submitqueue.gateway.StatusResponse + (*Error)(nil), // 10: uber.submitqueue.gateway.Error + (*UnrecognizedQueueError)(nil), // 11: uber.submitqueue.gateway.UnrecognizedQueueError + (*RequestNotFoundError)(nil), // 12: uber.submitqueue.gateway.RequestNotFoundError + nil, // 13: uber.submitqueue.gateway.StatusResponse.MetadataEntry } var file_gateway_proto_depIdxs = []int32{ 3, // 0: uber.submitqueue.gateway.LandRequest.change:type_name -> uber.submitqueue.gateway.Change 0, // 1: uber.submitqueue.gateway.LandRequest.strategy:type_name -> uber.submitqueue.gateway.Strategy - 11, // 2: uber.submitqueue.gateway.StatusResponse.metadata:type_name -> uber.submitqueue.gateway.StatusResponse.MetadataEntry - 8, // 3: uber.submitqueue.gateway.UnrecognizedQueueError.error:type_name -> uber.submitqueue.gateway.Error - 8, // 4: uber.submitqueue.gateway.RequestNotFoundError.error:type_name -> uber.submitqueue.gateway.Error + 13, // 2: uber.submitqueue.gateway.StatusResponse.metadata:type_name -> uber.submitqueue.gateway.StatusResponse.MetadataEntry + 10, // 3: uber.submitqueue.gateway.UnrecognizedQueueError.error:type_name -> uber.submitqueue.gateway.Error + 10, // 4: uber.submitqueue.gateway.RequestNotFoundError.error:type_name -> uber.submitqueue.gateway.Error 1, // 5: uber.submitqueue.gateway.SubmitQueueGateway.Ping:input_type -> uber.submitqueue.gateway.PingRequest 4, // 6: uber.submitqueue.gateway.SubmitQueueGateway.Land:input_type -> uber.submitqueue.gateway.LandRequest - 6, // 7: uber.submitqueue.gateway.SubmitQueueGateway.Status:input_type -> uber.submitqueue.gateway.StatusRequest - 2, // 8: uber.submitqueue.gateway.SubmitQueueGateway.Ping:output_type -> uber.submitqueue.gateway.PingResponse - 5, // 9: uber.submitqueue.gateway.SubmitQueueGateway.Land:output_type -> uber.submitqueue.gateway.LandResponse - 7, // 10: uber.submitqueue.gateway.SubmitQueueGateway.Status:output_type -> uber.submitqueue.gateway.StatusResponse - 8, // [8:11] is the sub-list for method output_type - 5, // [5:8] is the sub-list for method input_type + 6, // 7: uber.submitqueue.gateway.SubmitQueueGateway.Cancel:input_type -> uber.submitqueue.gateway.CancelRequest + 8, // 8: uber.submitqueue.gateway.SubmitQueueGateway.Status:input_type -> uber.submitqueue.gateway.StatusRequest + 2, // 9: uber.submitqueue.gateway.SubmitQueueGateway.Ping:output_type -> uber.submitqueue.gateway.PingResponse + 5, // 10: uber.submitqueue.gateway.SubmitQueueGateway.Land:output_type -> uber.submitqueue.gateway.LandResponse + 7, // 11: uber.submitqueue.gateway.SubmitQueueGateway.Cancel:output_type -> uber.submitqueue.gateway.CancelResponse + 9, // 12: uber.submitqueue.gateway.SubmitQueueGateway.Status:output_type -> uber.submitqueue.gateway.StatusResponse + 9, // [9:13] is the sub-list for method output_type + 5, // [5:9] is the sub-list for method input_type 5, // [5:5] is the sub-list for extension type_name 5, // [5:5] is the sub-list for extension extendee 0, // [0:5] is the sub-list for field type_name @@ -760,7 +865,7 @@ func file_gateway_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_gateway_proto_rawDesc), len(file_gateway_proto_rawDesc)), NumEnums: 1, - NumMessages: 11, + NumMessages: 13, NumExtensions: 0, NumServices: 1, }, diff --git a/gateway/protopb/gateway.pb.yarpc.go b/gateway/protopb/gateway.pb.yarpc.go index 5d660969..57dc7f22 100644 --- a/gateway/protopb/gateway.pb.yarpc.go +++ b/gateway/protopb/gateway.pb.yarpc.go @@ -24,6 +24,7 @@ var _ = ioutil.NopCloser type SubmitQueueGatewayYARPCClient interface { Ping(context.Context, *PingRequest, ...yarpc.CallOption) (*PingResponse, error) Land(context.Context, *LandRequest, ...yarpc.CallOption) (*LandResponse, error) + Cancel(context.Context, *CancelRequest, ...yarpc.CallOption) (*CancelResponse, error) Status(context.Context, *StatusRequest, ...yarpc.CallOption) (*StatusResponse, error) } @@ -47,6 +48,7 @@ func NewSubmitQueueGatewayYARPCClient(clientConfig transport.ClientConfig, optio type SubmitQueueGatewayYARPCServer interface { Ping(context.Context, *PingRequest) (*PingResponse, error) Land(context.Context, *LandRequest) (*LandResponse, error) + Cancel(context.Context, *CancelRequest) (*CancelResponse, error) Status(context.Context, *StatusRequest) (*StatusResponse, error) } @@ -81,6 +83,16 @@ func buildSubmitQueueGatewayYARPCProcedures(params buildSubmitQueueGatewayYARPCP }, ), }, + { + MethodName: "Cancel", + Handler: protobuf.NewUnaryHandler( + protobuf.UnaryHandlerParams{ + Handle: handler.Cancel, + NewRequest: newSubmitQueueGatewayServiceCancelYARPCRequest, + AnyResolver: params.AnyResolver, + }, + ), + }, { MethodName: "Status", Handler: protobuf.NewUnaryHandler( @@ -233,6 +245,18 @@ func (c *_SubmitQueueGatewayYARPCCaller) Land(ctx context.Context, request *Land return response, err } +func (c *_SubmitQueueGatewayYARPCCaller) Cancel(ctx context.Context, request *CancelRequest, options ...yarpc.CallOption) (*CancelResponse, error) { + responseMessage, err := c.streamClient.Call(ctx, "Cancel", request, newSubmitQueueGatewayServiceCancelYARPCResponse, options...) + if responseMessage == nil { + return nil, err + } + response, ok := responseMessage.(*CancelResponse) + if !ok { + return nil, protobuf.CastError(emptySubmitQueueGatewayServiceCancelYARPCResponse, responseMessage) + } + return response, err +} + func (c *_SubmitQueueGatewayYARPCCaller) Status(ctx context.Context, request *StatusRequest, options ...yarpc.CallOption) (*StatusResponse, error) { responseMessage, err := c.streamClient.Call(ctx, "Status", request, newSubmitQueueGatewayServiceStatusYARPCResponse, options...) if responseMessage == nil { @@ -281,6 +305,22 @@ func (h *_SubmitQueueGatewayYARPCHandler) Land(ctx context.Context, requestMessa return response, err } +func (h *_SubmitQueueGatewayYARPCHandler) Cancel(ctx context.Context, requestMessage proto.Message) (proto.Message, error) { + var request *CancelRequest + var ok bool + if requestMessage != nil { + request, ok = requestMessage.(*CancelRequest) + if !ok { + return nil, protobuf.CastError(emptySubmitQueueGatewayServiceCancelYARPCRequest, requestMessage) + } + } + response, err := h.server.Cancel(ctx, request) + if response == nil { + return nil, err + } + return response, err +} + func (h *_SubmitQueueGatewayYARPCHandler) Status(ctx context.Context, requestMessage proto.Message) (proto.Message, error) { var request *StatusRequest var ok bool @@ -313,6 +353,14 @@ func newSubmitQueueGatewayServiceLandYARPCResponse() proto.Message { return &LandResponse{} } +func newSubmitQueueGatewayServiceCancelYARPCRequest() proto.Message { + return &CancelRequest{} +} + +func newSubmitQueueGatewayServiceCancelYARPCResponse() proto.Message { + return &CancelResponse{} +} + func newSubmitQueueGatewayServiceStatusYARPCRequest() proto.Message { return &StatusRequest{} } @@ -326,6 +374,8 @@ var ( emptySubmitQueueGatewayServicePingYARPCResponse = &PingResponse{} emptySubmitQueueGatewayServiceLandYARPCRequest = &LandRequest{} emptySubmitQueueGatewayServiceLandYARPCResponse = &LandResponse{} + emptySubmitQueueGatewayServiceCancelYARPCRequest = &CancelRequest{} + emptySubmitQueueGatewayServiceCancelYARPCResponse = &CancelResponse{} emptySubmitQueueGatewayServiceStatusYARPCRequest = &StatusRequest{} emptySubmitQueueGatewayServiceStatusYARPCResponse = &StatusResponse{} ) @@ -333,45 +383,47 @@ var ( var yarpcFileDescriptorClosuref1a937782ebbded5 = [][]byte{ // gateway.proto []byte{ - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x54, 0xe1, 0x6b, 0xd3, 0x40, - 0x14, 0x6f, 0x92, 0x36, 0x6b, 0x5f, 0xda, 0x51, 0x8f, 0x31, 0xc2, 0x98, 0xd8, 0x9d, 0xe8, 0x8a, - 0x42, 0x0a, 0x15, 0x65, 0x28, 0x08, 0x9d, 0x66, 0xf3, 0xc3, 0x36, 0xb6, 0xc4, 0x21, 0x28, 0x32, - 0xae, 0xed, 0x91, 0x05, 0x97, 0xa4, 0xcd, 0x5d, 0x26, 0xf5, 0xbb, 0x7f, 0x86, 0x1f, 0xfd, 0x9f, - 0xfc, 0x73, 0x24, 0x77, 0x97, 0x36, 0x82, 0xe9, 0x04, 0xbf, 0xdd, 0x7b, 0xf9, 0xfd, 0xde, 0xfb, - 0xbd, 0x7b, 0xbf, 0x0b, 0x74, 0x02, 0xc2, 0xe9, 0x57, 0xb2, 0x70, 0x66, 0x69, 0xc2, 0x13, 0x64, - 0x67, 0x63, 0x9a, 0x3a, 0x2c, 0x1b, 0x47, 0x21, 0x9f, 0x67, 0x34, 0xa3, 0x8e, 0xfa, 0x8e, 0xf7, - 0xc1, 0x3a, 0x0f, 0xe3, 0xc0, 0xa3, 0xf3, 0x8c, 0x32, 0x8e, 0x6c, 0xd8, 0x88, 0x28, 0x63, 0x24, - 0xa0, 0xb6, 0xd6, 0xd3, 0xfa, 0x2d, 0xaf, 0x08, 0xf1, 0x77, 0x0d, 0xda, 0x12, 0xc9, 0x66, 0x49, - 0xcc, 0x68, 0x35, 0x14, 0xed, 0x41, 0x9b, 0xd1, 0xf4, 0x36, 0x9c, 0xd0, 0xab, 0x98, 0x44, 0xd4, - 0xd6, 0xc5, 0x67, 0x4b, 0xe5, 0xce, 0x48, 0x44, 0xd1, 0x2e, 0xb4, 0x78, 0x18, 0x51, 0xc6, 0x49, - 0x34, 0xb3, 0x8d, 0x9e, 0xd6, 0x37, 0xbc, 0x55, 0x02, 0xed, 0x40, 0xf3, 0x3a, 0x61, 0x5c, 0x90, - 0xeb, 0x82, 0xbc, 0x8c, 0xf1, 0x2e, 0x98, 0x6f, 0xae, 0x49, 0x1c, 0x50, 0x84, 0xa0, 0x9e, 0xa5, - 0x21, 0xb3, 0xb5, 0x9e, 0xd1, 0x6f, 0x79, 0xe2, 0x8c, 0x7f, 0x68, 0x60, 0x9d, 0x90, 0x78, 0x5a, - 0xcc, 0xb3, 0x05, 0x0d, 0x31, 0xaf, 0x92, 0x28, 0x03, 0x74, 0x00, 0xe6, 0x44, 0xd4, 0x10, 0xd2, - 0xac, 0x61, 0xcf, 0xa9, 0xba, 0x1f, 0x47, 0xf6, 0xf2, 0x14, 0x1e, 0xbd, 0x86, 0x26, 0xe3, 0x29, - 0xe1, 0x34, 0x58, 0x08, 0x65, 0x9b, 0x43, 0x5c, 0xcd, 0xf5, 0x15, 0xd2, 0x5b, 0x72, 0x30, 0x86, - 0xb6, 0x94, 0xa7, 0x2e, 0x11, 0x41, 0x9d, 0xcd, 0xc3, 0xa9, 0x92, 0x27, 0xce, 0xf8, 0x21, 0x74, - 0x7c, 0x4e, 0x78, 0xc6, 0x8a, 0x21, 0xfe, 0x06, 0xfa, 0xa5, 0xc1, 0x66, 0x81, 0x52, 0xb5, 0xb6, - 0xc1, 0x64, 0x22, 0xa3, 0x80, 0x2a, 0x42, 0xf7, 0x01, 0x6e, 0x08, 0xe3, 0x57, 0x34, 0x4d, 0x93, - 0x54, 0x2d, 0xa3, 0x95, 0x67, 0xdc, 0x3c, 0x81, 0x3c, 0x68, 0x46, 0x94, 0x93, 0x29, 0xe1, 0xc4, - 0x36, 0x7a, 0x46, 0xdf, 0x1a, 0xbe, 0x58, 0x37, 0x52, 0xb9, 0xa5, 0x73, 0xaa, 0x88, 0x6e, 0xcc, - 0xd3, 0x85, 0xb7, 0xac, 0xb3, 0xf3, 0x0a, 0x3a, 0x7f, 0x7c, 0x42, 0x5d, 0x30, 0xbe, 0xd0, 0x85, - 0x12, 0x96, 0x1f, 0xf3, 0xcd, 0xdc, 0x92, 0x9b, 0xac, 0x70, 0x87, 0x0c, 0x5e, 0xea, 0x07, 0x1a, - 0xde, 0x83, 0x86, 0x54, 0x56, 0x6d, 0x46, 0x0a, 0xdb, 0x97, 0x71, 0x4a, 0x27, 0x49, 0x10, 0x87, - 0xdf, 0xe8, 0xf4, 0x22, 0xd7, 0x28, 0x39, 0xcf, 0xa1, 0x21, 0xe7, 0xd4, 0xc4, 0x66, 0x1f, 0x54, - 0x8f, 0x22, 0xf0, 0x9e, 0x44, 0xaf, 0x7c, 0xa2, 0x97, 0x7c, 0x82, 0x09, 0x6c, 0xa9, 0x1d, 0x9c, - 0x25, 0xfc, 0x28, 0xc9, 0xe2, 0xe9, 0x7f, 0x35, 0x29, 0xf6, 0xa8, 0xaf, 0xf6, 0xf8, 0x64, 0x04, - 0xcd, 0xc2, 0x26, 0xc8, 0x82, 0x8d, 0xb7, 0xee, 0xd1, 0xe8, 0xf2, 0xe4, 0x7d, 0xb7, 0x86, 0x00, - 0x4c, 0xcf, 0x3d, 0x1c, 0xf9, 0x6e, 0x57, 0x43, 0xf7, 0xa0, 0xe3, 0x5f, 0x5c, 0x8e, 0xfc, 0x77, - 0x57, 0x2a, 0xa5, 0xa3, 0x16, 0x34, 0x4e, 0x5d, 0xef, 0xd8, 0xed, 0x1a, 0xc3, 0x9f, 0x3a, 0x20, - 0x5f, 0xf4, 0x16, 0xf7, 0x70, 0x2c, 0x5b, 0xa3, 0x0f, 0x50, 0xcf, 0xdf, 0x2b, 0x7a, 0x54, 0xad, - 0xae, 0xf4, 0xf2, 0x77, 0x1e, 0xdf, 0x05, 0x93, 0x2b, 0xc7, 0xb5, 0xbc, 0x70, 0xee, 0xe1, 0x75, - 0x85, 0x4b, 0x4f, 0x70, 0x5d, 0xe1, 0xf2, 0x53, 0xc0, 0x35, 0xf4, 0x19, 0x4c, 0xe9, 0x2f, 0xb4, - 0x7f, 0xb7, 0x03, 0x65, 0xf1, 0xfe, 0xbf, 0x5a, 0x15, 0xd7, 0x0e, 0x3f, 0xc1, 0xee, 0x24, 0x89, - 0x2a, 0x09, 0x87, 0x6d, 0x75, 0x73, 0xe7, 0xf9, 0x2f, 0xf3, 0x5c, 0xfb, 0xf8, 0x34, 0x08, 0xf9, - 0x75, 0x36, 0x76, 0x26, 0x49, 0x34, 0xc8, 0x49, 0x83, 0x12, 0x69, 0xa0, 0x48, 0x03, 0xf1, 0x7f, - 0x9d, 0x8d, 0xc7, 0xa6, 0x38, 0x3c, 0xfb, 0x1d, 0x00, 0x00, 0xff, 0xff, 0xe4, 0xd1, 0x9a, 0x74, - 0x79, 0x05, 0x00, 0x00, + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x55, 0x6f, 0x6b, 0xd3, 0x40, + 0x18, 0x6f, 0x9a, 0x36, 0x6b, 0x9f, 0xb6, 0xa3, 0x1e, 0x63, 0x84, 0x32, 0xb1, 0x3b, 0xd1, 0x15, + 0x85, 0x14, 0x2a, 0xca, 0x70, 0x20, 0x74, 0x33, 0x9b, 0x2f, 0xb6, 0xb1, 0x25, 0x0e, 0x41, 0x91, + 0x71, 0x4d, 0x8f, 0x2c, 0xb8, 0x24, 0x5d, 0xee, 0x32, 0xa9, 0xaf, 0xf5, 0x63, 0xf8, 0xbd, 0xfc, + 0x38, 0x92, 0xbb, 0xcb, 0x96, 0x82, 0xc9, 0x5e, 0xf8, 0xee, 0x9e, 0xe7, 0x7e, 0xbf, 0xe7, 0xf9, + 0xdd, 0xf3, 0x27, 0x81, 0x9e, 0x4f, 0x38, 0xfd, 0x4e, 0x96, 0xd6, 0x22, 0x89, 0x79, 0x8c, 0xcc, + 0x74, 0x46, 0x13, 0x8b, 0xa5, 0xb3, 0x30, 0xe0, 0x37, 0x29, 0x4d, 0xa9, 0xa5, 0xee, 0xf1, 0x0e, + 0x74, 0xce, 0x82, 0xc8, 0x77, 0xe8, 0x4d, 0x4a, 0x19, 0x47, 0x26, 0xac, 0x85, 0x94, 0x31, 0xe2, + 0x53, 0x53, 0x1b, 0x6a, 0xa3, 0xb6, 0x93, 0x9b, 0xf8, 0x97, 0x06, 0x5d, 0x89, 0x64, 0x8b, 0x38, + 0x62, 0xb4, 0x1c, 0x8a, 0xb6, 0xa1, 0xcb, 0x68, 0x72, 0x1b, 0x78, 0xf4, 0x32, 0x22, 0x21, 0x35, + 0xeb, 0xe2, 0xba, 0xa3, 0x7c, 0xa7, 0x24, 0xa4, 0x68, 0x0b, 0xda, 0x3c, 0x08, 0x29, 0xe3, 0x24, + 0x5c, 0x98, 0xfa, 0x50, 0x1b, 0xe9, 0xce, 0xbd, 0x03, 0x0d, 0xa0, 0x75, 0x15, 0x33, 0x2e, 0xc8, + 0x0d, 0x41, 0xbe, 0xb3, 0xf1, 0x16, 0x18, 0x07, 0x57, 0x24, 0xf2, 0x29, 0x42, 0xd0, 0x48, 0x93, + 0x80, 0x99, 0xda, 0x50, 0x1f, 0xb5, 0x1d, 0x71, 0xc6, 0xbf, 0x35, 0xe8, 0x1c, 0x93, 0x68, 0x9e, + 0xbf, 0x67, 0x03, 0x9a, 0xe2, 0xbd, 0x4a, 0xa2, 0x34, 0xd0, 0x2e, 0x18, 0x9e, 0x88, 0x21, 0xa4, + 0x75, 0x26, 0x43, 0xab, 0xac, 0x3e, 0x96, 0xcc, 0xe5, 0x28, 0x3c, 0x7a, 0x07, 0x2d, 0xc6, 0x13, + 0xc2, 0xa9, 0xbf, 0x14, 0xca, 0xd6, 0x27, 0xb8, 0x9c, 0xeb, 0x2a, 0xa4, 0x73, 0xc7, 0xc1, 0x18, + 0xba, 0x52, 0x9e, 0x2a, 0x22, 0x82, 0x06, 0xbb, 0x09, 0xe6, 0x4a, 0x9e, 0x38, 0xe3, 0x3d, 0xe8, + 0x1d, 0x90, 0xc8, 0xa3, 0xd7, 0xf9, 0x23, 0xfe, 0x01, 0x42, 0x9b, 0x60, 0x24, 0x94, 0xb0, 0x38, + 0x52, 0xd5, 0x55, 0x16, 0xee, 0xc3, 0x7a, 0x4e, 0x96, 0x29, 0xf0, 0x53, 0xe8, 0xb9, 0x9c, 0xf0, + 0x94, 0x55, 0x84, 0xc3, 0x7f, 0x34, 0x58, 0xcf, 0x51, 0x4a, 0xda, 0x26, 0x18, 0x4c, 0x78, 0x14, + 0x50, 0x59, 0xe8, 0x31, 0xc0, 0x35, 0x61, 0xfc, 0x92, 0x26, 0x49, 0x9c, 0xa8, 0xec, 0xed, 0xcc, + 0x63, 0x67, 0x0e, 0xe4, 0x40, 0x2b, 0xa4, 0x9c, 0xcc, 0x09, 0x27, 0xa6, 0x3e, 0xd4, 0x47, 0x9d, + 0xc9, 0x9b, 0xaa, 0x0a, 0x15, 0x53, 0x5a, 0x27, 0x8a, 0x68, 0x47, 0x3c, 0x59, 0x3a, 0x77, 0x71, + 0x06, 0x7b, 0xd0, 0x5b, 0xb9, 0x42, 0x7d, 0xd0, 0xbf, 0xd1, 0xa5, 0x12, 0x96, 0x1d, 0xb3, 0x46, + 0xdf, 0x92, 0xeb, 0x34, 0x1f, 0x36, 0x69, 0xbc, 0xad, 0xef, 0x6a, 0x78, 0x1b, 0x9a, 0x52, 0x59, + 0xf9, 0x6c, 0x53, 0xd8, 0xbc, 0x88, 0x12, 0xea, 0xc5, 0x7e, 0x14, 0xfc, 0xa0, 0xf3, 0xf3, 0x4c, + 0xa3, 0xe4, 0xbc, 0x86, 0xa6, 0x7c, 0xa7, 0x26, 0x06, 0xe5, 0x49, 0xf9, 0x53, 0x04, 0xde, 0x91, + 0xe8, 0xfb, 0xb1, 0xab, 0x17, 0xc6, 0x0e, 0x13, 0xd8, 0x50, 0x3d, 0x38, 0x8d, 0xf9, 0x61, 0x9c, + 0x46, 0xf3, 0xff, 0x4a, 0x92, 0xf7, 0xb1, 0x7e, 0xdf, 0xc7, 0x17, 0x53, 0x68, 0xe5, 0x53, 0x87, + 0x3a, 0xb0, 0xf6, 0xde, 0x3e, 0x9c, 0x5e, 0x1c, 0x7f, 0xec, 0xd7, 0x10, 0x80, 0xe1, 0xd8, 0xfb, + 0x53, 0xd7, 0xee, 0x6b, 0xe8, 0x11, 0xf4, 0xdc, 0xf3, 0x8b, 0xa9, 0xfb, 0xe1, 0x52, 0xb9, 0xea, + 0xa8, 0x0d, 0xcd, 0x13, 0xdb, 0x39, 0xb2, 0xfb, 0xfa, 0xe4, 0xa7, 0x0e, 0xc8, 0x15, 0xb9, 0x45, + 0x1d, 0x8e, 0x64, 0x6a, 0xf4, 0x09, 0x1a, 0xd9, 0xfa, 0xa3, 0x67, 0xe5, 0xea, 0x0a, 0x1f, 0x92, + 0xc1, 0xf3, 0x87, 0x60, 0x6a, 0x3a, 0x6b, 0x59, 0xe0, 0x6c, 0x25, 0xaa, 0x02, 0x17, 0x36, 0xba, + 0x2a, 0x70, 0x71, 0xb3, 0x70, 0x0d, 0x7d, 0x05, 0x43, 0xae, 0x02, 0xda, 0xa9, 0xd8, 0xef, 0xe2, + 0xa6, 0x0d, 0x46, 0x0f, 0x03, 0x8b, 0xe1, 0xe5, 0xf8, 0x56, 0x85, 0x5f, 0xd9, 0xbc, 0xaa, 0xf0, + 0xab, 0x9b, 0x80, 0x6b, 0xfb, 0x5f, 0x60, 0xcb, 0x8b, 0xc3, 0x52, 0xc2, 0x7e, 0x57, 0x35, 0xe6, + 0x2c, 0xfb, 0xc0, 0x9f, 0x69, 0x9f, 0x5f, 0xfa, 0x01, 0xbf, 0x4a, 0x67, 0x96, 0x17, 0x87, 0xe3, + 0x8c, 0x34, 0x2e, 0x90, 0xc6, 0x8a, 0x34, 0x16, 0x7f, 0x83, 0xc5, 0x6c, 0x66, 0x88, 0xc3, 0xab, + 0xbf, 0x01, 0x00, 0x00, 0xff, 0xff, 0x0f, 0xb5, 0x0b, 0x6e, 0x27, 0x06, 0x00, 0x00, }, } diff --git a/gateway/protopb/gateway_grpc.pb.go b/gateway/protopb/gateway_grpc.pb.go index 5e2cb594..192e55a9 100644 --- a/gateway/protopb/gateway_grpc.pb.go +++ b/gateway/protopb/gateway_grpc.pb.go @@ -15,7 +15,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.5.1 -// - protoc v5.29.3 +// - protoc v3.21.12 // source: gateway.proto package protopb @@ -36,6 +36,7 @@ const _ = grpc.SupportPackageIsVersion9 const ( SubmitQueueGateway_Ping_FullMethodName = "/uber.submitqueue.gateway.SubmitQueueGateway/Ping" SubmitQueueGateway_Land_FullMethodName = "/uber.submitqueue.gateway.SubmitQueueGateway/Land" + SubmitQueueGateway_Cancel_FullMethodName = "/uber.submitqueue.gateway.SubmitQueueGateway/Cancel" SubmitQueueGateway_Status_FullMethodName = "/uber.submitqueue.gateway.SubmitQueueGateway/Status" ) @@ -50,6 +51,18 @@ type SubmitQueueGatewayClient interface { // Land lands a set of code changes into a target branch, performing the necessary validations across all other changes in the queue. // The processing is asynchronous and returns a LandResponse immediately. The land request is processed in the background. Land(ctx context.Context, in *LandRequest, opts ...grpc.CallOption) (*LandResponse, error) + // Cancel cancels an in-flight land request identified by its sqid. If the request has already been included in a batch, + // the entire batch is cancelled. Cancel is idempotent — calling it for an already-terminal request is a no-op. + // + // Cancellation is ASYNCHRONOUS: Cancel returns as soon as the cancellation intent has been accepted; the actual + // state transition is performed in the background by the orchestrator and may not have completed by the time the + // caller receives a response. + // + // Cancellation is NOT GUARANTEED: a request that has already merged, or that races to completion before the cancel + // signal propagates through the pipeline, may still land (or end in an error). Callers must NOT assume that a + // successful Cancel response means the request was cancelled — the actual terminal outcome (cancelled, landed, or + // error) must be checked through the separate status / request-log API. + Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) // Status returns the current status of a previously submitted request, identified by its sqid. // The status is eventually consistent with the request store and reconciled from the append-only request log. Status(ctx context.Context, in *StatusRequest, opts ...grpc.CallOption) (*StatusResponse, error) @@ -83,6 +96,16 @@ func (c *submitQueueGatewayClient) Land(ctx context.Context, in *LandRequest, op return out, nil } +func (c *submitQueueGatewayClient) Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(CancelResponse) + err := c.cc.Invoke(ctx, SubmitQueueGateway_Cancel_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + func (c *submitQueueGatewayClient) Status(ctx context.Context, in *StatusRequest, opts ...grpc.CallOption) (*StatusResponse, error) { cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) out := new(StatusResponse) @@ -104,6 +127,18 @@ type SubmitQueueGatewayServer interface { // Land lands a set of code changes into a target branch, performing the necessary validations across all other changes in the queue. // The processing is asynchronous and returns a LandResponse immediately. The land request is processed in the background. Land(context.Context, *LandRequest) (*LandResponse, error) + // Cancel cancels an in-flight land request identified by its sqid. If the request has already been included in a batch, + // the entire batch is cancelled. Cancel is idempotent — calling it for an already-terminal request is a no-op. + // + // Cancellation is ASYNCHRONOUS: Cancel returns as soon as the cancellation intent has been accepted; the actual + // state transition is performed in the background by the orchestrator and may not have completed by the time the + // caller receives a response. + // + // Cancellation is NOT GUARANTEED: a request that has already merged, or that races to completion before the cancel + // signal propagates through the pipeline, may still land (or end in an error). Callers must NOT assume that a + // successful Cancel response means the request was cancelled — the actual terminal outcome (cancelled, landed, or + // error) must be checked through the separate status / request-log API. + Cancel(context.Context, *CancelRequest) (*CancelResponse, error) // Status returns the current status of a previously submitted request, identified by its sqid. // The status is eventually consistent with the request store and reconciled from the append-only request log. Status(context.Context, *StatusRequest) (*StatusResponse, error) @@ -123,6 +158,9 @@ func (UnimplementedSubmitQueueGatewayServer) Ping(context.Context, *PingRequest) func (UnimplementedSubmitQueueGatewayServer) Land(context.Context, *LandRequest) (*LandResponse, error) { return nil, status.Errorf(codes.Unimplemented, "method Land not implemented") } +func (UnimplementedSubmitQueueGatewayServer) Cancel(context.Context, *CancelRequest) (*CancelResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method Cancel not implemented") +} func (UnimplementedSubmitQueueGatewayServer) Status(context.Context, *StatusRequest) (*StatusResponse, error) { return nil, status.Errorf(codes.Unimplemented, "method Status not implemented") } @@ -183,6 +221,24 @@ func _SubmitQueueGateway_Land_Handler(srv interface{}, ctx context.Context, dec return interceptor(ctx, in, info, handler) } +func _SubmitQueueGateway_Cancel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(CancelRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(SubmitQueueGatewayServer).Cancel(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: SubmitQueueGateway_Cancel_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(SubmitQueueGatewayServer).Cancel(ctx, req.(*CancelRequest)) + } + return interceptor(ctx, in, info, handler) +} + func _SubmitQueueGateway_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(StatusRequest) if err := dec(in); err != nil { @@ -216,6 +272,10 @@ var SubmitQueueGateway_ServiceDesc = grpc.ServiceDesc{ MethodName: "Land", Handler: _SubmitQueueGateway_Land_Handler, }, + { + MethodName: "Cancel", + Handler: _SubmitQueueGateway_Cancel_Handler, + }, { MethodName: "Status", Handler: _SubmitQueueGateway_Status_Handler, diff --git a/orchestrator/controller/batch/BUILD.bazel b/orchestrator/controller/batch/BUILD.bazel index 8f629c53..d46674e6 100644 --- a/orchestrator/controller/batch/BUILD.bazel +++ b/orchestrator/controller/batch/BUILD.bazel @@ -31,6 +31,7 @@ go_test( "//extension/conflict/mock", "//extension/counter/mock", "//extension/queue/mock", + "//extension/storage", "//extension/storage/mock", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", diff --git a/orchestrator/controller/batch/batch.go b/orchestrator/controller/batch/batch.go index 23b2e0ef..2d8b8eb3 100644 --- a/orchestrator/controller/batch/batch.go +++ b/orchestrator/controller/batch/batch.go @@ -16,6 +16,7 @@ package batch import ( "context" + "errors" "fmt" "github.com/uber-go/tally/v4" @@ -103,6 +104,18 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r "partition_key", msg.PartitionKey, ) + // Short-circuit if the request has been halted — either it already reached a + // terminal state, or the cancel controller has recorded a cancellation intent + // (RequestStateCancelling). A halted request must never spawn a new batch. + if entity.IsRequestStateHalted(request.State) { + c.metricsScope.Counter("skipped_halted").Inc(1) + c.logger.Infow("skipping batch for halted request", + "request_id", request.ID, + "state", string(request.State), + ) + return nil + } + // TODO: if capacity is full, wait here for other requests to accumulate to batch them together, or include a request into an existing batch if it's not too late. // Generate a globally unique batch ID. @@ -184,6 +197,79 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r return fmt.Errorf("failed to create batch dependent index for new batchID=%s: %w", batch.ID, err) } + // Claim the request for this batch with a CAS-write that transitions the + // request to RequestStateBatched. This CAS is the serialization point + // between the batch controller and the cancel controller — without it, the + // two would race over an empty interleaving and produce an orphan batch + // containing a cancelled request. + // + // Concrete race that this CAS closes (T1..T7 are wall-clock orderings of + // independent batch- and cancel-controller goroutines): + // + // T1 batch.Get(R) → R{State: Validated, Version: 1} + // T2 cancel.Get(R) → R{State: Validated, Version: 1} + // T3 cancel.markCancelling CAS 1→2 → R{State: Cancelling, Version: 2} + // T4 cancel.findActiveBatch(R) → none (batch has not been Created yet) + // T5 cancel.cancelRequest CAS 2→3 → R{State: Cancelled, Version: 3} + // T6 batch.IsRequestStateHalted(R) → false (stale in-memory copy from T1) + // T7 batch.BatchStore.Create(B{[R]}) → orphan batch containing a cancelled R + // + // After T7 the orphan batch flows through score → speculate → merge → conclude; + // conclude does NOT gate on the source request state when writing the terminal + // state, so it would CAS the request from Cancelled back to Landed, silently + // undoing the user's cancel. + // + // The CAS below collapses that window. Whichever of batch.UpdateState(..., + // RequestStateBatched) and cancel.markCancelling(... RequestStateCancelling) + // reaches storage first wins; the loser sees storage.ErrVersionMismatch: + // - If cancel won: this CAS fails. We ack the message (cancel will drive R + // to its terminal state on its own; no batch is needed). The reverse-index + // entry above becomes a dangling BatchDependent — tolerated per the + // "downstream should handle stale entries" contract on this store. + // - If batch won: cancel.markCancelling will fail with ErrVersionMismatch + // on its next attempt, re-fetch R, observe RequestStateBatched, and take + // the batch-cancellation branch (which terminates the whole batch). + // + // Note on re-delivery: a retry of a batch message that already CAS'd R to + // Batched but failed before/after BatchStore.Create lands in this code with + // R already in RequestStateBatched. The top-level IsRequestStateHalted check + // does NOT include Batched (Batched is forward-progress, not halted), so we + // reach here and re-CAS Batched → Batched (a version-only bump). The bump + // keeps the same serialization invariant on every attempt — if cancel sneaks + // in between our Get and this CAS, our version is stale and we abandon, just + // like the first-delivery case. The cost is an extra batch (the previous + // attempt may have already created one) which is tolerated per the comment + // on BatchStore.Create below. + // + // Residual window: a thin race remains between this CAS and BatchStore.Create. + // During that window cancel.findActiveBatch can still observe R in Batched + // with no batch yet persisted, and take the request-only cancel path — which + // then leaves R in Cancelled and the batch we are about to create orphaned. + // Fully closing this requires cancel-side wait/retry when its pre-CAS + // observation was RequestStateBatched; deferred to a follow-up since the + // window is narrow (one storage round-trip) and the user-visible outcome + // (request cancelled) is still correct — the orphan batch just gets + // reconciled by conclude as if it had no requests to act on. + newRequestVersion := request.Version + 1 + if err := c.store.GetRequestStore().UpdateState(ctx, request.ID, request.Version, newRequestVersion, entity.RequestStateBatched); err != nil { + // ErrVersionMismatch == cancel (or another writer) advanced R first. Ack + // the message: there is nothing for us to do, and retrying would not help + // since the new state of R is now visible to the cancel pipeline. + if errors.Is(err, storage.ErrVersionMismatch) { + c.metricsScope.Counter("request_claim_lost_race").Inc(1) + c.logger.Infow("abandoning batch creation; request advanced concurrently (likely cancel)", + "request_id", request.ID, + "request_version", request.Version, + "unused_batch_id", batch.ID, + ) + return nil + } + c.metricsScope.Counter("request_claim_errors").Inc(1) + return fmt.Errorf("failed to claim request %s for batch %s: %w", request.ID, batch.ID, err) + } + request.Version = newRequestVersion + request.State = entity.RequestStateBatched + // Persist batch to storage. // This is the final operation that concludes the batch creation process. If it fails, BatchDependents will be pointing to a batch id that does not exist. // We do not reuse batch ids, a retry of this operation will create a new batch with a new ID. The downstream logic that operates on BatchDependent should be able to handle stale entries. diff --git a/orchestrator/controller/batch/batch_test.go b/orchestrator/controller/batch/batch_test.go index 986349bb..ed90c0b6 100644 --- a/orchestrator/controller/batch/batch_test.go +++ b/orchestrator/controller/batch/batch_test.go @@ -16,6 +16,7 @@ package batch import ( "context" + "errors" "fmt" "sync/atomic" "testing" @@ -31,6 +32,7 @@ import ( conflictmock "github.com/uber/submitqueue/extension/conflict/mock" countermock "github.com/uber/submitqueue/extension/counter/mock" queuemock "github.com/uber/submitqueue/extension/queue/mock" + "github.com/uber/submitqueue/extension/storage" storagemock "github.com/uber/submitqueue/extension/storage/mock" "go.uber.org/mock/gomock" "go.uber.org/zap/zaptest" @@ -82,6 +84,7 @@ func newTestController(t *testing.T, ctrl *gomock.Controller, cnt *countermock.M mockReqStore := storagemock.NewMockRequestStore(ctrl) req := testRequest() mockReqStore.EXPECT().Get(gomock.Any(), req.ID).Return(req, nil).AnyTimes() + mockReqStore.EXPECT().UpdateState(gomock.Any(), req.ID, req.Version, req.Version+1, entity.RequestStateBatched).Return(nil).AnyTimes() mockBatchDependentStore := storagemock.NewMockBatchDependentStore(ctrl) mockBatchDependentStore.EXPECT().Create(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() @@ -232,6 +235,7 @@ func TestController_Process_WithDependencies(t *testing.T) { mockReqStore := storagemock.NewMockRequestStore(ctrl) mockReqStore.EXPECT().Get(gomock.Any(), request.ID).Return(request, nil) + mockReqStore.EXPECT().UpdateState(gomock.Any(), request.ID, request.Version, request.Version+1, entity.RequestStateBatched).Return(nil) mockStorage := storagemock.NewMockStorage(ctrl) mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() @@ -275,6 +279,7 @@ func TestController_Process_AnalyzerSelectsSubset(t *testing.T) { mockReqStore := storagemock.NewMockRequestStore(ctrl) mockReqStore.EXPECT().Get(gomock.Any(), request.ID).Return(request, nil) + mockReqStore.EXPECT().UpdateState(gomock.Any(), request.ID, request.Version, request.Version+1, entity.RequestStateBatched).Return(nil) mockStorage := storagemock.NewMockStorage(ctrl) mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() @@ -335,3 +340,193 @@ func TestController_InterfaceImplementation(t *testing.T) { var _ consumer.Controller = controller } + +// A request that is halted (terminal OR Cancelling) must be short-circuited +// before the batch controller queries the batch store, allocates a batch ID, +// CAS-claims the request, or publishes. We verify by configuring a batch +// store and counter with NO EXPECTs (gomock fails on any call), a request +// store that only expects the initial Get (no UpdateState), and a publisher +// that returns a sentinel error if invoked. +// +// Cancelling is non-terminal but must halt forward progress: the cancel +// controller has already recorded the cancellation intent on the request and +// owns the terminal write. Any new batch spawned here would be an orphan +// containing a request that is about to become Cancelled. +func TestController_Process_HaltedShortCircuit(t *testing.T) { + for _, state := range []entity.RequestState{ + entity.RequestStateCancelling, + entity.RequestStateCancelled, + entity.RequestStateLanded, + entity.RequestStateError, + } { + t.Run(string(state), func(t *testing.T) { + ctrl := gomock.NewController(t) + + request := testRequest() + request.State = state + request.Version = 7 + + // Batch store with no EXPECTs — must not be queried. + mockBatchStore := storagemock.NewMockBatchStore(ctrl) + mockReqStore := storagemock.NewMockRequestStore(ctrl) + mockReqStore.EXPECT().Get(gomock.Any(), request.ID).Return(request, nil) + // No UpdateState expected — gomock fails if called. + + mockStorage := storagemock.NewMockStorage(ctrl) + mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + mockStorage.EXPECT().GetRequestStore().Return(mockReqStore).AnyTimes() + + // Counter with no EXPECTs — must not be called. + cnt := countermock.NewMockCounter(ctrl) + + controller := newTestController(t, ctrl, cnt, mockStorage, nil, fmt.Errorf("should not publish")) + + msg := queue.NewMessage(request.ID, requestIDPayload(t, request.ID), request.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) + }) + } +} + +// Race-lost path: the cancel controller's markCancelling CAS landed first, +// so the batch controller's request-claim CAS (Validated → Batched) fails +// with storage.ErrVersionMismatch. The controller must ack the message (the +// cancel pipeline now owns the request) and must NOT call BatchStore.Create +// or publish to the score topic. +// +// This test exercises the race where the halted check at the top of Process +// passed against a stale in-memory copy from the initial Get (the cancel +// controller's CAS landed between our Get and our UpdateState). The CAS +// failure is the safety net that prevents an orphan batch in that window. +func TestController_Process_CASLostToCancel(t *testing.T) { + ctrl := gomock.NewController(t) + + request := testRequest() + + mockBatchStore := storagemock.NewMockBatchStore(ctrl) + mockBatchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "test-queue", gomock.Any()).Return(nil, nil) + // Create must NOT be called — gomock fails if it is. + + mockBatchDependentStore := storagemock.NewMockBatchDependentStore(ctrl) + // The reverse-index Create still runs because it precedes the CAS; this is + // tolerated per the "downstream handles stale entries" contract. + mockBatchDependentStore.EXPECT().Create(gomock.Any(), gomock.Any()).Return(nil) + + mockReqStore := storagemock.NewMockRequestStore(ctrl) + mockReqStore.EXPECT().Get(gomock.Any(), request.ID).Return(request, nil) + mockReqStore.EXPECT().UpdateState( + gomock.Any(), request.ID, request.Version, request.Version+1, entity.RequestStateBatched, + ).Return(fmt.Errorf("cas: %w", storage.ErrVersionMismatch)) + + mockStorage := storagemock.NewMockStorage(ctrl) + mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + mockStorage.EXPECT().GetBatchDependentStore().Return(mockBatchDependentStore).AnyTimes() + mockStorage.EXPECT().GetRequestStore().Return(mockReqStore).AnyTimes() + + // Publisher with no EXPECTs — must not be called. + mockPub := queuemock.NewMockPublisher(ctrl) + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{{Key: consumer.TopicKeyScore, Name: "score", Queue: mockQ}}, + ) + require.NoError(t, err) + + controller := NewController( + zaptest.NewLogger(t).Sugar(), tally.NoopScope, registry, newSequentialCounter(ctrl), + mockStorage, all.New(), consumer.TopicKeyBatch, "orchestrator-batch", + ) + + msg := queue.NewMessage(request.ID, requestIDPayload(t, request.ID), request.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) +} + +// Race-unexpected-error: any CAS failure other than ErrVersionMismatch (e.g. +// transient storage error) must surface as an error so the message is nacked +// for retry. We must NOT call BatchStore.Create on the way out. +func TestController_Process_CASUnexpectedErrorPropagates(t *testing.T) { + ctrl := gomock.NewController(t) + + request := testRequest() + + mockBatchStore := storagemock.NewMockBatchStore(ctrl) + mockBatchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "test-queue", gomock.Any()).Return(nil, nil) + // Create must NOT be called — gomock fails if it is. + + mockBatchDependentStore := storagemock.NewMockBatchDependentStore(ctrl) + mockBatchDependentStore.EXPECT().Create(gomock.Any(), gomock.Any()).Return(nil) + + casErr := fmt.Errorf("db connection lost") + mockReqStore := storagemock.NewMockRequestStore(ctrl) + mockReqStore.EXPECT().Get(gomock.Any(), request.ID).Return(request, nil) + mockReqStore.EXPECT().UpdateState( + gomock.Any(), request.ID, request.Version, request.Version+1, entity.RequestStateBatched, + ).Return(casErr) + + mockStorage := storagemock.NewMockStorage(ctrl) + mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + mockStorage.EXPECT().GetBatchDependentStore().Return(mockBatchDependentStore).AnyTimes() + mockStorage.EXPECT().GetRequestStore().Return(mockReqStore).AnyTimes() + + controller := newTestController(t, ctrl, newSequentialCounter(ctrl), mockStorage, nil, nil) + + msg := queue.NewMessage(request.ID, requestIDPayload(t, request.ID), request.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + err := controller.Process(context.Background(), delivery) + require.Error(t, err) + // Cause must be preserved for upstream classification. + assert.True(t, errors.Is(err, casErr)) +} + +// Recovery path: a re-delivered batch message whose prior attempt CAS'd the +// request to RequestStateBatched but failed before BatchStore.Create. The +// halted check at the top of Process does NOT include Batched (Batched is +// forward-progress, not halted), so we reach the CAS again and re-bump the +// version on the request (Batched → Batched, version+1). The batch is then +// re-created with a new batch ID, which is tolerated per the existing +// duplicate-handling comment on BatchStore.Create. +func TestController_Process_RecoveryAfterPriorCAS(t *testing.T) { + ctrl := gomock.NewController(t) + + request := testRequest() + request.State = entity.RequestStateBatched + request.Version = 2 // prior attempt bumped from 1 → 2 + + mockBatchStore := storagemock.NewMockBatchStore(ctrl) + mockBatchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "test-queue", gomock.Any()).Return(nil, nil) + mockBatchStore.EXPECT().Create(gomock.Any(), gomock.Any()).Return(nil) + + mockBatchDependentStore := storagemock.NewMockBatchDependentStore(ctrl) + mockBatchDependentStore.EXPECT().Create(gomock.Any(), gomock.Any()).Return(nil) + + mockReqStore := storagemock.NewMockRequestStore(ctrl) + mockReqStore.EXPECT().Get(gomock.Any(), request.ID).Return(request, nil) + mockReqStore.EXPECT().UpdateState( + gomock.Any(), request.ID, request.Version, request.Version+1, entity.RequestStateBatched, + ).Return(nil) + + mockStorage := storagemock.NewMockStorage(ctrl) + mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + mockStorage.EXPECT().GetBatchDependentStore().Return(mockBatchDependentStore).AnyTimes() + mockStorage.EXPECT().GetRequestStore().Return(mockReqStore).AnyTimes() + + controller := newTestController(t, ctrl, newSequentialCounter(ctrl), mockStorage, nil, nil) + + msg := queue.NewMessage(request.ID, requestIDPayload(t, request.ID), request.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) +} diff --git a/orchestrator/controller/build/build.go b/orchestrator/controller/build/build.go index de61374b..984cf691 100644 --- a/orchestrator/controller/build/build.go +++ b/orchestrator/controller/build/build.go @@ -100,6 +100,20 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r "partition_key", msg.PartitionKey, ) + // If the batch is halted (terminal OR cancelling), skip triggering CI and + // ack. This is a forward-progress controller: per the cancel design, the + // speculate controller owns cancelling any in-flight Build and driving the + // batch to its terminal state, so the build stage simply short-circuits + // while speculate does the work. No external CI is ever kicked off. + if entity.IsBatchStateHalted(batch.State) { + metrics.NamedCounter(c.metricsScope, opName, "skipped_halted", 1) + c.logger.Infow("skipping build for halted batch", + "batch_id", batch.ID, + "state", string(batch.State), + ) + return nil + } + // Assemble base (dependency batches in order) and head (this batch). base, err := c.collectChanges(ctx, batch.Dependencies) if err != nil { diff --git a/orchestrator/controller/build/build_test.go b/orchestrator/controller/build/build_test.go index a015907a..143efaed 100644 --- a/orchestrator/controller/build/build_test.go +++ b/orchestrator/controller/build/build_test.go @@ -343,3 +343,42 @@ func TestController_InterfaceImplementation(t *testing.T) { var _ consumer.Controller = controller } + +// A batch in any halted state (terminal OR cancelling) must short-circuit: +// the build controller acks without triggering an external CI run and without +// publishing anything. Per the cancel design the speculate controller owns +// cancelling in-flight builds and driving the batch terminal, so the build +// stage simply does no work. Cancelling is included because the cancel +// controller is mid-flight; both halted branches reach the same observable +// behaviour (no build performed). +func TestController_Process_HaltedShortCircuit(t *testing.T) { + for _, state := range []entity.BatchState{ + entity.BatchStateCancelled, + entity.BatchStateCancelling, + entity.BatchStateSucceeded, + entity.BatchStateFailed, + } { + t.Run(string(state), func(t *testing.T) { + ctrl := gomock.NewController(t) + + batch := testBatch() + batch.State = state + store := newMockStorage(ctrl, batch) + + // No Trigger expectation: a stray CI trigger on a halted batch + // fails the test. + br := buildrunnermock.NewMockBuildRunner(ctrl) + + // Sentinel publish error: the halted path must not publish. If it + // does, Process surfaces this error and require.NoError catches it. + controller := newTestController(t, ctrl, store, br, fmt.Errorf("should not publish")) + + msg := queue.NewMessage(batch.ID, batchIDPayload(t, batch.ID), batch.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) + }) + } +} diff --git a/orchestrator/controller/buildsignal/buildsignal.go b/orchestrator/controller/buildsignal/buildsignal.go index 05ac22de..717be5be 100644 --- a/orchestrator/controller/buildsignal/buildsignal.go +++ b/orchestrator/controller/buildsignal/buildsignal.go @@ -138,6 +138,25 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r return fmt.Errorf("failed to get status for build %s: %w", buildID.ID, err) } + // Short-circuit if the batch is already halted (terminal OR cancelling). + // Speculate is already idempotent on terminal, but skipping the publish + // avoids noise. For Cancelling batches the cancel controller owns the + // terminal write and the downstream fan-out, so further pipeline work + // would race against it; silent ack is the only safe action. + batch, err := c.store.GetBatchStore().Get(ctx, build.BatchID) + if err != nil { + metrics.NamedCounter(c.metricsScope, opName, "storage_errors", 1) + return fmt.Errorf("failed to get batch %s: %w", build.BatchID, err) + } + if entity.IsBatchStateHalted(batch.State) { + metrics.NamedCounter(c.metricsScope, opName, "skipped_halted", 1) + c.logger.Infow("skipping buildsignal publish for halted batch", + "batch_id", batch.ID, + "state", string(batch.State), + ) + return nil + } + build.Status = status if err := c.store.GetBuildStore().UpdateStatus(ctx, build.ID, status); err != nil { diff --git a/orchestrator/controller/buildsignal/buildsignal_test.go b/orchestrator/controller/buildsignal/buildsignal_test.go index 08498120..83fd9841 100644 --- a/orchestrator/controller/buildsignal/buildsignal_test.go +++ b/orchestrator/controller/buildsignal/buildsignal_test.go @@ -40,6 +40,7 @@ type testHarness struct { controller *Controller br *buildrunnermock.MockBuildRunner buildStore *storagemock.MockBuildStore + batchStore *storagemock.MockBatchStore signalPub *queuemock.MockPublisher speculatePub *queuemock.MockPublisher } @@ -62,8 +63,10 @@ func newTestHarness(t *testing.T, ctrl *gomock.Controller) *testHarness { require.NoError(t, err) buildStore := storagemock.NewMockBuildStore(ctrl) + batchStore := storagemock.NewMockBatchStore(ctrl) store := storagemock.NewMockStorage(ctrl) store.EXPECT().GetBuildStore().Return(buildStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() c := NewController( zaptest.NewLogger(t).Sugar(), @@ -78,6 +81,7 @@ func newTestHarness(t *testing.T, ctrl *gomock.Controller) *testHarness { controller: c, br: br, buildStore: buildStore, + batchStore: batchStore, signalPub: signalPub, speculatePub: speculatePub, } @@ -130,6 +134,7 @@ func TestController_Process_Terminal(t *testing.T) { h.buildStore.EXPECT().Get(gomock.Any(), build.ID).Return(build, nil) h.br.EXPECT().Status(gomock.Any(), entity.BuildID{ID: build.ID}).Return(tt.status, entity.BuildMetadata{}, nil) + h.batchStore.EXPECT().Get(gomock.Any(), build.BatchID).Return(entity.Batch{ID: build.BatchID, State: entity.BatchStateSpeculating}, nil) h.buildStore.EXPECT().UpdateStatus(gomock.Any(), build.ID, tt.status).Return(nil) h.speculatePub.EXPECT(). Publish(gomock.Any(), "speculate", gomock.AssignableToTypeOf(entityqueue.Message{})). @@ -169,6 +174,7 @@ func TestController_Process_NonTerminal(t *testing.T) { h.buildStore.EXPECT().Get(gomock.Any(), build.ID).Return(build, nil) h.br.EXPECT().Status(gomock.Any(), entity.BuildID{ID: build.ID}).Return(tt.status, entity.BuildMetadata{}, nil) + h.batchStore.EXPECT().Get(gomock.Any(), build.BatchID).Return(entity.Batch{ID: build.BatchID, State: entity.BatchStateSpeculating}, nil) h.buildStore.EXPECT().UpdateStatus(gomock.Any(), build.ID, tt.status).Return(nil) h.speculatePub.EXPECT().Publish(gomock.Any(), "speculate", gomock.Any()).Return(nil).Times(1) h.signalPub.EXPECT(). @@ -211,6 +217,7 @@ func TestController_Process_UpdateStatusError(t *testing.T) { h.buildStore.EXPECT().Get(gomock.Any(), build.ID).Return(build, nil) h.br.EXPECT().Status(gomock.Any(), entity.BuildID{ID: build.ID}).Return(entity.BuildStatusRunning, nil, nil) + h.batchStore.EXPECT().Get(gomock.Any(), build.BatchID).Return(entity.Batch{ID: build.BatchID, State: entity.BatchStateSpeculating}, nil) h.buildStore.EXPECT().UpdateStatus(gomock.Any(), build.ID, entity.BuildStatusRunning). Return(errors.New("db unreachable")) // No Publish / PublishAfter expected after the store failure. @@ -233,6 +240,7 @@ func TestController_Process_RepublishError(t *testing.T) { h.buildStore.EXPECT().Get(gomock.Any(), build.ID).Return(build, nil) h.br.EXPECT().Status(gomock.Any(), entity.BuildID{ID: build.ID}).Return(entity.BuildStatusRunning, entity.BuildMetadata{}, nil) + h.batchStore.EXPECT().Get(gomock.Any(), build.BatchID).Return(entity.Batch{ID: build.BatchID, State: entity.BatchStateSpeculating}, nil) h.buildStore.EXPECT().UpdateStatus(gomock.Any(), build.ID, entity.BuildStatusRunning).Return(nil) h.speculatePub.EXPECT().Publish(gomock.Any(), "speculate", gomock.Any()).Return(nil).Times(1) h.signalPub.EXPECT(). @@ -273,3 +281,34 @@ func TestController_Process_MalformedPayload(t *testing.T) { err := h.controller.Process(context.Background(), d) require.Error(t, err) } + +// A halted batch (terminal OR cancelling) must short-circuit: just ack, no +// status persist and no publish to speculate. For terminal: speculate is +// already idempotent on terminal, but skipping the publish keeps the system +// from re-emitting noise. For Cancelling: the cancel controller owns the +// terminal write and downstream fan-out, so any further pipeline work would +// race against it. +func TestController_Process_HaltedShortCircuit(t *testing.T) { + for _, state := range []entity.BatchState{ + entity.BatchStateCancelled, + entity.BatchStateCancelling, + entity.BatchStateSucceeded, + entity.BatchStateFailed, + } { + t.Run(string(state), func(t *testing.T) { + ctrl := gomock.NewController(t) + h := newTestHarness(t, ctrl) + + build := entity.Build{ID: "b-halt", BatchID: "batch-halt", Status: entity.BuildStatusAccepted} + + h.buildStore.EXPECT().Get(gomock.Any(), build.ID).Return(build, nil) + h.br.EXPECT().Status(gomock.Any(), entity.BuildID{ID: build.ID}).Return(entity.BuildStatusRunning, entity.BuildMetadata{}, nil) + h.batchStore.EXPECT().Get(gomock.Any(), build.BatchID).Return(entity.Batch{ID: build.BatchID, State: state}, nil) + // Halted: no UpdateStatus, no speculate Publish, no buildsignal + // PublishAfter. The harness publishers have no expectations, so any + // publish fails the test. + + require.NoError(t, h.controller.Process(context.Background(), buildDelivery(t, ctrl, build))) + }) + } +} diff --git a/orchestrator/controller/cancel/BUILD.bazel b/orchestrator/controller/cancel/BUILD.bazel new file mode 100644 index 00000000..24ade4dc --- /dev/null +++ b/orchestrator/controller/cancel/BUILD.bazel @@ -0,0 +1,36 @@ +load("@rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "cancel", + srcs = ["cancel.go"], + importpath = "github.com/uber/submitqueue/orchestrator/controller/cancel", + visibility = ["//visibility:public"], + deps = [ + "//core/consumer", + "//core/request", + "//entity", + "//entity/queue", + "//extension/storage", + "@com_github_uber_go_tally_v4//:tally", + "@org_uber_go_zap//:zap", + ], +) + +go_test( + name = "cancel_test", + srcs = ["cancel_test.go"], + embed = [":cancel"], + deps = [ + "//core/consumer", + "//entity", + "//entity/queue", + "//extension/queue/mock", + "//extension/storage", + "//extension/storage/mock", + "@com_github_stretchr_testify//assert", + "@com_github_stretchr_testify//require", + "@com_github_uber_go_tally_v4//:tally", + "@org_uber_go_mock//gomock", + "@org_uber_go_zap//zaptest", + ], +) diff --git a/orchestrator/controller/cancel/cancel.go b/orchestrator/controller/cancel/cancel.go new file mode 100644 index 00000000..431939ef --- /dev/null +++ b/orchestrator/controller/cancel/cancel.go @@ -0,0 +1,337 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cancel implements the orchestrator-side cancel controller. +// +// The controller consumes CancelRequest messages from the cancel topic and +// records the user's cancellation intent. Two distinct paths exist after the +// request-level intent (RequestStateCancelling) is recorded: +// +// - The request has not yet been enrolled into a batch — the controller +// transitions Cancelling → RequestStateCancelled directly and emits a +// RequestStatusCancelled log entry. This path is fully owned by the cancel +// controller. +// +// - The request is already part of an active batch — the controller performs +// a single intent CAS on the batch (advancing it to BatchStateCancelling) +// and hands off to the speculate controller by publishing the batch ID to +// TopicKeySpeculate. The speculate controller then owns: cancelling any +// in-flight Build entity for the batch, fanning out to dependents, the +// terminal CAS to BatchStateCancelled, and publishing to conclude. Cancel +// does no terminal write and no downstream fan-out on the batch path. +// +// The split exists so that the terminal write and the work that must precede +// it (cancelling builds, respeculating dependents) live in the same controller +// — speculate is the single writer of every non-Cancelling batch state and is +// already wired with the build/dependent stores. Forward-progress controllers +// (score, build, buildsignal, merge) observe BatchStateCancelling via +// IsBatchStateHalted and short-circuit while speculate drives the batch to +// its terminal state. +// +// The controller is idempotent: re-delivery of the same CancelRequest after +// the terminal request transition is a no-op; re-delivery after the +// Cancelling write skips the mark-cancelling step and proceeds straight to +// the batch lookup. On the batch path, re-delivery against an already +// Cancelling batch re-publishes to TopicKeySpeculate (a cheap no-op nudge +// the speculate controller absorbs). +// +// Concurrent producers surface as storage.ErrVersionMismatch; the controller +// returns the wrapped error as-is and relies on the base controller layer to +// classify it as retryable so the next attempt sees the new state and takes +// the other branch. storage.ErrNotFound on the initial Get (the start +// controller has not yet persisted the request) is returned as-is for the +// same reason. +package cancel + +import ( + "context" + "fmt" + + "github.com/uber-go/tally/v4" + "github.com/uber/submitqueue/core/consumer" + corerequest "github.com/uber/submitqueue/core/request" + "github.com/uber/submitqueue/entity" + entityqueue "github.com/uber/submitqueue/entity/queue" + "github.com/uber/submitqueue/extension/storage" + "go.uber.org/zap" +) + +// Controller handles cancel queue messages. Implements consumer.Controller. +type Controller struct { + logger *zap.SugaredLogger + metricsScope tally.Scope + store storage.Storage + registry consumer.TopicRegistry + topicKey consumer.TopicKey + consumerGroup string +} + +// Verify Controller implements consumer.Controller interface at compile time. +var _ consumer.Controller = (*Controller)(nil) + +// NewController creates a new cancel controller for the orchestrator. +func NewController( + logger *zap.SugaredLogger, + scope tally.Scope, + store storage.Storage, + registry consumer.TopicRegistry, + topicKey consumer.TopicKey, + consumerGroup string, +) *Controller { + return &Controller{ + logger: logger.Named("cancel_controller"), + metricsScope: scope.SubScope("cancel_controller"), + store: store, + registry: registry, + topicKey: topicKey, + consumerGroup: consumerGroup, + } +} + +// Process processes a cancel delivery from the queue. +// Returns nil to ack (success), or error to nack (retry). +func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) error { + c.metricsScope.Counter("received").Inc(1) + + msg := delivery.Message() + + cancelReq, err := entity.CancelRequestFromBytes(msg.Payload) + if err != nil { + c.metricsScope.Counter("deserialize_errors").Inc(1) + return fmt.Errorf("failed to deserialize cancel request: %w", err) + } + + request, err := c.store.GetRequestStore().Get(ctx, cancelReq.ID) + if err != nil { + c.metricsScope.Counter("storage_errors").Inc(1) + return fmt.Errorf("failed to get request %s: %w", cancelReq.ID, err) + } + + c.logger.Infow("received cancel event", + "request_id", request.ID, + "queue", request.Queue, + "state", string(request.State), + "version", request.Version, + "reason", cancelReq.Reason, + "attempt", delivery.Attempt(), + "partition_key", msg.PartitionKey, + ) + + if entity.IsRequestStateTerminal(request.State) { + c.metricsScope.Counter("already_terminal").Inc(1) + return nil + } + + // Step 1: record the cancellation intent on the request itself by transitioning + // to RequestStateCancelling. This is non-terminal; forward-progress controllers + // (validate, batch) treat it as halted, but conclude may still write a different + // terminal state if a concurrent merge or failure wins the race. + request, err = c.markCancelling(ctx, request) + if err != nil { + return err + } + + // Look for an active batch that already contains this request. + batch, found, err := c.findActiveBatch(ctx, request) + if err != nil { + return err + } + + if !found { + return c.cancelRequest(ctx, request, cancelReq.Reason) + } + return c.cancelBatch(ctx, batch) +} + +// markCancelling transitions the request to RequestStateCancelling (intent) if it +// isn't already in that state. Returns the updated request (with the post-CAS +// version and state) on success, or the original request on the idempotent path +// where the prior delivery already wrote Cancelling. +// +// storage.ErrVersionMismatch (a concurrent writer — most likely conclude +// observing a batch transition) is returned as-is for the base controller to +// classify and retry; the next attempt re-fetches and re-evaluates (it may now +// be terminal, in which case the top-level terminal-check acks). +func (c *Controller) markCancelling(ctx context.Context, request entity.Request) (entity.Request, error) { + if request.State == entity.RequestStateCancelling { + // Idempotent re-delivery: prior pass already recorded intent. + c.metricsScope.Counter("already_cancelling").Inc(1) + return request, nil + } + newVersion := request.Version + 1 + if err := c.store.GetRequestStore().UpdateState(ctx, request.ID, request.Version, newVersion, entity.RequestStateCancelling); err != nil { + c.metricsScope.Counter("request_update_errors").Inc(1) + return entity.Request{}, fmt.Errorf("failed to mark request %s as cancelling: %w", request.ID, err) + } + request.Version = newVersion + request.State = entity.RequestStateCancelling + c.metricsScope.Counter("request_cancelling").Inc(1) + return request, nil +} + +// findActiveBatch scans all active batches in the request's queue for one whose +// Contains list includes the request. Returns (batch, true, nil) on a hit, +// (zero, false, nil) when the request is not yet batched, and any storage +// error otherwise. +// +// BatchStateCancelling is included in the active-state list so an idempotent +// redelivery of the cancel message (the prior pass wrote the intent but the +// speculate hand-off publish failed) still resolves the batch and re-attempts +// the publish. +func (c *Controller) findActiveBatch(ctx context.Context, request entity.Request) (entity.Batch, bool, error) { + // TODO: Scans all the batches in flight - make it more efficient? + active, err := c.store.GetBatchStore().GetByQueueAndStates(ctx, request.Queue, []entity.BatchState{ + entity.BatchStateCreated, + entity.BatchStateScored, + entity.BatchStateSpeculating, + entity.BatchStateMerging, + entity.BatchStateCancelling, + }) + if err != nil { + c.metricsScope.Counter("batch_store_errors").Inc(1) + return entity.Batch{}, false, fmt.Errorf("failed to get active batches for queue=%s: %w", request.Queue, err) + } + + for _, b := range active { + for _, rid := range b.Contains { + if rid == request.ID { + return b, true, nil + } + } + } + return entity.Batch{}, false, nil +} + +// cancelRequest performs the terminal CAS (Cancelling → Cancelled) for a request +// that is not part of any active batch, and emits the RequestStatusCancelled log +// entry. storage.ErrVersionMismatch here means a concurrent writer (typically +// conclude after a racing batch terminal transition) advanced the request between +// our mark-cancelling CAS and this terminal CAS — returned as-is for the base +// controller to classify and retry; the next pass will observe the new state +// (likely terminal) and ack via the top-level terminal-check. +func (c *Controller) cancelRequest(ctx context.Context, request entity.Request, reason string) error { + newVersion := request.Version + 1 + if err := c.store.GetRequestStore().UpdateState(ctx, request.ID, request.Version, newVersion, entity.RequestStateCancelled); err != nil { + c.metricsScope.Counter("request_update_errors").Inc(1) + return fmt.Errorf("failed to cancel request %s: %w", request.ID, err) + } + + metadata := map[string]string{} + if reason != "" { + metadata["reason"] = reason + } + logEntry := entity.NewRequestLog(request.ID, entity.RequestStatusCancelled, newVersion, "", metadata) + if err := corerequest.PublishLog(ctx, c.registry, logEntry, request.ID); err != nil { + c.metricsScope.Counter("log_publish_errors").Inc(1) + return fmt.Errorf("failed to publish cancel log for request %s: %w", request.ID, err) + } + + c.logger.Infow("request cancelled (not batched)", + "request_id", request.ID, + "queue", request.Queue, + ) + c.metricsScope.Counter("request_cancelled").Inc(1) + return nil +} + +// cancelBatch records the cancellation intent on the batch and hands off to +// the speculate controller. It performs the intent CAS (Cancelling) if not +// already in that state, then publishes the batch ID to TopicKeySpeculate. +// +// The speculate controller owns everything from there: cancelling any +// in-flight Build entity for the batch, fanning out to dependents, the +// terminal CAS to BatchStateCancelled, and publishing to conclude. Cancel +// does not perform a terminal write and does not emit per-request logs on +// the batch path (the gateway already wrote RequestStatusCancelling with +// the reason at intent time; conclude writes the terminal log entries when +// it reconciles request state). +// +// On idempotent redelivery the batch may already be in BatchStateCancelling +// (a prior pass wrote the intent but the publish failed). In that case the +// intent CAS is skipped and we just re-publish — speculate absorbs the +// duplicate as a cheap no-op nudge. +func (c *Controller) cancelBatch(ctx context.Context, batch entity.Batch) error { + c.logger.Infow("handing batch cancellation off to speculate", + "batch_id", batch.ID, + "queue", batch.Queue, + "batch_state", string(batch.State), + ) + + if batch.State != entity.BatchStateCancelling { + newVersion := batch.Version + 1 + if err := c.store.GetBatchStore().UpdateState(ctx, batch.ID, batch.Version, newVersion, entity.BatchStateCancelling); err != nil { + c.metricsScope.Counter("batch_update_errors").Inc(1) + // storage.ErrVersionMismatch here means the batch advanced concurrently + // (e.g. speculate / merge progressed). Returned as-is for the base + // controller to classify and retry; the re-fetch will see the new state + // and either short-circuit (already terminal) or attempt the transition + // again. + return fmt.Errorf("failed to mark batch %s as cancelling: %w", batch.ID, err) + } + batch.Version = newVersion + batch.State = entity.BatchStateCancelling + c.metricsScope.Counter("batch_cancelling").Inc(1) + } else { + c.metricsScope.Counter("batch_already_cancelling").Inc(1) + } + + if err := c.publishBatchID(ctx, consumer.TopicKeySpeculate, batch.ID, batch.Queue); err != nil { + c.metricsScope.Counter("publish_errors").Inc(1) + return fmt.Errorf("failed to hand off cancelled batch %s to speculate: %w", batch.ID, err) + } + + c.metricsScope.Counter("batch_handed_off").Inc(1) + return nil +} + +// publishBatchID publishes a BatchID-payload message to the specified topic key. +func (c *Controller) publishBatchID(ctx context.Context, key consumer.TopicKey, batchID string, partitionKey string) error { + bid := entity.BatchID{ID: batchID} + payload, err := bid.ToBytes() + if err != nil { + return fmt.Errorf("failed to serialize batch ID: %w", err) + } + + msg := entityqueue.NewMessage(batchID, payload, partitionKey, nil) + + q, ok := c.registry.Queue(key) + if !ok { + return fmt.Errorf("no queue registered for topic key %s", key) + } + + topicName, ok := c.registry.TopicName(key) + if !ok { + return fmt.Errorf("no topic name registered for topic key %s", key) + } + + if err := q.Publisher().Publish(ctx, topicName, msg); err != nil { + return fmt.Errorf("failed to publish message: %w", err) + } + return nil +} + +// Name returns the controller name for logging and metrics. +func (c *Controller) Name() string { + return "cancel" +} + +// TopicKey returns the topic key this controller subscribes to. +func (c *Controller) TopicKey() consumer.TopicKey { + return c.topicKey +} + +// ConsumerGroup returns the consumer group for offset tracking. +func (c *Controller) ConsumerGroup() string { + return c.consumerGroup +} diff --git a/orchestrator/controller/cancel/cancel_test.go b/orchestrator/controller/cancel/cancel_test.go new file mode 100644 index 00000000..7b280954 --- /dev/null +++ b/orchestrator/controller/cancel/cancel_test.go @@ -0,0 +1,395 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cancel + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/uber-go/tally/v4" + "github.com/uber/submitqueue/core/consumer" + "github.com/uber/submitqueue/entity" + "github.com/uber/submitqueue/entity/queue" + queuemock "github.com/uber/submitqueue/extension/queue/mock" + "github.com/uber/submitqueue/extension/storage" + storagemock "github.com/uber/submitqueue/extension/storage/mock" + "go.uber.org/mock/gomock" + "go.uber.org/zap/zaptest" +) + +// cancelPayload serializes a CancelRequest to JSON bytes for test message payloads. +func cancelPayload(t *testing.T, id, reason string) []byte { + payload, err := entity.CancelRequest{ID: id, Reason: reason}.ToBytes() + require.NoError(t, err) + return payload +} + +// newRegistry builds a registry that registers a no-op publisher for every +// topic the cancel controller may publish to (speculate on the batch path, +// log on the unbatched-request path). Returns the registry plus the +// publisher mock so callers may attach EXPECTs or read captured messages. +func newRegistry(t *testing.T, ctrl *gomock.Controller) (consumer.TopicRegistry, *queuemock.MockPublisher) { + t.Helper() + pub := queuemock.NewMockPublisher(ctrl) + q := queuemock.NewMockQueue(ctrl) + q.EXPECT().Publisher().Return(pub).AnyTimes() + + reg, err := consumer.NewTopicRegistry([]consumer.TopicConfig{ + {Key: consumer.TopicKeyCancel, Name: "cancel", Queue: q}, + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: q}, + {Key: consumer.TopicKeyLog, Name: "log", Queue: q}, + }) + require.NoError(t, err) + return reg, pub +} + +func newController(t *testing.T, store storage.Storage, registry consumer.TopicRegistry) *Controller { + return NewController(zaptest.NewLogger(t).Sugar(), tally.NoopScope, store, registry, consumer.TopicKeyCancel, "orchestrator-cancel") +} + +func newDelivery(t *testing.T, ctrl *gomock.Controller, payload []byte, partitionKey string) consumer.Delivery { + msg := queue.NewMessage("cancel-msg", payload, partitionKey, nil) + d := queuemock.NewMockDelivery(ctrl) + d.EXPECT().Message().Return(msg).AnyTimes() + d.EXPECT().Attempt().Return(1).AnyTimes() + return d +} + +func TestNewController(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + pub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + + store := storagemock.NewMockStorage(ctrl) + controller := newController(t, store, registry) + + require.NotNil(t, controller) + assert.Equal(t, consumer.TopicKeyCancel, controller.TopicKey()) + assert.Equal(t, "orchestrator-cancel", controller.ConsumerGroup()) + assert.Equal(t, "cancel", controller.Name()) + + var _ consumer.Controller = controller +} + +func TestProcess_AlreadyTerminal_NoOp(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + // No Publish expectations: idempotent re-delivery does nothing. + _ = pub + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{ + ID: "q/1", Queue: "q", State: entity.RequestStateCancelled, Version: 5, + }, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.NoError(t, err) +} + +func TestProcess_RequestNotFound_Retryable(t *testing.T) { + ctrl := gomock.NewController(t) + registry, _ := newRegistry(t, ctrl) + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{}, storage.ErrNotFound) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.Error(t, err) + assert.ErrorIs(t, err, storage.ErrNotFound) +} + +func TestProcess_CancelsUnbatchedRequest(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + + var publishedTopics []string + pub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, topic string, _ queue.Message) error { + publishedTopics = append(publishedTopics, topic) + return nil + }).AnyTimes() + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{ + ID: "q/1", Queue: "q", State: entity.RequestStateStarted, Version: 2, + }, nil) + // Two-step transition: first mark Cancelling, then Cancelled. + gomock.InOrder( + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(2), int32(3), entity.RequestStateCancelling).Return(nil), + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(3), int32(4), entity.RequestStateCancelled).Return(nil), + ) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "q", gomock.Any()).Return(nil, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", "user changed mind"), "q/1")) + require.NoError(t, err) + + // Only the request log entry should have been published. + assert.Equal(t, []string{"log"}, publishedTopics) +} + +// TestProcess_AlreadyCancelling_SkipsMarkCancelling exercises the idempotent +// re-delivery path for the unbatched-request flow: the prior pass already +// wrote RequestStateCancelling, so the mark-cancelling CAS must be skipped +// and we proceed straight to the batch lookup + terminal CAS. +func TestProcess_AlreadyCancelling_SkipsMarkCancelling(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + pub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{ + ID: "q/1", Queue: "q", State: entity.RequestStateCancelling, Version: 3, + }, nil) + // Only the terminal CAS — the mark-cancelling step is a no-op. + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(3), int32(4), entity.RequestStateCancelled).Return(nil) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "q", gomock.Any()).Return(nil, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.NoError(t, err) +} + +// TestProcess_MarkCancellingVersionMismatch_Retryable covers the case where the +// first CAS (mark-cancelling) loses to a concurrent writer. The underlying +// storage.ErrVersionMismatch must be preserved in the error chain so the base +// controller can classify it as retryable; the next pass re-fetches and +// re-evaluates (possibly observing a terminal state and acking). +func TestProcess_MarkCancellingVersionMismatch_Retryable(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + _ = pub + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{ + ID: "q/1", Queue: "q", State: entity.RequestStateStarted, Version: 2, + }, nil) + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(2), int32(3), entity.RequestStateCancelling). + Return(storage.ErrVersionMismatch) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.Error(t, err) + assert.ErrorIs(t, err, storage.ErrVersionMismatch) +} + +func TestProcess_UnbatchedVersionMismatch_Retryable(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + _ = pub + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{ + ID: "q/1", Queue: "q", State: entity.RequestStateStarted, Version: 2, + }, nil) + gomock.InOrder( + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(2), int32(3), entity.RequestStateCancelling).Return(nil), + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(3), int32(4), entity.RequestStateCancelled). + Return(storage.ErrVersionMismatch), + ) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "q", gomock.Any()).Return(nil, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.Error(t, err) + assert.ErrorIs(t, err, storage.ErrVersionMismatch) +} + +// TestProcess_BatchPath_HandsOffToSpeculate asserts the entire batch path: +// the request intent CAS runs, the batch intent CAS to Cancelling runs, and +// exactly one publish lands on the speculate topic with the batch ID as the +// message ID. The controller does NOT perform a terminal batch CAS, does +// NOT publish to conclude, and does NOT emit a per-request log on this path +// (the gateway already wrote the Cancelling intent log; conclude writes the +// terminal log when it reconciles request state). +func TestProcess_BatchPath_HandsOffToSpeculate(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + + type pubRec struct { + topic string + msgID string + } + var records []pubRec + pub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, topic string, msg queue.Message) error { + records = append(records, pubRec{topic: topic, msgID: msg.ID}) + return nil + }).AnyTimes() + + req := entity.Request{ID: "q/1", Queue: "q", State: entity.RequestStateStarted, Version: 2} + batch := entity.Batch{ + ID: "q/batch/1", + Queue: "q", + Contains: []string{"q/1", "q/2"}, + State: entity.BatchStateSpeculating, + Version: 3, + } + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(req, nil) + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(2), int32(3), entity.RequestStateCancelling).Return(nil) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "q", gomock.Any()).Return([]entity.Batch{batch}, nil) + // Single batch CAS: intent only. No terminal CAS. + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(3), int32(4), entity.BatchStateCancelling).Return(nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + // BatchDependentStore and BuildStore must NOT be touched — speculate owns those now. + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", "stop"), "q/1")) + require.NoError(t, err) + + assert.Equal(t, []pubRec{{topic: "speculate", msgID: "q/batch/1"}}, records) +} + +// TestProcess_BatchAlreadyCancelling_RepublishesToSpeculate covers the +// idempotent redelivery path on the batch flow: a prior pass wrote the +// intent CAS but the speculate publish failed. The cancel controller must +// observe the active (Cancelling) batch via findActiveBatch, skip the +// intent CAS, and re-publish the batch ID to TopicKeySpeculate so the +// speculate controller can pick the work up. +func TestProcess_BatchAlreadyCancelling_RepublishesToSpeculate(t *testing.T) { + ctrl := gomock.NewController(t) + registry, pub := newRegistry(t, ctrl) + + var publishedTopics []string + pub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, topic string, _ queue.Message) error { + publishedTopics = append(publishedTopics, topic) + return nil + }).AnyTimes() + + // Request is already in RequestStateCancelling from the prior pass; the + // batch is in BatchStateCancelling from the same prior pass. + req := entity.Request{ID: "q/1", Queue: "q", State: entity.RequestStateCancelling, Version: 3} + batch := entity.Batch{ + ID: "q/batch/1", + Queue: "q", + Contains: []string{"q/1"}, + State: entity.BatchStateCancelling, + Version: 4, + } + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(req, nil) + // No request UpdateState — already in Cancelling. + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "q", gomock.Any()).Return([]entity.Batch{batch}, nil) + // No batch UpdateState — already in Cancelling. + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.NoError(t, err) + + assert.Equal(t, []string{"speculate"}, publishedTopics) +} + +// TestProcess_BatchIntentVersionMismatch_Retryable covers the case where the +// intent CAS (mark batch Cancelling) loses to a concurrent batch state +// transition (e.g. speculate just advanced it). storage.ErrVersionMismatch +// must be preserved so the base controller can classify the failure as +// retryable. +func TestProcess_BatchIntentVersionMismatch_Retryable(t *testing.T) { + ctrl := gomock.NewController(t) + registry, _ := newRegistry(t, ctrl) + + req := entity.Request{ID: "q/1", Queue: "q", State: entity.RequestStateStarted, Version: 2} + batch := entity.Batch{ID: "q/batch/1", Queue: "q", Contains: []string{"q/1"}, State: entity.BatchStateCreated, Version: 1} + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(req, nil) + reqStore.EXPECT().UpdateState(gomock.Any(), "q/1", int32(2), int32(3), entity.RequestStateCancelling).Return(nil) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().GetByQueueAndStates(gomock.Any(), "q", gomock.Any()).Return([]entity.Batch{batch}, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateCancelling). + Return(storage.ErrVersionMismatch) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.Error(t, err) + assert.ErrorIs(t, err, storage.ErrVersionMismatch) +} + +func TestProcess_DeserializeError(t *testing.T) { + ctrl := gomock.NewController(t) + registry, _ := newRegistry(t, ctrl) + + store := storagemock.NewMockStorage(ctrl) + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, []byte("not json"), "q/1")) + require.Error(t, err) +} + +func TestProcess_RequestStoreError(t *testing.T) { + ctrl := gomock.NewController(t) + registry, _ := newRegistry(t, ctrl) + + reqStore := storagemock.NewMockRequestStore(ctrl) + reqStore.EXPECT().Get(gomock.Any(), "q/1").Return(entity.Request{}, fmt.Errorf("db down")) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetRequestStore().Return(reqStore).AnyTimes() + + controller := newController(t, store, registry) + err := controller.Process(context.Background(), newDelivery(t, ctrl, cancelPayload(t, "q/1", ""), "q/1")) + require.Error(t, err) +} diff --git a/orchestrator/controller/conclude/conclude.go b/orchestrator/controller/conclude/conclude.go index a955d472..3c37bf87 100644 --- a/orchestrator/controller/conclude/conclude.go +++ b/orchestrator/controller/conclude/conclude.go @@ -92,11 +92,10 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r "partition_key", msg.PartitionKey, ) - // TODO: Handle cancellation - // Map batch terminal state to request state. - // We expect the batch to be in a terminal state - // as updated by the merge controller. + // We expect the batch to be in a terminal state as written by the merge + // controller (Succeeded) or the speculate controller (Failed via + // failOnDependency, Cancelled via cancelBatch). requestState, err := batchStateToRequestState(batch.State) if err != nil { metrics.NamedCounter(c.metricsScope, "process", "unexpected_state_errors", 1) @@ -150,6 +149,8 @@ func batchStateToRequestState(state entity.BatchState) (entity.RequestState, err return entity.RequestStateLanded, nil case entity.BatchStateFailed: return entity.RequestStateError, nil + case entity.BatchStateCancelled: + return entity.RequestStateCancelled, nil default: return entity.RequestStateUnknown, fmt.Errorf("non-terminal batch state: %s", state) } diff --git a/orchestrator/controller/conclude/conclude_test.go b/orchestrator/controller/conclude/conclude_test.go index 45596ad7..ed13a9b7 100644 --- a/orchestrator/controller/conclude/conclude_test.go +++ b/orchestrator/controller/conclude/conclude_test.go @@ -144,7 +144,7 @@ func TestController_Process(t *testing.T) { }, }, { - name: "cancelled batch returns error", + name: "cancelled batch cancels requests", batch: entity.Batch{ ID: "test-queue/batch/3", Queue: "test-queue", @@ -162,12 +162,17 @@ func TestController_Process(t *testing.T) { Version: 2, }, nil) + mockRequestStore := storagemock.NewMockRequestStore(ctrl) + mockRequestStore.EXPECT().Get(gomock.Any(), "test-queue/10").Return(entity.Request{ + ID: "test-queue/10", Version: 4, State: entity.RequestStateProcessing, + }, nil) + mockRequestStore.EXPECT().UpdateState(gomock.Any(), "test-queue/10", int32(4), int32(5), entity.RequestStateCancelled).Return(nil) + mockStorage := storagemock.NewMockStorage(ctrl) mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + mockStorage.EXPECT().GetRequestStore().Return(mockRequestStore).AnyTimes() return mockStorage }, - wantErr: true, - retryable: false, }, { name: "non-terminal batch state returns error", diff --git a/orchestrator/controller/merge/merge.go b/orchestrator/controller/merge/merge.go index d09c12db..45dbcd9f 100644 --- a/orchestrator/controller/merge/merge.go +++ b/orchestrator/controller/merge/merge.go @@ -102,6 +102,16 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r "partition_key", msg.PartitionKey, ) + // Cancelling intent: the cancel controller marked this batch as not landing + // and handed it off to speculate. Silently ack — do not push (the inherent + // push-already-committed race is acknowledged elsewhere) and do not fan out + // (speculate owns the terminal write to Cancelled and the downstream + // dependent / conclude publishes). + if batch.State == entity.BatchStateCancelling { + coremetrics.NamedCounter(c.metricsScope, "process", "skipped_cancelling", 1) + return nil + } + // Idempotency: if the batch is already in a terminal state, a previous // attempt has already merged (or failed) — just re-fan-out the events // in case downstream stages missed them. diff --git a/orchestrator/controller/merge/merge_test.go b/orchestrator/controller/merge/merge_test.go index b08480ef..6d7f5d4b 100644 --- a/orchestrator/controller/merge/merge_test.go +++ b/orchestrator/controller/merge/merge_test.go @@ -305,14 +305,61 @@ func TestController_Process_PushInfraFailureReturnsError(t *testing.T) { } func TestController_Process_TerminalBatchSkipsPushButFansOut(t *testing.T) { + for _, state := range []entity.BatchState{ + entity.BatchStateSucceeded, + entity.BatchStateFailed, + entity.BatchStateCancelled, + } { + t.Run(string(state), func(t *testing.T) { + ctrl := gomock.NewController(t) + + const batchID = "test-queue/batch/4" + + batch := entity.Batch{ + ID: batchID, + Queue: "test-queue", + State: state, + Version: 7, + } + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batchID).Return(batch, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + // Push must NOT be called for an already-terminal batch. + mockPusher := pushermock.NewMockPusher(ctrl) + + c := NewController( + zaptest.NewLogger(t).Sugar(), + tally.NoopScope, + store, + newRegistry(t, ctrl, nil), + mockPusher, + consumer.TopicKeyMerge, + "orchestrator-merge", + ) + + err := c.Process(context.Background(), newDelivery(t, ctrl, batchID, batch.Queue)) + require.NoError(t, err) + }) + } +} + +// BatchStateCancelling must be silently acked: no push, and crucially no +// fan-out (no publish to conclude or speculate). The cancel controller owns +// the terminal write and the downstream publishes; conclude would error on +// a non-terminal Cancelling batch. +func TestController_Process_CancellingShortCircuit(t *testing.T) { ctrl := gomock.NewController(t) - const batchID = "test-queue/batch/4" + const batchID = "test-queue/batch/4c" batch := entity.Batch{ ID: batchID, Queue: "test-queue", - State: entity.BatchStateSucceeded, + State: entity.BatchStateCancelling, Version: 7, } @@ -322,20 +369,29 @@ func TestController_Process_TerminalBatchSkipsPushButFansOut(t *testing.T) { store := storagemock.NewMockStorage(ctrl) store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() - // Push must NOT be called for an already-terminal batch. + // Pusher and publisher with no EXPECTs — neither must be called. mockPusher := pushermock.NewMockPusher(ctrl) + mockPub := queuemock.NewMockPublisher(ctrl) + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry([]consumer.TopicConfig{ + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + }) + require.NoError(t, err) c := NewController( zaptest.NewLogger(t).Sugar(), tally.NoopScope, store, - newRegistry(t, ctrl, nil), + registry, mockPusher, consumer.TopicKeyMerge, "orchestrator-merge", ) - err := c.Process(context.Background(), newDelivery(t, ctrl, batchID, batch.Queue)) + err = c.Process(context.Background(), newDelivery(t, ctrl, batchID, batch.Queue)) require.NoError(t, err) } diff --git a/orchestrator/controller/score/score.go b/orchestrator/controller/score/score.go index a6f06b85..88dd1e83 100644 --- a/orchestrator/controller/score/score.go +++ b/orchestrator/controller/score/score.go @@ -103,6 +103,33 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r "partition_key", msg.PartitionKey, ) + // Short-circuit when the batch is in BatchStateCancelling — the cancel + // controller has handed the batch off to speculate, which owns the terminal + // write to Cancelled and the downstream dependent / conclude publishes. We + // must not race it to conclude (conclude requires terminal). Silently ack. + if batch.State == entity.BatchStateCancelling { + c.metricsScope.Counter("skipped_cancelling").Inc(1) + c.logger.Infow("skipping score for cancelling batch", + "batch_id", batch.ID, + ) + return nil + } + + // Short-circuit if the batch is already terminal. Score never writes a + // terminal state, so it owns no recovery here: whichever controller wrote + // the terminal state (speculate.cancelBatch / failOnDependency, or merge) + // already published to conclude, and speculate's terminal self-heal + // republishes conclude on every redelivery of a terminal batch. Silently + // ack — same pattern as build / buildsignal on halted. + if batch.State.IsTerminal() { + c.metricsScope.Counter("skipped_terminal").Inc(1) + c.logger.Infow("skipping score for terminal batch", + "batch_id", batch.ID, + "state", string(batch.State), + ) + return nil + } + // Score each request's change and take the minimum (worst-case) as the batch score batchScore, err := c.scoreBatch(ctx, batch) if err != nil { diff --git a/orchestrator/controller/score/score_test.go b/orchestrator/controller/score/score_test.go index 3af9c4ad..5ac8e416 100644 --- a/orchestrator/controller/score/score_test.go +++ b/orchestrator/controller/score/score_test.go @@ -97,6 +97,7 @@ func newTestController(t *testing.T, ctrl *gomock.Controller, store *storagemock registry, err := consumer.NewTopicRegistry( []consumer.TopicConfig{ {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, {Key: consumer.TopicKeyLog, Name: "log", Queue: mockQ}, }, ) @@ -307,3 +308,106 @@ func TestController_InterfaceImplementation(t *testing.T) { var _ consumer.Controller = controller } + +// A batch already in a terminal state (e.g. cancelled while the score message +// was in flight) must be short-circuited: no scoring, no UpdateScoreAndState, +// and no fan-out — score owns no terminal write, so it owns no recovery; the +// controller that wrote the terminal state already published to conclude, and +// speculate's terminal self-heal republishes on redelivery. +func TestController_Process_TerminalShortCircuit(t *testing.T) { + for _, state := range []entity.BatchState{ + entity.BatchStateCancelled, + entity.BatchStateSucceeded, + entity.BatchStateFailed, + } { + t.Run(string(state), func(t *testing.T) { + ctrl := gomock.NewController(t) + + batch := testBatch() + batch.State = state + + // Batch store: only Get; no UpdateScoreAndState (gomock fails if called). + mockBatchStore := storagemock.NewMockBatchStore(ctrl) + mockBatchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + + mockStorage := storagemock.NewMockStorage(ctrl) + mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + + // Scorer with no EXPECTs — must not be called. + mockScorer := scorermock.NewMockScorer(ctrl) + + logger := zaptest.NewLogger(t).Sugar() + scope := tally.NoopScope + + // Publisher with no EXPECTs — must not be called (no fan-out on terminal). + mockPub := queuemock.NewMockPublisher(ctrl) + + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{ + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + {Key: consumer.TopicKeyLog, Name: "log", Queue: mockQ}, + }, + ) + require.NoError(t, err) + + controller := NewController(logger, scope, mockStorage, mockScorer, registry, consumer.TopicKeyScore, "orchestrator-score") + + msg := queue.NewMessage(batch.ID, batchIDPayload(t, batch.ID), batch.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) + }) + } +} + +// A batch in BatchStateCancelling must be silently acked: no scoring, no +// UpdateScoreAndState, and crucially NO conclude publish (speculate owns the +// terminal write to Cancelled and the downstream dependent / conclude +// publishes — conclude would also error on a non-terminal Cancelling batch). +func TestController_Process_CancellingShortCircuit(t *testing.T) { + ctrl := gomock.NewController(t) + + batch := testBatch() + batch.State = entity.BatchStateCancelling + + mockBatchStore := storagemock.NewMockBatchStore(ctrl) + mockBatchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + + mockStorage := storagemock.NewMockStorage(ctrl) + mockStorage.EXPECT().GetBatchStore().Return(mockBatchStore).AnyTimes() + + // Scorer with no EXPECTs — must not be called. + mockScorer := scorermock.NewMockScorer(ctrl) + + logger := zaptest.NewLogger(t).Sugar() + scope := tally.NoopScope + + // Publisher with no EXPECTs — must not be called (no fan-out for Cancelling). + mockPub := queuemock.NewMockPublisher(ctrl) + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{ + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + {Key: consumer.TopicKeyLog, Name: "log", Queue: mockQ}, + }, + ) + require.NoError(t, err) + + controller := NewController(logger, scope, mockStorage, mockScorer, registry, consumer.TopicKeyScore, "orchestrator-score") + + msg := queue.NewMessage(batch.ID, batchIDPayload(t, batch.ID), batch.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) +} diff --git a/orchestrator/controller/speculate/BUILD.bazel b/orchestrator/controller/speculate/BUILD.bazel index 6aa8e7e7..f7fe7c2e 100644 --- a/orchestrator/controller/speculate/BUILD.bazel +++ b/orchestrator/controller/speculate/BUILD.bazel @@ -26,6 +26,7 @@ go_test( "//entity", "//entity/queue", "//extension/queue/mock", + "//extension/storage", "//extension/storage/mock", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", diff --git a/orchestrator/controller/speculate/speculate.go b/orchestrator/controller/speculate/speculate.go index d5250be1..553b83c1 100644 --- a/orchestrator/controller/speculate/speculate.go +++ b/orchestrator/controller/speculate/speculate.go @@ -16,6 +16,7 @@ package speculate import ( "context" + "errors" "fmt" "github.com/uber-go/tally/v4" @@ -38,9 +39,15 @@ import ( // - Speculating → if all deps are Succeeded, publish to merge and // transition to Merging; otherwise no-op (or fail-fast if a dep is // in a non-succeeding terminal state). +// - Cancelling → cancel any in-flight Build entity, respeculate +// dependents, CAS to terminal Cancelled, publish to conclude. The +// cancel controller hands the batch off in this state and speculate +// drives it to terminal. // - Merging → no-op (owned by the merge controller). // - Terminal → re-fan-out to conclude for self-healing in case a -// prior publish was lost. +// prior publish was lost. For terminal Cancelled, also re-fan-out +// dependents so a crash between the terminal CAS and the dependent +// publish does not strand them. // // The controller is re-triggered on every relevant downstream event // (buildsignal, merge), so each call simply re-evaluates the current @@ -99,12 +106,25 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r return fmt.Errorf("failed to get batch %s: %w", bid.ID, err) } - // Terminal state: re-fan-out to conclude for self-healing. The batch is - // already done; if a previous publish was lost, downstream stages will - // otherwise never reconcile. Re-publishing is safe because conclude is - // idempotent on the batch ID. + // Cancelling intent: the cancel controller has handed this batch off to + // speculate to drive to terminal. Cancel in-flight builds, fan out to + // dependents, CAS to terminal Cancelled, and publish to conclude. + if batch.State == entity.BatchStateCancelling { + return c.cancelBatch(ctx, batch) + } + + // Terminal state: re-fan-out for self-healing in case a previous publish + // was lost. Always re-publish to conclude (idempotent on the batch ID). + // For Cancelled specifically also re-publish to dependents — a crash + // between the terminal CAS and the dependent publish would otherwise + // leave them stuck waiting on a Cancelled dep. if batch.State.IsTerminal() { metrics.NamedCounter(c.metricsScope, opName, "self_heal_terminal", 1) + if batch.State == entity.BatchStateCancelled { + if err := c.respeculateDependents(ctx, batch); err != nil { + return err + } + } return c.fanout(ctx, batch.ID, batch.Queue) } @@ -151,10 +171,11 @@ func (c *Controller) startSpeculation(ctx context.Context, batch entity.Batch) e } // tryFinalize publishes to merge and transitions to Merging iff every -// dependency batch has reached Succeeded. If any dep is Failed/Cancelled, -// the batch cannot land on top of it; we mark it Failed and hand off to -// conclude so the request state and log are reconciled. Otherwise (some -// deps still in flight) it no-ops and waits for the next event. +// dependency batch has reached Succeeded. Cancelled deps are treated as +// out-of-the-way: the cancelled batch will never land, so it can no longer +// conflict — drop it from the chain and proceed. Failed deps still cascade +// via failOnDependency. If some deps are still in flight, the call is a +// no-op and waits for the next event. // // TODO: when a dependency fails we currently fail this batch outright. // We will need to respeculate the failed paths — drop the failed dep @@ -171,7 +192,15 @@ func (c *Controller) tryFinalize(ctx context.Context, batch entity.Batch) error switch d.State { case entity.BatchStateSucceeded: // ok - case entity.BatchStateFailed, entity.BatchStateCancelled: + case entity.BatchStateCancelled: + // Out-of-the-way: the cancelled batch will never land, so it can + // no longer conflict. Drop it from the chain and continue. + c.metricsScope.Counter("dependency_cancelled_skipped").Inc(1) + c.logger.Infow("dependency cancelled; dropping from speculation chain", + "batch_id", batch.ID, + "dependency_id", d.ID, + ) + case entity.BatchStateFailed: return c.failOnDependency(ctx, batch, d) default: pending = append(pending, d.ID) @@ -228,6 +257,141 @@ func (c *Controller) failOnDependency(ctx context.Context, batch entity.Batch, d return nil } +// cancelBatch drives a batch from BatchStateCancelling to BatchStateCancelled. +// The cancel controller records the user's intent (Cancelling) and hands the +// batch off; speculate owns the rest because all the work that must precede +// the terminal write — flipping in-flight builds, respeculating dependents — +// already lives in the speculate domain. The terminal transition is the +// single writer of every non-Cancelling batch state across the system. +// +// Order matters for correctness: +// +// 1. Cancel the in-flight Build entity (build.ID == batch.ID; one Get + one +// UpdateStatus covers all builds for this batch). A future external CI +// integration hooks in here. Idempotent: tolerate ErrNotFound (no build +// was scheduled), skip if already terminal. +// +// 2. CAS the batch to terminal Cancelled. This must happen BEFORE the +// dependent fan-out: tryFinalize only drops a Cancelled dep from the +// chain, so dependents woken with the dep still in Cancelling would +// wait pending and never get pinged again. +// +// 3. Re-publish each downstream dependent to speculate so they can drop +// this cancelled batch from their chain and advance (or finalize, if +// this was their last outstanding dep). +// +// 4. Publish to conclude so contained requests reach RequestStateCancelled. +// +// A crash between steps 2 and 3/4 is recovered on redelivery via the +// terminal self-heal branch, which re-runs the dependent fan-out and the +// conclude publish for already-Cancelled batches. +// +// storage.ErrVersionMismatch on the terminal CAS is returned as-is for the +// base controller to classify as retryable; the redelivery will land in the +// self-heal branch and complete the fan-out. +func (c *Controller) cancelBatch(ctx context.Context, batch entity.Batch) error { + metrics.NamedCounter(c.metricsScope, opName, "cancel_batch", 1) + c.logger.Infow("cancelling batch", + "batch_id", batch.ID, + "queue", batch.Queue, + ) + + // TODO(respeculate-collateral): re-enqueue Land for every request in batch.Contains + // except the user-cancelled request. Today the whole batch dies (per spec) and the + // collateral requests need a fresh request ID and a re-publish to TopicKeyStart so + // they can be re-batched without the cancelled change. + + if err := c.cancelBuild(ctx, batch); err != nil { + return err + } + + newVersion := batch.Version + 1 + if err := c.store.GetBatchStore().UpdateState(ctx, batch.ID, batch.Version, newVersion, entity.BatchStateCancelled); err != nil { + metrics.NamedCounter(c.metricsScope, opName, "storage_errors", 1) + return fmt.Errorf("failed to update batch %s state to cancelled: %w", batch.ID, err) + } + batch.Version = newVersion + batch.State = entity.BatchStateCancelled + + if err := c.respeculateDependents(ctx, batch); err != nil { + return err + } + + if err := c.publish(ctx, consumer.TopicKeyConclude, batch.ID, batch.Queue); err != nil { + metrics.NamedCounter(c.metricsScope, opName, "publish_errors", 1) + return fmt.Errorf("failed to publish to conclude: %w", err) + } + + return nil +} + +// cancelBuild flips any in-flight Build entity for the batch to +// BuildStatusCancelled. Builds use build.ID == batch.ID, so a single Get +// covers every build scheduled for the batch. Tolerates ErrNotFound (no +// build was ever scheduled — the batch was cancelled before speculation +// started building) and skips already-terminal builds. +// +// This is the hook point for a future external CI integration: today the +// system has no external runner, so the local state flip is the complete +// cancellation. Once a runner exists, it must be invoked here before the +// local UpdateStatus. +func (c *Controller) cancelBuild(ctx context.Context, batch entity.Batch) error { + build, err := c.store.GetBuildStore().Get(ctx, batch.ID) + if err != nil { + if errors.Is(err, storage.ErrNotFound) { + metrics.NamedCounter(c.metricsScope, opName, "cancel_build_not_found", 1) + return nil + } + metrics.NamedCounter(c.metricsScope, opName, "storage_errors", 1) + return fmt.Errorf("failed to get build for batch %s: %w", batch.ID, err) + } + + if build.Status.IsTerminal() { + metrics.NamedCounter(c.metricsScope, opName, "cancel_build_already_terminal", 1) + return nil + } + + if err := c.store.GetBuildStore().UpdateStatus(ctx, batch.ID, entity.BuildStatusCancelled); err != nil { + metrics.NamedCounter(c.metricsScope, opName, "storage_errors", 1) + return fmt.Errorf("failed to cancel build for batch %s: %w", batch.ID, err) + } + metrics.NamedCounter(c.metricsScope, opName, "cancel_build_done", 1) + return nil +} + +// respeculateDependents publishes a speculate event for every batch that +// depends on the given batch. The batch controller creates a BatchDependent +// row (with Dependents possibly empty) for every batch it persists, so a +// missing row at this point is a storage invariant violation, not a normal +// "no dependents" case — surface ErrNotFound as a regular storage error so +// the message nacks and either an operator or the batch controller's own +// crash-recovery can resolve the inconsistency. +// +// Called both from the cancelBatch terminal flow and from the terminal +// self-heal branch on redelivery of an already-Cancelled batch. +func (c *Controller) respeculateDependents(ctx context.Context, batch entity.Batch) error { + bd, err := c.store.GetBatchDependentStore().Get(ctx, batch.ID) + if err != nil { + metrics.NamedCounter(c.metricsScope, opName, "storage_errors", 1) + return fmt.Errorf("failed to get batch dependents for batch %s: %w", batch.ID, err) + } + + for _, depID := range bd.Dependents { + // Alternative: process each dependent inline (load batch, run the + // equivalent of tryFinalize) instead of publishing back to ourselves. + // Rejected for now: per-message retry isolation, fresh per-dependent + // reads, consumer-pool parallelism / backpressure, and the existing + // state-machine dispatch in Process all argue for the publish. Revisit + // if the extra message hop ever shows up as latency or cost. + if err := c.publish(ctx, consumer.TopicKeySpeculate, depID, batch.Queue); err != nil { + metrics.NamedCounter(c.metricsScope, opName, "publish_errors", 1) + return fmt.Errorf("failed to publish dependent batch %s to speculate: %w", depID, err) + } + metrics.NamedCounter(c.metricsScope, opName, "dependent_respeculated", 1) + } + return nil +} + // fetchDependencies loads each batch in batch.Dependencies. Any storage error // is surfaced as a retryable infra failure; missing dependencies should not // happen in practice, but if one does it is treated the same as a transient diff --git a/orchestrator/controller/speculate/speculate_test.go b/orchestrator/controller/speculate/speculate_test.go index 997149bb..e38b0227 100644 --- a/orchestrator/controller/speculate/speculate_test.go +++ b/orchestrator/controller/speculate/speculate_test.go @@ -27,6 +27,7 @@ import ( "github.com/uber/submitqueue/entity" "github.com/uber/submitqueue/entity/queue" queuemock "github.com/uber/submitqueue/extension/queue/mock" + "github.com/uber/submitqueue/extension/storage" storagemock "github.com/uber/submitqueue/extension/storage/mock" "go.uber.org/mock/gomock" "go.uber.org/zap/zaptest" @@ -182,35 +183,46 @@ func TestController_Process_WaitingOnDep(t *testing.T) { require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) } -// tryFinalize: a dep in a non-succeeding terminal state must fail the batch -// (Speculating → Failed) and publish to conclude. Otherwise the batch livelocks. +// tryFinalize: a failed dep must fail the batch (Speculating → Failed) and +// publish to conclude. Otherwise the batch livelocks. func TestController_Process_FailedDepFailsBatch(t *testing.T) { - tests := []struct { - name string - depState entity.BatchState - }{ - {name: "dep_failed", depState: entity.BatchStateFailed}, - {name: "dep_cancelled", depState: entity.BatchStateCancelled}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - dep := entity.Batch{ID: "test-queue/batch/0", Queue: "test-queue", State: tt.depState, Version: 1} - batch := testBatch(entity.BatchStateSpeculating, dep.ID) - batch.Contains = []string{"test-queue/req/1", "test-queue/req/2"} + ctrl := gomock.NewController(t) + dep := entity.Batch{ID: "test-queue/batch/0", Queue: "test-queue", State: entity.BatchStateFailed, Version: 1} + batch := testBatch(entity.BatchStateSpeculating, dep.ID) + batch.Contains = []string{"test-queue/req/1", "test-queue/req/2"} - batchStore := storagemock.NewMockBatchStore(ctrl) - batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) - batchStore.EXPECT().Get(gomock.Any(), dep.ID).Return(dep, nil) - batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateFailed).Return(nil) + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().Get(gomock.Any(), dep.ID).Return(dep, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateFailed).Return(nil) - store := storagemock.NewMockStorage(ctrl) - store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() - controller := newTestController(t, ctrl, store, nil) - require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) - }) - } + controller := newTestController(t, ctrl, store, nil) + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) +} + +// tryFinalize: a cancelled dep is treated as out-of-the-way — it will never +// land and can no longer conflict. The dep is dropped from the chain and the +// batch advances to Merging as if the cancelled dep had succeeded. +func TestController_Process_CancelledDepSkipped(t *testing.T) { + ctrl := gomock.NewController(t) + depCancelled := entity.Batch{ID: "test-queue/batch/0a", Queue: "test-queue", State: entity.BatchStateCancelled, Version: 2} + depSucceeded := entity.Batch{ID: "test-queue/batch/0b", Queue: "test-queue", State: entity.BatchStateSucceeded, Version: 5} + batch := testBatch(entity.BatchStateSpeculating, depCancelled.ID, depSucceeded.ID) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().Get(gomock.Any(), depCancelled.ID).Return(depCancelled, nil) + batchStore.EXPECT().Get(gomock.Any(), depSucceeded.ID).Return(depSucceeded, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateMerging).Return(nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + + controller := newTestController(t, ctrl, store, nil) + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) } // Merging is owned by the merge controller — speculate is a no-op for it. @@ -230,12 +242,13 @@ func TestController_Process_MergingNoOp(t *testing.T) { } // Terminal states re-fan-out to conclude for self-healing in case a previous -// publish was lost. State must not change (no UpdateState). +// publish was lost. State must not change (no UpdateState). The Cancelled +// terminal also re-fans-out dependents and is covered separately in +// TestController_Process_CancelledTerminalSelfHealsDependents. func TestController_Process_TerminalSelfHeals(t *testing.T) { for _, state := range []entity.BatchState{ entity.BatchStateSucceeded, entity.BatchStateFailed, - entity.BatchStateCancelled, } { t.Run(string(state), func(t *testing.T) { ctrl := gomock.NewController(t) @@ -270,6 +283,279 @@ func TestController_Process_TerminalSelfHeals(t *testing.T) { } } +// Cancelled is terminal: redelivery must re-fan-out dependents (so a crash +// between the terminal CAS and the dependent publish does not strand them) +// AND re-publish to conclude. State must not change (no UpdateState; no +// build cancel). The BuildStore must not be touched on this self-heal path. +func TestController_Process_CancelledTerminalSelfHealsDependents(t *testing.T) { + ctrl := gomock.NewController(t) + batch := testBatch(entity.BatchStateCancelled) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + // No UpdateState expected. + + depStore := storagemock.NewMockBatchDependentStore(ctrl) + depStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.BatchDependent{ + BatchID: batch.ID, + Dependents: []string{"test-queue/batch/2", "test-queue/batch/3"}, + Version: 1, + }, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store.EXPECT().GetBatchDependentStore().Return(depStore).AnyTimes() + // BuildStore must NOT be touched on the terminal self-heal path. + + type pubRec struct { + topic string + msgID string + } + var records []pubRec + mockPub := queuemock.NewMockPublisher(ctrl) + mockPub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, topic string, msg queue.Message) error { + records = append(records, pubRec{topic: topic, msgID: msg.ID}) + return nil + }).AnyTimes() + + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{ + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + }, + ) + require.NoError(t, err) + + logger := zaptest.NewLogger(t).Sugar() + controller := NewController(logger, tally.NoopScope, store, registry, consumer.TopicKeySpeculate, "orchestrator-speculate") + + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) + + assert.Equal(t, []pubRec{ + {topic: "speculate", msgID: "test-queue/batch/2"}, + {topic: "speculate", msgID: "test-queue/batch/3"}, + {topic: "conclude", msgID: batch.ID}, + }, records) +} + +// Cancelling drives the terminal-cancellation flow: cancel any in-flight +// build, CAS the batch to Cancelled, fan out dependents, publish to +// conclude. Validates the full happy-path order with a running build and +// a couple of dependents. Order matters: dependents must publish AFTER the +// terminal CAS so the woken dependents observe the dep as Cancelled (and +// drop it from their chain) rather than as still-Cancelling (which would +// leave them waiting on a state nobody is going to nudge). +func TestController_Process_CancellingTerminalFlow(t *testing.T) { + ctrl := gomock.NewController(t) + batch := testBatch(entity.BatchStateCancelling) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateCancelled).Return(nil) + + buildStore := storagemock.NewMockBuildStore(ctrl) + buildStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.Build{ + ID: batch.ID, BatchID: batch.ID, Status: entity.BuildStatusRunning, + }, nil) + buildStore.EXPECT().UpdateStatus(gomock.Any(), batch.ID, entity.BuildStatusCancelled).Return(nil) + + depStore := storagemock.NewMockBatchDependentStore(ctrl) + depStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.BatchDependent{ + BatchID: batch.ID, + Dependents: []string{"test-queue/batch/2", "test-queue/batch/3"}, + Version: 1, + }, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store.EXPECT().GetBuildStore().Return(buildStore).AnyTimes() + store.EXPECT().GetBatchDependentStore().Return(depStore).AnyTimes() + + type pubRec struct { + topic string + msgID string + } + var records []pubRec + mockPub := queuemock.NewMockPublisher(ctrl) + mockPub.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn( + func(_ context.Context, topic string, msg queue.Message) error { + records = append(records, pubRec{topic: topic, msgID: msg.ID}) + return nil + }).AnyTimes() + + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{ + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + }, + ) + require.NoError(t, err) + + logger := zaptest.NewLogger(t).Sugar() + controller := NewController(logger, tally.NoopScope, store, registry, consumer.TopicKeySpeculate, "orchestrator-speculate") + + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) + + assert.Equal(t, []pubRec{ + {topic: "speculate", msgID: "test-queue/batch/2"}, + {topic: "speculate", msgID: "test-queue/batch/3"}, + {topic: "conclude", msgID: batch.ID}, + }, records) +} + +// If the build for the batch has already reached a terminal status (e.g. CI +// finished naturally between the cancel intent and the speculate pickup), the +// cancellation must not re-flip it — UpdateStatus must never fire. The rest +// of the flow (terminal batch CAS, dependent fan-out, conclude) still runs. +func TestController_Process_CancellingBuildAlreadyTerminal(t *testing.T) { + ctrl := gomock.NewController(t) + batch := testBatch(entity.BatchStateCancelling) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateCancelled).Return(nil) + + buildStore := storagemock.NewMockBuildStore(ctrl) + buildStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.Build{ + ID: batch.ID, BatchID: batch.ID, Status: entity.BuildStatusSucceeded, + }, nil) + // No UpdateStatus expected — the build is already terminal. + + depStore := storagemock.NewMockBatchDependentStore(ctrl) + depStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.BatchDependent{ + BatchID: batch.ID, Version: 1, + }, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store.EXPECT().GetBuildStore().Return(buildStore).AnyTimes() + store.EXPECT().GetBatchDependentStore().Return(depStore).AnyTimes() + + controller := newTestController(t, ctrl, store, nil) + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) +} + +// If no Build entity exists for the batch (e.g. cancel arrived before +// speculation started building), the BuildStore.Get NotFound must be +// tolerated and the rest of the cancellation flow must continue. +func TestController_Process_CancellingNoBuildYet(t *testing.T) { + ctrl := gomock.NewController(t) + batch := testBatch(entity.BatchStateCancelling) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateCancelled).Return(nil) + + buildStore := storagemock.NewMockBuildStore(ctrl) + buildStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.Build{}, storage.ErrNotFound) + // No UpdateStatus expected. + + depStore := storagemock.NewMockBatchDependentStore(ctrl) + depStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.BatchDependent{ + BatchID: batch.ID, Version: 1, + }, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store.EXPECT().GetBuildStore().Return(buildStore).AnyTimes() + store.EXPECT().GetBatchDependentStore().Return(depStore).AnyTimes() + + controller := newTestController(t, ctrl, store, nil) + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) +} + +// A batch whose BatchDependent row exists with an empty Dependents list must +// still drive itself to terminal and publish to conclude. This is the normal +// "no dependents" path: the batch controller creates the row with an empty +// list at batch creation time and it stays empty if no later batch conflicts. +func TestController_Process_CancellingNoDependents(t *testing.T) { + ctrl := gomock.NewController(t) + batch := testBatch(entity.BatchStateCancelling) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateCancelled).Return(nil) + + buildStore := storagemock.NewMockBuildStore(ctrl) + buildStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.Build{}, storage.ErrNotFound) + + depStore := storagemock.NewMockBatchDependentStore(ctrl) + depStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.BatchDependent{BatchID: batch.ID, Dependents: []string{}, Version: 1}, nil) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store.EXPECT().GetBuildStore().Return(buildStore).AnyTimes() + store.EXPECT().GetBatchDependentStore().Return(depStore).AnyTimes() + + mockPub := queuemock.NewMockPublisher(ctrl) + mockPub.EXPECT().Publish(gomock.Any(), "conclude", gomock.Any()).Return(nil).Times(1) + + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{ + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + }, + ) + require.NoError(t, err) + + logger := zaptest.NewLogger(t).Sugar() + controller := NewController(logger, tally.NoopScope, store, registry, consumer.TopicKeySpeculate, "orchestrator-speculate") + + require.NoError(t, runProcess(t, ctrl, controller, batch.ID)) +} + +// storage.ErrVersionMismatch on the terminal CAS must surface as an error +// with the underlying sentinel in the chain so the base controller can +// classify it as retryable. The dependent fan-out and conclude publish must +// NOT run if the terminal CAS failed — on redelivery the self-heal branch +// will pick up the (now-terminal) state and complete the fan-out. +func TestController_Process_CancellingTerminalCASVersionMismatch(t *testing.T) { + ctrl := gomock.NewController(t) + batch := testBatch(entity.BatchStateCancelling) + + batchStore := storagemock.NewMockBatchStore(ctrl) + batchStore.EXPECT().Get(gomock.Any(), batch.ID).Return(batch, nil) + batchStore.EXPECT().UpdateState(gomock.Any(), batch.ID, int32(1), int32(2), entity.BatchStateCancelled). + Return(storage.ErrVersionMismatch) + + buildStore := storagemock.NewMockBuildStore(ctrl) + buildStore.EXPECT().Get(gomock.Any(), batch.ID).Return(entity.Build{}, storage.ErrNotFound) + + store := storagemock.NewMockStorage(ctrl) + store.EXPECT().GetBatchStore().Return(batchStore).AnyTimes() + store.EXPECT().GetBuildStore().Return(buildStore).AnyTimes() + // BatchDependentStore must NOT be touched — terminal CAS failed before fan-out. + + // No publish expected (terminal CAS failed before fan-out). + mockPub := queuemock.NewMockPublisher(ctrl) + mockQ := queuemock.NewMockQueue(ctrl) + mockQ.EXPECT().Publisher().Return(mockPub).AnyTimes() + + registry, err := consumer.NewTopicRegistry( + []consumer.TopicConfig{ + {Key: consumer.TopicKeyConclude, Name: "conclude", Queue: mockQ}, + {Key: consumer.TopicKeySpeculate, Name: "speculate", Queue: mockQ}, + }, + ) + require.NoError(t, err) + + logger := zaptest.NewLogger(t).Sugar() + controller := NewController(logger, tally.NoopScope, store, registry, consumer.TopicKeySpeculate, "orchestrator-speculate") + + err = runProcess(t, ctrl, controller, batch.ID) + require.Error(t, err) + assert.ErrorIs(t, err, storage.ErrVersionMismatch) +} + // An unrecognized state must surface as an error so the message is nacked // instead of silently acked — silently acking would drop the event. func TestController_Process_UnrecognizedState(t *testing.T) { diff --git a/orchestrator/controller/validate/validate.go b/orchestrator/controller/validate/validate.go index 8998e6fb..c4e976c4 100644 --- a/orchestrator/controller/validate/validate.go +++ b/orchestrator/controller/validate/validate.go @@ -108,6 +108,19 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r "partition_key", msg.PartitionKey, ) + // Short-circuit if the request has been halted — either it already reached a + // terminal state, or the cancel controller has recorded a cancellation intent + // (RequestStateCancelling). Without this guard we would still publish to batch + // and spawn a batch for a request that should never proceed. + if entity.IsRequestStateHalted(request.State) { + coremetrics.NamedCounter(c.metricsScope, "process", "skipped_halted", 1) + c.logger.Infow("skipping validate for halted request", + "request_id", request.ID, + "state", string(request.State), + ) + return nil + } + // Duplicate detection: look for any other in-flight request that has already // claimed an overlapping URI in this queue. Per-queue partition leasing // (see core/consumer + extension/queue) guarantees serial processing within diff --git a/orchestrator/controller/validate/validate_test.go b/orchestrator/controller/validate/validate_test.go index e8672b2d..84f1cfe2 100644 --- a/orchestrator/controller/validate/validate_test.go +++ b/orchestrator/controller/validate/validate_test.go @@ -473,3 +473,43 @@ func TestController_Process_ChangeStoreQueryFailure(t *testing.T) { require.Error(t, err) assert.False(t, errs.IsUserError(err), "infra error should not be classified as user error") } + +// A request already in a terminal state (e.g. cancelled while the validate +// message was in flight) must be short-circuited before any extension is +// touched and before any publish happens. We verify this by registering a +// merge checker and change store with NO expectations — gomock fails the test +// if either is called — and a publisher that returns an error if invoked. +func TestController_Process_TerminalShortCircuit(t *testing.T) { + for _, state := range []entity.RequestState{ + entity.RequestStateCancelled, + entity.RequestStateLanded, + entity.RequestStateError, + } { + t.Run(string(state), func(t *testing.T) { + ctrl := gomock.NewController(t) + + request := entity.Request{ + ID: "test-queue/123", + Queue: "test-queue", + State: state, + Version: 5, + } + store, _ := newMockStorage(ctrl, request) + + // No EXPECTs on merge checker or change store: gomock will fail if either is called. + mc := mergecheckermock.NewMockMergeChecker(ctrl) + cs := changemock.NewMockChangeStore(ctrl) + + // Sentinel publish error: if Process publishes, it returns a non-nil err, + // which the require.NoError below will catch. + controller := newTestController(t, ctrl, store, cs, mc, fmt.Errorf("should not publish")) + + msg := queue.NewMessage(request.ID, requestIDPayload(t, request.ID), request.Queue, nil) + delivery := queuemock.NewMockDelivery(ctrl) + delivery.EXPECT().Message().Return(msg).AnyTimes() + delivery.EXPECT().Attempt().Return(1).AnyTimes() + + require.NoError(t, controller.Process(context.Background(), delivery)) + }) + } +} diff --git a/orchestrator/protopb/orchestrator.pb.go b/orchestrator/protopb/orchestrator.pb.go index de3d688f..aec3ee03 100644 --- a/orchestrator/protopb/orchestrator.pb.go +++ b/orchestrator/protopb/orchestrator.pb.go @@ -1,7 +1,21 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.36.10 -// protoc v5.29.3 +// protoc v3.21.12 // source: orchestrator.proto package protopb diff --git a/orchestrator/protopb/orchestrator_grpc.pb.go b/orchestrator/protopb/orchestrator_grpc.pb.go index 12c6af09..8ed08b27 100644 --- a/orchestrator/protopb/orchestrator_grpc.pb.go +++ b/orchestrator/protopb/orchestrator_grpc.pb.go @@ -1,7 +1,21 @@ +// Copyright (c) 2025 Uber Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.5.1 -// - protoc v5.29.3 +// - protoc v3.21.12 // source: orchestrator.proto package protopb diff --git a/test/e2e/BUILD.bazel b/test/e2e/BUILD.bazel index 97fc8d32..7093ccd1 100644 --- a/test/e2e/BUILD.bazel +++ b/test/e2e/BUILD.bazel @@ -24,5 +24,7 @@ go_test( "@com_github_stretchr_testify//require", "@com_github_stretchr_testify//suite", "@org_golang_google_grpc//:grpc", + "@org_golang_google_grpc//codes", + "@org_golang_google_grpc//status", ], ) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 2d2ddcd0..bb54d54b 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -35,6 +35,8 @@ import ( orchestratorpb "github.com/uber/submitqueue/orchestrator/protopb" "github.com/uber/submitqueue/test/testutil" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) type E2EIntegrationSuite struct { @@ -166,3 +168,43 @@ func (s *E2EIntegrationSuite) TestLandRequest_SinglePR() { require.NotEmpty(s.T(), resp.Sqid, "SQID should not be empty") s.log.Logf("Land request (single PR) succeeded: sqid=%s", resp.Sqid) } + +// TestCancelRequest_InvalidSqid verifies the gateway rejects an empty sqid +// synchronously before publishing anything to the cancel queue. +func (s *E2EIntegrationSuite) TestCancelRequest_InvalidSqid() { + _, err := s.gatewayClient.Cancel(s.ctx, &gatewaypb.CancelRequest{Sqid: ""}) + require.Error(s.T(), err, "Cancel with empty sqid should fail") + + st, ok := status.FromError(err) + require.True(s.T(), ok, "expected a gRPC status error") + assert.Equal(s.T(), codes.InvalidArgument, st.Code(), + "empty sqid should map to InvalidArgument; got %s", st.Code()) +} + +// TestCancelRequest_BeforeBatch is intentionally a thin smoke test of the +// Land + Cancel RPC envelope: Land to mint a sqid, then Cancel that sqid, and +// assert both calls return OK. It does not poll for terminal state, log +// entries, or any orchestrator-side progression. +// +// TODO(e2e): harden this test once the e2e fixture story is in better shape. +// Add async assertions that the request reaches RequestStateCancelled, that +// request_log contains both `cancelling` and `cancelled` entries, and that a +// second Cancel is idempotent. +func (s *E2EIntegrationSuite) TestCancelRequest_BeforeBatch() { + t := s.T() + + landReq := &gatewaypb.LandRequest{ + Queue: "e2e-cancel-queue", + Change: &gatewaypb.Change{Uris: []string{"github://uber/e2e-nonexistent/pull/9999/deadbeef"}}, + Strategy: gatewaypb.Strategy_REBASE, + } + landResp, err := s.gatewayClient.Land(s.ctx, landReq) + require.NoError(t, err, "Land failed") + require.NotEmpty(t, landResp.Sqid) + + _, err = s.gatewayClient.Cancel(s.ctx, &gatewaypb.CancelRequest{ + Sqid: landResp.Sqid, + Reason: "e2e cancel smoke test", + }) + require.NoError(t, err, "Cancel failed") +}