From f2471e4db90cc38b1239ec9e352652f257b871c2 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 16:50:44 +0000 Subject: [PATCH 01/25] Integrate deployment metadata service for server-side locking and state Add client integration with the deployment metadata service API for server-side deployment locking and resource state tracking. Gated behind DATABRICKS_BUNDLE_DEPLOYMENT_SERVICE=true environment variable. Co-authored-by: Isaac --- bundle/deploy/metadata/service/client.go | 183 +++++++++++++ bundle/deploy/metadata/service/heartbeat.go | 37 +++ bundle/deploy/metadata/service/types.go | 189 ++++++++++++++ bundle/deploy/state_update.go | 6 + bundle/env/deployment_metadata.go | 15 ++ bundle/phases/deploy.go | 6 + bundle/phases/deploy_metadata.go | 269 ++++++++++++++++++++ bundle/phases/destroy.go | 6 + bundle/phases/destroy_metadata.go | 169 ++++++++++++ 9 files changed, 880 insertions(+) create mode 100644 bundle/deploy/metadata/service/client.go create mode 100644 bundle/deploy/metadata/service/heartbeat.go create mode 100644 bundle/deploy/metadata/service/types.go create mode 100644 bundle/env/deployment_metadata.go create mode 100644 bundle/phases/deploy_metadata.go create mode 100644 bundle/phases/destroy_metadata.go diff --git a/bundle/deploy/metadata/service/client.go b/bundle/deploy/metadata/service/client.go new file mode 100644 index 0000000000..ffe2fb36fc --- /dev/null +++ b/bundle/deploy/metadata/service/client.go @@ -0,0 +1,183 @@ +package service + +import ( + "context" + "fmt" + "net/http" + + "errors" + + "github.com/databricks/databricks-sdk-go" + "github.com/databricks/databricks-sdk-go/apierr" + "github.com/databricks/databricks-sdk-go/client" +) + +const basePath = "/api/2.0/bundle" + +// Client wraps the Databricks API client for the deployment metadata service. +type Client struct { + api *client.DatabricksClient +} + +// NewClient creates a new deployment metadata service client from a workspace client. +func NewClient(w *databricks.WorkspaceClient) (*Client, error) { + apiClient, err := client.New(w.Config) + if err != nil { + return nil, fmt.Errorf("failed to create deployment metadata API client: %w", err) + } + return &Client{api: apiClient}, nil +} + +// CreateDeployment creates a new deployment. +func (c *Client) CreateDeployment(ctx context.Context, deploymentID string, deployment *Deployment) (*Deployment, error) { + resp := &Deployment{} + path := fmt.Sprintf("%s/deployments", basePath) + err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CreateDeploymentRequest{ + DeploymentID: deploymentID, + Deployment: deployment, + }, resp) + if err != nil { + return nil, mapError("create deployment", err) + } + return resp, nil +} + +// GetDeployment retrieves a deployment by ID. +func (c *Client) GetDeployment(ctx context.Context, deploymentID string) (*Deployment, error) { + resp := &Deployment{} + path := fmt.Sprintf("%s/deployments/%s", basePath, deploymentID) + err := c.api.Do(ctx, http.MethodGet, path, nil, nil, nil, resp) + if err != nil { + return nil, mapError("get deployment", err) + } + return resp, nil +} + +// DeleteDeployment soft-deletes a deployment. +func (c *Client) DeleteDeployment(ctx context.Context, deploymentID string) error { + path := fmt.Sprintf("%s/deployments/%s", basePath, deploymentID) + err := c.api.Do(ctx, http.MethodDelete, path, nil, nil, nil, nil) + if err != nil { + return mapError("delete deployment", err) + } + return nil +} + +// CreateVersion creates a new version (acquires the deployment lock). +func (c *Client) CreateVersion(ctx context.Context, deploymentID string, versionID string, version *Version) (*Version, error) { + resp := &Version{} + path := fmt.Sprintf("%s/deployments/%s/versions", basePath, deploymentID) + err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CreateVersionRequest{ + Parent: fmt.Sprintf("deployments/%s", deploymentID), + Version: version, + VersionID: versionID, + }, resp) + if err != nil { + return nil, mapError("create version", err) + } + return resp, nil +} + +// GetVersion retrieves a version. +func (c *Client) GetVersion(ctx context.Context, deploymentID, versionID string) (*Version, error) { + resp := &Version{} + path := fmt.Sprintf("%s/deployments/%s/versions/%s", basePath, deploymentID, versionID) + err := c.api.Do(ctx, http.MethodGet, path, nil, nil, nil, resp) + if err != nil { + return nil, mapError("get version", err) + } + return resp, nil +} + +// Heartbeat renews the lock lease for an in-progress version. +func (c *Client) Heartbeat(ctx context.Context, deploymentID, versionID string) (*HeartbeatResponse, error) { + resp := &HeartbeatResponse{} + path := fmt.Sprintf("%s/deployments/%s/versions/%s/heartbeat", basePath, deploymentID, versionID) + err := c.api.Do(ctx, http.MethodPost, path, nil, nil, struct{}{}, resp) + if err != nil { + return nil, mapError("heartbeat", err) + } + return resp, nil +} + +// CompleteVersion marks a version as completed (releases the deployment lock). +func (c *Client) CompleteVersion(ctx context.Context, deploymentID, versionID string, reason VersionComplete, force bool) (*Version, error) { + resp := &Version{} + path := fmt.Sprintf("%s/deployments/%s/versions/%s/complete", basePath, deploymentID, versionID) + err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CompleteVersionRequest{ + Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + CompletionReason: reason, + Force: force, + }, resp) + if err != nil { + return nil, mapError("complete version", err) + } + return resp, nil +} + +// CreateOperation records a resource operation for a version. +func (c *Client) CreateOperation(ctx context.Context, deploymentID, versionID, resourceKey string, operation *Operation) (*Operation, error) { + resp := &Operation{} + path := fmt.Sprintf("%s/deployments/%s/versions/%s/operations", basePath, deploymentID, versionID) + err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CreateOperationRequest{ + Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + ResourceKey: resourceKey, + Operation: operation, + }, resp) + if err != nil { + return nil, mapError("create operation", err) + } + return resp, nil +} + +// ListResources lists all resources for a deployment. +func (c *Client) ListResources(ctx context.Context, deploymentID string) ([]Resource, error) { + var allResources []Resource + pageToken := "" + + for { + resp := &ListResourcesResponse{} + path := fmt.Sprintf("%s/deployments/%s/resources", basePath, deploymentID) + + q := map[string]any{ + "parent": fmt.Sprintf("deployments/%s", deploymentID), + "page_size": 1000, + } + if pageToken != "" { + q["page_token"] = pageToken + } + + err := c.api.Do(ctx, http.MethodGet, path, nil, q, nil, resp) + if err != nil { + return nil, mapError("list resources", err) + } + + allResources = append(allResources, resp.Resources...) + if resp.NextPageToken == "" { + break + } + pageToken = resp.NextPageToken + } + + return allResources, nil +} + +// mapError translates API errors into user-friendly messages. +func mapError(operation string, err error) error { + var apiErr *apierr.APIError + if !errors.As(err, &apiErr) { + return fmt.Errorf("%s: %w", operation, err) + } + + switch apiErr.StatusCode { + case http.StatusConflict: + return fmt.Errorf("%s: deployment is locked by another active deployment. "+ + "Use --force-lock to override", operation) + case http.StatusNotFound: + return fmt.Errorf("%s: resource not found: %w", operation, err) + case http.StatusBadRequest: + return fmt.Errorf("%s: bad request: %s", operation, apiErr.Message) + default: + return fmt.Errorf("%s: %w", operation, err) + } +} diff --git a/bundle/deploy/metadata/service/heartbeat.go b/bundle/deploy/metadata/service/heartbeat.go new file mode 100644 index 0000000000..d32e0a24f0 --- /dev/null +++ b/bundle/deploy/metadata/service/heartbeat.go @@ -0,0 +1,37 @@ +package service + +import ( + "context" + "time" + + "github.com/databricks/cli/libs/log" +) + +const DefaultHeartbeatInterval = 2 * time.Minute + +// StartHeartbeat starts a background goroutine that sends heartbeats to keep +// the deployment lock alive. Returns a cancel function to stop the heartbeat. +func StartHeartbeat(ctx context.Context, client *Client, deploymentID, versionID string, interval time.Duration) context.CancelFunc { + ctx, cancel := context.WithCancel(ctx) + + go func() { + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + _, err := client.Heartbeat(ctx, deploymentID, versionID) + if err != nil { + log.Warnf(ctx, "Failed to send deployment heartbeat: %v", err) + } else { + log.Debugf(ctx, "Deployment heartbeat sent for deployment=%s version=%s", deploymentID, versionID) + } + } + } + }() + + return cancel +} diff --git a/bundle/deploy/metadata/service/types.go b/bundle/deploy/metadata/service/types.go new file mode 100644 index 0000000000..05e0cf03b1 --- /dev/null +++ b/bundle/deploy/metadata/service/types.go @@ -0,0 +1,189 @@ +package service + +import "time" + +// Enum types matching the proto definitions. + +type DeploymentStatus int +type VersionStatus int +type VersionComplete int +type VersionType int +type OperationStatus int +type OperationActionType int +type DeploymentResourceType int + +const ( + DeploymentStatusUnspecified DeploymentStatus = 0 + DeploymentStatusActive DeploymentStatus = 1 + DeploymentStatusFailed DeploymentStatus = 2 + DeploymentStatusInProgress DeploymentStatus = 3 + DeploymentStatusDeleted DeploymentStatus = 4 +) + +const ( + VersionStatusUnspecified VersionStatus = 0 + VersionStatusInProgress VersionStatus = 1 + VersionStatusCompleted VersionStatus = 2 +) + +const ( + VersionCompleteUnspecified VersionComplete = 0 + VersionCompleteSuccess VersionComplete = 1 + VersionCompleteFailure VersionComplete = 2 + VersionCompleteForceAbort VersionComplete = 3 + VersionCompleteLeaseExpire VersionComplete = 4 +) + +const ( + VersionTypeUnspecified VersionType = 0 + VersionTypeDeploy VersionType = 1 + VersionTypeDestroy VersionType = 2 +) + +const ( + OperationStatusUnspecified OperationStatus = 0 + OperationStatusSucceeded OperationStatus = 1 + OperationStatusFailed OperationStatus = 2 +) + +const ( + OperationActionTypeUnspecified OperationActionType = 0 + OperationActionTypeResize OperationActionType = 1 + OperationActionTypeUpdate OperationActionType = 2 + OperationActionTypeUpdateWithID OperationActionType = 3 + OperationActionTypeCreate OperationActionType = 4 + OperationActionTypeRecreate OperationActionType = 5 + OperationActionTypeDelete OperationActionType = 6 + OperationActionTypeBind OperationActionType = 7 + OperationActionTypeBindAndUpdate OperationActionType = 8 + OperationActionTypeInitRegister OperationActionType = 9 +) + +const ( + ResourceTypeUnspecified DeploymentResourceType = 0 + ResourceTypeJob DeploymentResourceType = 1 + ResourceTypePipeline DeploymentResourceType = 2 + ResourceTypeModel DeploymentResourceType = 4 + ResourceTypeRegisteredModel DeploymentResourceType = 5 + ResourceTypeExperiment DeploymentResourceType = 6 + ResourceTypeServingEndpoint DeploymentResourceType = 7 + ResourceTypeQualityMonitor DeploymentResourceType = 8 + ResourceTypeSchema DeploymentResourceType = 9 + ResourceTypeVolume DeploymentResourceType = 10 + ResourceTypeCluster DeploymentResourceType = 11 + ResourceTypeDashboard DeploymentResourceType = 12 + ResourceTypeApp DeploymentResourceType = 13 + ResourceTypeCatalog DeploymentResourceType = 14 + ResourceTypeExternalLocation DeploymentResourceType = 15 + ResourceTypeSecretScope DeploymentResourceType = 16 + ResourceTypeAlert DeploymentResourceType = 17 + ResourceTypeSQLWarehouse DeploymentResourceType = 18 + ResourceTypeDatabaseInstance DeploymentResourceType = 19 + ResourceTypeDatabaseCatalog DeploymentResourceType = 20 + ResourceTypeSyncedDBTable DeploymentResourceType = 21 + ResourceTypePostgresProject DeploymentResourceType = 22 + ResourceTypePostgresBranch DeploymentResourceType = 23 + ResourceTypePostgresEndpoint DeploymentResourceType = 24 +) + +// Deployment represents a bundle deployment registered with the control plane. +type Deployment struct { + Name string `json:"name,omitempty"` + DisplayName string `json:"display_name,omitempty"` + TargetName string `json:"target_name,omitempty"` + Status DeploymentStatus `json:"status,omitempty"` + LastVersionID string `json:"last_version_id,omitempty"` + CreatedBy string `json:"created_by,omitempty"` + CreateTime *time.Time `json:"create_time,omitempty"` + UpdateTime *time.Time `json:"update_time,omitempty"` + DestroyTime *time.Time `json:"destroy_time,omitempty"` + DestroyedBy string `json:"destroyed_by,omitempty"` +} + +// Version represents a single invocation of deploy/destroy against a deployment. +type Version struct { + Name string `json:"name,omitempty"` + VersionID string `json:"version_id,omitempty"` + CreatedBy string `json:"created_by,omitempty"` + CreateTime *time.Time `json:"create_time,omitempty"` + CompleteTime *time.Time `json:"complete_time,omitempty"` + CliVersion string `json:"cli_version,omitempty"` + Status VersionStatus `json:"status,omitempty"` + VersionType VersionType `json:"version_type,omitempty"` + CompletionReason VersionComplete `json:"completion_reason,omitempty"` + CompletedBy string `json:"completed_by,omitempty"` + DisplayName string `json:"display_name,omitempty"` + TargetName string `json:"target_name,omitempty"` +} + +// Operation records the result of applying a resource change. +type Operation struct { + Name string `json:"name,omitempty"` + ResourceKey string `json:"resource_key,omitempty"` + ActionType OperationActionType `json:"action_type,omitempty"` + State any `json:"state,omitempty"` + ResourceID string `json:"resource_id,omitempty"` + CreateTime *time.Time `json:"create_time,omitempty"` + Status OperationStatus `json:"status,omitempty"` + ErrorMessage string `json:"error_message,omitempty"` +} + +// Resource represents a resource managed by a deployment. +type Resource struct { + Name string `json:"name,omitempty"` + ResourceKey string `json:"resource_key,omitempty"` + State any `json:"state,omitempty"` + ResourceID string `json:"resource_id,omitempty"` + LastActionType OperationActionType `json:"last_action_type,omitempty"` + LastVersionID string `json:"last_version_id,omitempty"` + ResourceType DeploymentResourceType `json:"resource_type,omitempty"` +} + +// Request/Response types. + +type CreateDeploymentRequest struct { + DeploymentID string `json:"deployment_id"` + Deployment *Deployment `json:"deployment"` +} + +type ListDeploymentsResponse struct { + Deployments []Deployment `json:"deployments"` + NextPageToken string `json:"next_page_token,omitempty"` +} + +type CreateVersionRequest struct { + Parent string `json:"parent"` + Version *Version `json:"version"` + VersionID string `json:"version_id"` +} + +type ListVersionsResponse struct { + Versions []Version `json:"versions"` + NextPageToken string `json:"next_page_token,omitempty"` +} + +type HeartbeatResponse struct { + ExpireTime *time.Time `json:"expire_time,omitempty"` +} + +type CompleteVersionRequest struct { + Name string `json:"name"` + CompletionReason VersionComplete `json:"completion_reason"` + Force bool `json:"force,omitempty"` +} + +type CreateOperationRequest struct { + Parent string `json:"parent"` + ResourceKey string `json:"resource_key"` + Operation *Operation `json:"operation"` +} + +type ListOperationsResponse struct { + Operations []Operation `json:"operations"` + NextPageToken string `json:"next_page_token,omitempty"` +} + +type ListResourcesResponse struct { + Resources []Resource `json:"resources"` + NextPageToken string `json:"next_page_token,omitempty"` +} diff --git a/bundle/deploy/state_update.go b/bundle/deploy/state_update.go index 55cf2393bf..06326c8a93 100644 --- a/bundle/deploy/state_update.go +++ b/bundle/deploy/state_update.go @@ -81,6 +81,12 @@ func StateUpdate() bundle.Mutator { return &stateUpdate{} } +// LoadState loads the deployment state from the local cache directory. +// If no state file exists, a new default DeploymentState is returned. +func LoadState(ctx context.Context, b *bundle.Bundle) (*DeploymentState, error) { + return load(ctx, b) +} + func load(ctx context.Context, b *bundle.Bundle) (*DeploymentState, error) { // If the file does not exist, return a new DeploymentState. statePath, err := getPathToStateFile(ctx, b) diff --git a/bundle/env/deployment_metadata.go b/bundle/env/deployment_metadata.go new file mode 100644 index 0000000000..60e896c045 --- /dev/null +++ b/bundle/env/deployment_metadata.go @@ -0,0 +1,15 @@ +package env + +import "context" + +// deploymentServiceVariable names the environment variable that controls whether the +// deployment metadata service is used for locking and resource state management. +const deploymentServiceVariable = "DATABRICKS_BUNDLE_DEPLOYMENT_SERVICE" + +// DeploymentService returns the environment variable that controls whether the +// deployment metadata service is used for locking and resource state management. +func DeploymentService(ctx context.Context) (string, bool) { + return get(ctx, []string{ + deploymentServiceVariable, + }) +} diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 4613a7a211..7a1fa6e778 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -9,6 +9,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/deploy" + "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" "github.com/databricks/cli/bundle/deploy/metadata" @@ -139,6 +140,11 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li // The deploy phase deploys artifacts and resources. // If readPlanPath is provided, the plan is loaded from that file instead of being calculated. func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, engine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { + if v, _ := env.DeploymentService(ctx); v == "true" { + deployWithMetadataService(ctx, b, outputHandler, engine, libs, plan) + return + } + log.Info(ctx, "Phase: deploy") // Core mutators that CRUD resources and modify deployment state. These diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go new file mode 100644 index 0000000000..bbe1197b5b --- /dev/null +++ b/bundle/phases/deploy_metadata.go @@ -0,0 +1,269 @@ +package phases + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/artifacts" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/deploy" + "github.com/databricks/cli/bundle/deploy/files" + "github.com/databricks/cli/bundle/deploy/metadata" + metadataservice "github.com/databricks/cli/bundle/deploy/metadata/service" + "github.com/databricks/cli/bundle/deploy/terraform" + "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/libraries" + "github.com/databricks/cli/bundle/metrics" + "github.com/databricks/cli/bundle/permissions" + "github.com/databricks/cli/bundle/scripts" + "github.com/databricks/cli/bundle/statemgmt" + "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/logdiag" + "github.com/databricks/cli/libs/sync" + "github.com/databricks/databricks-sdk-go/apierr" + "github.com/google/uuid" +) + +func deployWithMetadataService(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, targetEngine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { + log.Info(ctx, "Phase: deploy (with metadata service)") + + bundle.ApplyContext(ctx, b, scripts.Execute(config.ScriptPreDeploy)) + if logdiag.HasError(ctx) { + return + } + + // Create the metadata service client. + svc, err := metadataservice.NewClient(b.WorkspaceClient()) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) + return + } + + // Load local deployment state to get the deployment ID and sequence number. + state, err := deploy.LoadState(ctx, b) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to load deployment state: %w", err)) + return + } + + // Generate a deployment ID if one doesn't exist yet. + if state.ID == uuid.Nil { + state.ID = uuid.New() + } + deploymentID := state.ID.String() + + // Ensure the deployment exists in the metadata service. + _, err = svc.CreateDeployment(ctx, deploymentID, &metadataservice.Deployment{ + TargetName: b.Config.Bundle.Target, + }) + if err != nil && !isAlreadyExists(err) { + logdiag.LogError(ctx, fmt.Errorf("failed to create deployment: %w", err)) + return + } + + // Create a version to acquire the deployment lock. + versionID := fmt.Sprintf("%d", state.Seq+1) + version, err := svc.CreateVersion(ctx, deploymentID, versionID, &metadataservice.Version{ + CliVersion: build.GetInfo().Version, + VersionType: metadataservice.VersionTypeDeploy, + TargetName: b.Config.Bundle.Target, + }) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to acquire deployment lock: %w", err)) + return + } + + log.Infof(ctx, "Acquired deployment lock: deployment=%s version=%s", deploymentID, version.VersionID) + + // Start heartbeat to keep the lock alive. + stopHeartbeat := metadataservice.StartHeartbeat(ctx, svc, deploymentID, versionID, metadataservice.DefaultHeartbeatInterval) + + // Ensure we always complete the version (release the lock) and stop heartbeat. + var deployFailed bool + defer func() { + stopHeartbeat() + + reason := metadataservice.VersionCompleteSuccess + if deployFailed || logdiag.HasError(ctx) { + reason = metadataservice.VersionCompleteFailure + } + + _, completeErr := svc.CompleteVersion(ctx, deploymentID, versionID, reason, false) + if completeErr != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) + } else { + log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%d", deploymentID, versionID, reason) + } + }() + + // Upload libraries. + bundle.ApplySeqContext(ctx, b, + artifacts.CleanUp(), + libraries.Upload(libs), + ) + if logdiag.HasError(ctx) { + deployFailed = true + return + } + + // Upload files, update state, apply permissions. + bundle.ApplySeqContext(ctx, b, + files.Upload(outputHandler), + deploy.StateUpdate(), + deploy.StatePush(), + permissions.ApplyWorkspaceRootPermissions(), + metrics.TrackUsedCompute(), + deploy.ResourcePathMkdir(), + ) + if logdiag.HasError(ctx) { + deployFailed = true + return + } + + // Calculate or load the deploy plan. + if plan != nil { + _, localPath := b.StateFilenameDirect(ctx) + err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(), localPath, plan) + if err != nil { + logdiag.LogError(ctx, err) + deployFailed = true + return + } + } else { + plan = RunPlan(ctx, b, targetEngine) + } + if logdiag.HasError(ctx) { + deployFailed = true + return + } + + // Seek approval for potentially destructive changes. + haveApproval, err := approvalForDeploy(ctx, b, plan) + if err != nil { + logdiag.LogError(ctx, err) + deployFailed = true + return + } + if !haveApproval { + cmdio.LogString(ctx, "Deployment cancelled!") + return + } + + // Apply the deployment. + deployCoreWithMetadata(ctx, b, plan, targetEngine, svc, deploymentID, versionID) + if logdiag.HasError(ctx) { + deployFailed = true + return + } + + logDeployTelemetry(ctx, b) + bundle.ApplyContext(ctx, b, scripts.Execute(config.ScriptPostDeploy)) +} + +// deployCoreWithMetadata applies the deployment plan and reports operations to +// the metadata service. +func deployCoreWithMetadata(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType, svc *metadataservice.Client, deploymentID, versionID string) { + cmdio.LogString(ctx, "Deploying resources...") + + if targetEngine.IsDirect() { + b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) + } else { + bundle.ApplyContext(ctx, b, terraform.Apply()) + } + + // Push resource state even on failure. + statemgmt.PushResourcesState(ctx, b, targetEngine) + + // Report operations to the metadata service (best-effort). + reportOperations(ctx, b, svc, deploymentID, versionID, plan) + + if logdiag.HasError(ctx) { + return + } + + bundle.ApplySeqContext(ctx, b, + statemgmt.Load(targetEngine), + metadata.Compute(), + metadata.Upload(), + statemgmt.UploadStateForYamlSync(targetEngine), + ) + + if !logdiag.HasError(ctx) { + cmdio.LogString(ctx, "Deployment complete!") + } +} + +// reportOperations reports each resource operation to the metadata service. +// This is best-effort: failures are logged as warnings, not fatal errors. +func reportOperations(ctx context.Context, b *bundle.Bundle, svc *metadataservice.Client, deploymentID, versionID string, plan *deployplan.Plan) { + if plan == nil { + return + } + + // Fetch existing resources to determine if this is the first time we're + // tracking each resource in the metadata service. + knownResources := map[string]bool{} + existing, err := svc.ListResources(ctx, deploymentID) + if err != nil { + log.Warnf(ctx, "Failed to list existing resources from metadata service, will use INITIAL_REGISTER for all: %v", err) + } else { + for _, r := range existing { + knownResources[r.ResourceKey] = true + } + } + + for resourceKey, entry := range plan.Plan { + var actionType metadataservice.OperationActionType + if knownResources[resourceKey] { + // Resource is already tracked; use the plan's action type. + actionType = planActionToOperationAction(entry.Action) + } else { + // First time tracking this resource in the service. + actionType = metadataservice.OperationActionTypeInitRegister + } + + if actionType == metadataservice.OperationActionTypeUnspecified { + continue + } + + _, err := svc.CreateOperation(ctx, deploymentID, versionID, resourceKey, &metadataservice.Operation{ + ResourceKey: resourceKey, + Status: metadataservice.OperationStatusSucceeded, + ActionType: actionType, + }) + if err != nil { + log.Warnf(ctx, "Failed to report operation for resource %s: %v", resourceKey, err) + } + } +} + +func planActionToOperationAction(action deployplan.ActionType) metadataservice.OperationActionType { + switch action { + case deployplan.Create: + return metadataservice.OperationActionTypeCreate + case deployplan.Update: + return metadataservice.OperationActionTypeUpdate + case deployplan.Delete: + return metadataservice.OperationActionTypeDelete + case deployplan.Recreate: + return metadataservice.OperationActionTypeRecreate + default: + return metadataservice.OperationActionTypeUnspecified + } +} + +// isAlreadyExists checks if an error indicates the resource already exists (HTTP 409). +func isAlreadyExists(err error) bool { + var apiErr *apierr.APIError + if errors.As(err, &apiErr) && apiErr.StatusCode == http.StatusConflict { + return true + } + return false +} diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index e6be00b579..374b533ca7 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -8,6 +8,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/mutator" + "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" "github.com/databricks/cli/bundle/deploy/terraform" @@ -115,6 +116,11 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e // The destroy phase deletes artifacts and resources. func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { + if v, _ := env.DeploymentService(ctx); v == "true" { + destroyWithMetadataService(ctx, b, engine) + return + } + log.Info(ctx, "Phase: destroy") ok, err := assertRootPathExists(ctx, b) diff --git a/bundle/phases/destroy_metadata.go b/bundle/phases/destroy_metadata.go new file mode 100644 index 0000000000..acb776d540 --- /dev/null +++ b/bundle/phases/destroy_metadata.go @@ -0,0 +1,169 @@ +package phases + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/config/mutator" + "github.com/databricks/cli/bundle/deploy" + "github.com/databricks/cli/bundle/deploy/files" + metadataservice "github.com/databricks/cli/bundle/deploy/metadata/service" + "github.com/databricks/cli/bundle/deploy/terraform" + "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/logdiag" + "github.com/databricks/databricks-sdk-go/apierr" +) + +func destroyWithMetadataService(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineType) { + log.Info(ctx, "Phase: destroy (with metadata service)") + + ok, err := assertRootPathExists(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return + } + if !ok { + cmdio.LogString(ctx, "No active deployment found to destroy!") + return + } + + // Create the metadata service client. + svc, err := metadataservice.NewClient(b.WorkspaceClient()) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) + return + } + + // Load local deployment state to get the deployment ID and sequence number. + state, err := deploy.LoadState(ctx, b) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to load deployment state: %w", err)) + return + } + + deploymentID := state.ID.String() + + // Check that the deployment exists. + _, err = svc.GetDeployment(ctx, deploymentID) + if err != nil { + var apiErr *apierr.APIError + if errors.As(err, &apiErr) && apiErr.StatusCode == http.StatusNotFound { + log.Infof(ctx, "No deployment found in metadata service for %s, nothing to destroy", deploymentID) + cmdio.LogString(ctx, "No active deployment found to destroy!") + return + } + logdiag.LogError(ctx, fmt.Errorf("failed to get deployment: %w", err)) + return + } + + // Create a version to acquire the deployment lock. + versionID := fmt.Sprintf("%d", state.Seq+1) + _, err = svc.CreateVersion(ctx, deploymentID, versionID, &metadataservice.Version{ + CliVersion: build.GetInfo().Version, + VersionType: metadataservice.VersionTypeDestroy, + TargetName: b.Config.Bundle.Target, + }) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to acquire deployment lock: %w", err)) + return + } + + log.Infof(ctx, "Acquired deployment lock for destroy: deployment=%s version=%s", deploymentID, versionID) + + // Start heartbeat to keep the lock alive. + stopHeartbeat := metadataservice.StartHeartbeat(ctx, svc, deploymentID, versionID, metadataservice.DefaultHeartbeatInterval) + + var destroyFailed bool + defer func() { + stopHeartbeat() + + reason := metadataservice.VersionCompleteSuccess + if destroyFailed || logdiag.HasError(ctx) { + reason = metadataservice.VersionCompleteFailure + } + + _, completeErr := svc.CompleteVersion(ctx, deploymentID, versionID, reason, false) + if completeErr != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) + } else { + log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%d", deploymentID, versionID, reason) + } + }() + + // Calculate the destroy plan. + if !targetEngine.IsDirect() { + bundle.ApplySeqContext(ctx, b, + mutator.ResolveVariableReferencesWithoutResources("artifacts"), + mutator.ResolveVariableReferencesOnlyResources("artifacts"), + terraform.Interpolate(), + terraform.Write(), + terraform.Plan(terraform.PlanGoal("destroy")), + ) + } + + if logdiag.HasError(ctx) { + destroyFailed = true + return + } + + var plan *deployplan.Plan + if targetEngine.IsDirect() { + _, localPath := b.StateFilenameDirect(ctx) + plan, err = b.DeploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(), nil, localPath) + if err != nil { + logdiag.LogError(ctx, err) + destroyFailed = true + return + } + } else { + tf := b.Terraform + if tf == nil { + logdiag.LogError(ctx, fmt.Errorf("terraform not initialized")) + destroyFailed = true + return + } + + plan, err = terraform.ShowPlanFile(ctx, tf, b.TerraformPlanPath) + if err != nil { + logdiag.LogError(ctx, err) + destroyFailed = true + return + } + } + + hasApproval, err := approvalForDestroy(ctx, b, plan) + if err != nil { + logdiag.LogError(ctx, err) + destroyFailed = true + return + } + + if hasApproval { + if targetEngine.IsDirect() { + b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) + } else { + bundle.ApplyContext(ctx, b, terraform.Apply()) + } + + if logdiag.HasError(ctx) { + destroyFailed = true + return + } + + bundle.ApplyContext(ctx, b, files.Delete()) + + if !logdiag.HasError(ctx) { + cmdio.LogString(ctx, "Destroy complete!") + } + } else { + cmdio.LogString(ctx, "Destroy cancelled!") + } +} From 9d055f4f1804260918ed553e6e5262933c6c13b2 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Mon, 30 Mar 2026 17:41:35 +0000 Subject: [PATCH 02/25] Fix correctness bugs and improve code quality from self-review - Use background context with timeout for CompleteVersion in defer blocks, so the lock is released even if the parent context is cancelled (e.g. Ctrl+C) - Add nil state.ID guard in destroy to avoid querying with zero UUID - Fix misleading --force-lock error message to explain lock expiry behavior - Fix import ordering Co-authored-by: Isaac --- bundle/deploy/metadata/service/client.go | 6 +++--- bundle/phases/deploy_metadata.go | 8 +++++++- bundle/phases/destroy_metadata.go | 13 ++++++++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/bundle/deploy/metadata/service/client.go b/bundle/deploy/metadata/service/client.go index ffe2fb36fc..df25bb39ce 100644 --- a/bundle/deploy/metadata/service/client.go +++ b/bundle/deploy/metadata/service/client.go @@ -2,11 +2,10 @@ package service import ( "context" + "errors" "fmt" "net/http" - "errors" - "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/apierr" "github.com/databricks/databricks-sdk-go/client" @@ -172,7 +171,8 @@ func mapError(operation string, err error) error { switch apiErr.StatusCode { case http.StatusConflict: return fmt.Errorf("%s: deployment is locked by another active deployment. "+ - "Use --force-lock to override", operation) + "If the prior deployment failed, the lock will expire automatically after 5 minutes. "+ + "You can also force-acquire the lock by running deploy with the --force-lock flag", operation) case http.StatusNotFound: return fmt.Errorf("%s: resource not found: %w", operation, err) case http.StatusBadRequest: diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go index bbe1197b5b..e9f41a56cf 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/phases/deploy_metadata.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "net/http" + "time" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/artifacts" @@ -95,7 +96,12 @@ func deployWithMetadataService(ctx context.Context, b *bundle.Bundle, outputHand reason = metadataservice.VersionCompleteFailure } - _, completeErr := svc.CompleteVersion(ctx, deploymentID, versionID, reason, false) + // Use a separate context for cleanup so the lock is released even if the + // parent context was cancelled (e.g. user hit Ctrl+C). + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + _, completeErr := svc.CompleteVersion(cleanupCtx, deploymentID, versionID, reason, false) if completeErr != nil { log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) } else { diff --git a/bundle/phases/destroy_metadata.go b/bundle/phases/destroy_metadata.go index acb776d540..d7992e72cc 100644 --- a/bundle/phases/destroy_metadata.go +++ b/bundle/phases/destroy_metadata.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "net/http" + "time" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config/engine" @@ -20,6 +21,7 @@ import ( "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" "github.com/databricks/databricks-sdk-go/apierr" + "github.com/google/uuid" ) func destroyWithMetadataService(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineType) { @@ -49,6 +51,10 @@ func destroyWithMetadataService(ctx context.Context, b *bundle.Bundle, targetEng return } + if state.ID == uuid.Nil { + cmdio.LogString(ctx, "No active deployment found to destroy!") + return + } deploymentID := state.ID.String() // Check that the deployment exists. @@ -90,7 +96,12 @@ func destroyWithMetadataService(ctx context.Context, b *bundle.Bundle, targetEng reason = metadataservice.VersionCompleteFailure } - _, completeErr := svc.CompleteVersion(ctx, deploymentID, versionID, reason, false) + // Use a separate context for cleanup so the lock is released even if the + // parent context was cancelled (e.g. user hit Ctrl+C). + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + _, completeErr := svc.CompleteVersion(cleanupCtx, deploymentID, versionID, reason, false) if completeErr != nil { log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) } else { From 342fef82db47a5521bfd570a8bc5731190cdb805 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Mon, 30 Mar 2026 19:57:23 +0000 Subject: [PATCH 03/25] Refactor to SDK-style tempdms package and unify deploy/destroy flows Move the deployment metadata service client from bundle/deploy/metadata/service to libs/tempdms with SDK-style method signatures (single request struct param). When the protos land in the Go SDK, migration is just an import path change. Unify deploy and destroy flows: instead of separate *WithMetadataService functions that duplicated all mutator calls, the core logic stays in Deploy() and Destroy() with conditional lock management based on the env var. Co-authored-by: Isaac --- bundle/deploy/metadata/service/client.go | 183 -------- bundle/phases/deploy.go | 102 +++-- bundle/phases/deploy_metadata.go | 234 +++------- bundle/phases/destroy.go | 70 ++- bundle/phases/destroy_metadata.go | 181 +------- .../metadata/service => phases}/heartbeat.go | 14 +- libs/tempdms/api.go | 164 +++++++ .../service => libs/tempdms}/types.go | 146 ++++--- libs/testserver/deployment_metadata.go | 400 ++++++++++++++++++ libs/testserver/fake_workspace.go | 3 + libs/testserver/handlers.go | 38 ++ 11 files changed, 893 insertions(+), 642 deletions(-) delete mode 100644 bundle/deploy/metadata/service/client.go rename bundle/{deploy/metadata/service => phases}/heartbeat.go (56%) create mode 100644 libs/tempdms/api.go rename {bundle/deploy/metadata/service => libs/tempdms}/types.go (58%) create mode 100644 libs/testserver/deployment_metadata.go diff --git a/bundle/deploy/metadata/service/client.go b/bundle/deploy/metadata/service/client.go deleted file mode 100644 index df25bb39ce..0000000000 --- a/bundle/deploy/metadata/service/client.go +++ /dev/null @@ -1,183 +0,0 @@ -package service - -import ( - "context" - "errors" - "fmt" - "net/http" - - "github.com/databricks/databricks-sdk-go" - "github.com/databricks/databricks-sdk-go/apierr" - "github.com/databricks/databricks-sdk-go/client" -) - -const basePath = "/api/2.0/bundle" - -// Client wraps the Databricks API client for the deployment metadata service. -type Client struct { - api *client.DatabricksClient -} - -// NewClient creates a new deployment metadata service client from a workspace client. -func NewClient(w *databricks.WorkspaceClient) (*Client, error) { - apiClient, err := client.New(w.Config) - if err != nil { - return nil, fmt.Errorf("failed to create deployment metadata API client: %w", err) - } - return &Client{api: apiClient}, nil -} - -// CreateDeployment creates a new deployment. -func (c *Client) CreateDeployment(ctx context.Context, deploymentID string, deployment *Deployment) (*Deployment, error) { - resp := &Deployment{} - path := fmt.Sprintf("%s/deployments", basePath) - err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CreateDeploymentRequest{ - DeploymentID: deploymentID, - Deployment: deployment, - }, resp) - if err != nil { - return nil, mapError("create deployment", err) - } - return resp, nil -} - -// GetDeployment retrieves a deployment by ID. -func (c *Client) GetDeployment(ctx context.Context, deploymentID string) (*Deployment, error) { - resp := &Deployment{} - path := fmt.Sprintf("%s/deployments/%s", basePath, deploymentID) - err := c.api.Do(ctx, http.MethodGet, path, nil, nil, nil, resp) - if err != nil { - return nil, mapError("get deployment", err) - } - return resp, nil -} - -// DeleteDeployment soft-deletes a deployment. -func (c *Client) DeleteDeployment(ctx context.Context, deploymentID string) error { - path := fmt.Sprintf("%s/deployments/%s", basePath, deploymentID) - err := c.api.Do(ctx, http.MethodDelete, path, nil, nil, nil, nil) - if err != nil { - return mapError("delete deployment", err) - } - return nil -} - -// CreateVersion creates a new version (acquires the deployment lock). -func (c *Client) CreateVersion(ctx context.Context, deploymentID string, versionID string, version *Version) (*Version, error) { - resp := &Version{} - path := fmt.Sprintf("%s/deployments/%s/versions", basePath, deploymentID) - err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CreateVersionRequest{ - Parent: fmt.Sprintf("deployments/%s", deploymentID), - Version: version, - VersionID: versionID, - }, resp) - if err != nil { - return nil, mapError("create version", err) - } - return resp, nil -} - -// GetVersion retrieves a version. -func (c *Client) GetVersion(ctx context.Context, deploymentID, versionID string) (*Version, error) { - resp := &Version{} - path := fmt.Sprintf("%s/deployments/%s/versions/%s", basePath, deploymentID, versionID) - err := c.api.Do(ctx, http.MethodGet, path, nil, nil, nil, resp) - if err != nil { - return nil, mapError("get version", err) - } - return resp, nil -} - -// Heartbeat renews the lock lease for an in-progress version. -func (c *Client) Heartbeat(ctx context.Context, deploymentID, versionID string) (*HeartbeatResponse, error) { - resp := &HeartbeatResponse{} - path := fmt.Sprintf("%s/deployments/%s/versions/%s/heartbeat", basePath, deploymentID, versionID) - err := c.api.Do(ctx, http.MethodPost, path, nil, nil, struct{}{}, resp) - if err != nil { - return nil, mapError("heartbeat", err) - } - return resp, nil -} - -// CompleteVersion marks a version as completed (releases the deployment lock). -func (c *Client) CompleteVersion(ctx context.Context, deploymentID, versionID string, reason VersionComplete, force bool) (*Version, error) { - resp := &Version{} - path := fmt.Sprintf("%s/deployments/%s/versions/%s/complete", basePath, deploymentID, versionID) - err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CompleteVersionRequest{ - Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), - CompletionReason: reason, - Force: force, - }, resp) - if err != nil { - return nil, mapError("complete version", err) - } - return resp, nil -} - -// CreateOperation records a resource operation for a version. -func (c *Client) CreateOperation(ctx context.Context, deploymentID, versionID, resourceKey string, operation *Operation) (*Operation, error) { - resp := &Operation{} - path := fmt.Sprintf("%s/deployments/%s/versions/%s/operations", basePath, deploymentID, versionID) - err := c.api.Do(ctx, http.MethodPost, path, nil, nil, CreateOperationRequest{ - Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), - ResourceKey: resourceKey, - Operation: operation, - }, resp) - if err != nil { - return nil, mapError("create operation", err) - } - return resp, nil -} - -// ListResources lists all resources for a deployment. -func (c *Client) ListResources(ctx context.Context, deploymentID string) ([]Resource, error) { - var allResources []Resource - pageToken := "" - - for { - resp := &ListResourcesResponse{} - path := fmt.Sprintf("%s/deployments/%s/resources", basePath, deploymentID) - - q := map[string]any{ - "parent": fmt.Sprintf("deployments/%s", deploymentID), - "page_size": 1000, - } - if pageToken != "" { - q["page_token"] = pageToken - } - - err := c.api.Do(ctx, http.MethodGet, path, nil, q, nil, resp) - if err != nil { - return nil, mapError("list resources", err) - } - - allResources = append(allResources, resp.Resources...) - if resp.NextPageToken == "" { - break - } - pageToken = resp.NextPageToken - } - - return allResources, nil -} - -// mapError translates API errors into user-friendly messages. -func mapError(operation string, err error) error { - var apiErr *apierr.APIError - if !errors.As(err, &apiErr) { - return fmt.Errorf("%s: %w", operation, err) - } - - switch apiErr.StatusCode { - case http.StatusConflict: - return fmt.Errorf("%s: deployment is locked by another active deployment. "+ - "If the prior deployment failed, the lock will expire automatically after 5 minutes. "+ - "You can also force-acquire the lock by running deploy with the --force-lock flag", operation) - case http.StatusNotFound: - return fmt.Errorf("%s: resource not found: %w", operation, err) - case http.StatusBadRequest: - return fmt.Errorf("%s: bad request: %s", operation, apiErr.Message) - default: - return fmt.Errorf("%s: %w", operation, err) - } -} diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 7a1fa6e778..5ca8745f06 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -3,19 +3,20 @@ package phases import ( "context" "errors" + "fmt" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/artifacts" "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/deploy" - "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" "github.com/databricks/cli/bundle/deploy/metadata" "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/bundle/metrics" "github.com/databricks/cli/bundle/permissions" @@ -25,6 +26,7 @@ import ( "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" "github.com/databricks/cli/libs/sync" + "github.com/databricks/cli/libs/tempdms" ) func approvalForDeploy(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) (bool, error) { @@ -98,9 +100,11 @@ func approvalForDeploy(ctx context.Context, b *bundle.Bundle, plan *deployplan.P return approved, nil } -func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType) { - // Core mutators that CRUD resources and modify deployment state. These - // mutators need informed consent if they are potentially destructive. +// postApplyHook is called after the deployment plan is applied (terraform/direct Apply). +// It can be used for additional state reporting (e.g. to the metadata service). +type postApplyHook func(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) + +func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType, hook postApplyHook) { cmdio.LogString(ctx, "Deploying resources...") if targetEngine.IsDirect() { @@ -109,8 +113,14 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } - // Even if deployment failed, there might be updates in states that we need to upload + // Even if deployment failed, there might be updates in states that we need to upload. statemgmt.PushResourcesState(ctx, b, targetEngine) + + // Run any additional post-apply logic (e.g. metadata service operation reporting). + if hook != nil { + hook(ctx, b, plan) + } + if logdiag.HasError(ctx) { return } @@ -139,33 +149,55 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li // The deploy phase deploys artifacts and resources. // If readPlanPath is provided, the plan is loaded from that file instead of being calculated. -func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, engine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { - if v, _ := env.DeploymentService(ctx); v == "true" { - deployWithMetadataService(ctx, b, outputHandler, engine, libs, plan) - return - } +func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, targetEngine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { + useMetadataService, _ := env.DeploymentService(ctx) - log.Info(ctx, "Phase: deploy") - - // Core mutators that CRUD resources and modify deployment state. These - // mutators need informed consent if they are potentially destructive. - bundle.ApplySeqContext(ctx, b, - scripts.Execute(config.ScriptPreDeploy), - lock.Acquire(), - ) + if useMetadataService == "true" { + log.Info(ctx, "Phase: deploy (with metadata service)") + } else { + log.Info(ctx, "Phase: deploy") + } + bundle.ApplyContext(ctx, b, scripts.Execute(config.ScriptPreDeploy)) if logdiag.HasError(ctx) { - // lock is not acquired here return } - // lock is acquired here - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) - }() + // Acquire the deployment lock. + var svc *tempdms.DeploymentMetadataAPI + var deploymentID, versionID string + var failed bool + + if useMetadataService == "true" { + var err error + svc, err = tempdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) + return + } + + var cleanup func(failed bool) + deploymentID, versionID, cleanup, err = deployMetadataLock(ctx, b, svc, tempdms.VersionTypeDeploy) + if err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + cleanup(failed || logdiag.HasError(ctx)) + }() + } else { + bundle.ApplyContext(ctx, b, lock.Acquire()) + if logdiag.HasError(ctx) { + return + } + defer func() { + bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) + }() + } uploadLibraries(ctx, b, libs) if logdiag.HasError(ctx) { + failed = true return } @@ -177,40 +209,50 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand metrics.TrackUsedCompute(), deploy.ResourcePathMkdir(), ) - if logdiag.HasError(ctx) { + failed = true return } if plan != nil { - // Initialize DeploymentBundle for applying the loaded plan + // Initialize DeploymentBundle for applying the loaded plan. _, localPath := b.StateFilenameDirect(ctx) err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(), localPath, plan) if err != nil { logdiag.LogError(ctx, err) + failed = true return } } else { - plan = RunPlan(ctx, b, engine) + plan = RunPlan(ctx, b, targetEngine) } - if logdiag.HasError(ctx) { + failed = true return } haveApproval, err := approvalForDeploy(ctx, b, plan) if err != nil { logdiag.LogError(ctx, err) + failed = true return } - if haveApproval { - deployCore(ctx, b, plan, engine) - } else { + if !haveApproval { cmdio.LogString(ctx, "Deployment cancelled!") return } + // Build the post-apply hook for metadata service reporting (nil for file-based). + var hook postApplyHook + if useMetadataService == "true" { + hook = func(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) { + reportOperations(ctx, svc, deploymentID, versionID, plan) + } + } + + deployCore(ctx, b, plan, targetEngine, hook) if logdiag.HasError(ctx) { + failed = true return } diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go index e9f41a56cf..40d1d7d620 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/phases/deploy_metadata.go @@ -8,92 +8,72 @@ import ( "time" "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/artifacts" - "github.com/databricks/cli/bundle/config" - "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/deploy" - "github.com/databricks/cli/bundle/deploy/files" - "github.com/databricks/cli/bundle/deploy/metadata" - metadataservice "github.com/databricks/cli/bundle/deploy/metadata/service" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" - "github.com/databricks/cli/bundle/direct" - "github.com/databricks/cli/bundle/libraries" - "github.com/databricks/cli/bundle/metrics" - "github.com/databricks/cli/bundle/permissions" - "github.com/databricks/cli/bundle/scripts" - "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/internal/build" - "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" - "github.com/databricks/cli/libs/sync" + "github.com/databricks/cli/libs/tempdms" "github.com/databricks/databricks-sdk-go/apierr" "github.com/google/uuid" ) -func deployWithMetadataService(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, targetEngine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { - log.Info(ctx, "Phase: deploy (with metadata service)") - - bundle.ApplyContext(ctx, b, scripts.Execute(config.ScriptPreDeploy)) - if logdiag.HasError(ctx) { - return - } - - // Create the metadata service client. - svc, err := metadataservice.NewClient(b.WorkspaceClient()) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) - return - } - +// deployMetadataLock implements the lock acquire/release lifecycle using the +// deployment metadata service (CreateVersion / CompleteVersion). +// +// It returns a cleanup function that must be deferred by the caller to release +// the lock and stop the heartbeat, as well as any error from acquiring the lock. +func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tempdms.DeploymentMetadataAPI, versionType tempdms.VersionType) (deploymentID, versionID string, cleanup func(failed bool), err error) { // Load local deployment state to get the deployment ID and sequence number. - state, err := deploy.LoadState(ctx, b) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to load deployment state: %w", err)) - return + state, loadErr := deploy.LoadState(ctx, b) + if loadErr != nil { + return "", "", nil, fmt.Errorf("failed to load deployment state: %w", loadErr) } // Generate a deployment ID if one doesn't exist yet. if state.ID == uuid.Nil { state.ID = uuid.New() } - deploymentID := state.ID.String() + deploymentID = state.ID.String() // Ensure the deployment exists in the metadata service. - _, err = svc.CreateDeployment(ctx, deploymentID, &metadataservice.Deployment{ - TargetName: b.Config.Bundle.Target, + _, createErr := svc.CreateDeployment(ctx, tempdms.CreateDeploymentRequest{ + DeploymentID: deploymentID, + Deployment: &tempdms.Deployment{ + TargetName: b.Config.Bundle.Target, + }, }) - if err != nil && !isAlreadyExists(err) { - logdiag.LogError(ctx, fmt.Errorf("failed to create deployment: %w", err)) - return + if createErr != nil && !isAlreadyExists(createErr) { + return "", "", nil, fmt.Errorf("failed to create deployment: %w", createErr) } // Create a version to acquire the deployment lock. - versionID := fmt.Sprintf("%d", state.Seq+1) - version, err := svc.CreateVersion(ctx, deploymentID, versionID, &metadataservice.Version{ - CliVersion: build.GetInfo().Version, - VersionType: metadataservice.VersionTypeDeploy, - TargetName: b.Config.Bundle.Target, + versionID = fmt.Sprintf("%d", state.Seq+1) + version, versionErr := svc.CreateVersion(ctx, tempdms.CreateVersionRequest{ + DeploymentID: deploymentID, + Parent: fmt.Sprintf("deployments/%s", deploymentID), + VersionID: versionID, + Version: &tempdms.Version{ + CliVersion: build.GetInfo().Version, + VersionType: versionType, + TargetName: b.Config.Bundle.Target, + }, }) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to acquire deployment lock: %w", err)) - return + if versionErr != nil { + return "", "", nil, fmt.Errorf("failed to acquire deployment lock: %w", versionErr) } log.Infof(ctx, "Acquired deployment lock: deployment=%s version=%s", deploymentID, version.VersionID) // Start heartbeat to keep the lock alive. - stopHeartbeat := metadataservice.StartHeartbeat(ctx, svc, deploymentID, versionID, metadataservice.DefaultHeartbeatInterval) + stopHeartbeat := startHeartbeat(ctx, svc, deploymentID, versionID, defaultHeartbeatInterval) - // Ensure we always complete the version (release the lock) and stop heartbeat. - var deployFailed bool - defer func() { + cleanup = func(failed bool) { stopHeartbeat() - reason := metadataservice.VersionCompleteSuccess - if deployFailed || logdiag.HasError(ctx) { - reason = metadataservice.VersionCompleteFailure + reason := tempdms.VersionCompleteSuccess + if failed { + reason = tempdms.VersionCompleteFailure } // Use a separate context for cleanup so the lock is released even if the @@ -101,114 +81,25 @@ func deployWithMetadataService(ctx context.Context, b *bundle.Bundle, outputHand cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - _, completeErr := svc.CompleteVersion(cleanupCtx, deploymentID, versionID, reason, false) + _, completeErr := svc.CompleteVersion(cleanupCtx, tempdms.CompleteVersionRequest{ + DeploymentID: deploymentID, + VersionID: versionID, + Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + CompletionReason: reason, + }) if completeErr != nil { log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) } else { log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%d", deploymentID, versionID, reason) } - }() - - // Upload libraries. - bundle.ApplySeqContext(ctx, b, - artifacts.CleanUp(), - libraries.Upload(libs), - ) - if logdiag.HasError(ctx) { - deployFailed = true - return - } - - // Upload files, update state, apply permissions. - bundle.ApplySeqContext(ctx, b, - files.Upload(outputHandler), - deploy.StateUpdate(), - deploy.StatePush(), - permissions.ApplyWorkspaceRootPermissions(), - metrics.TrackUsedCompute(), - deploy.ResourcePathMkdir(), - ) - if logdiag.HasError(ctx) { - deployFailed = true - return } - // Calculate or load the deploy plan. - if plan != nil { - _, localPath := b.StateFilenameDirect(ctx) - err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(), localPath, plan) - if err != nil { - logdiag.LogError(ctx, err) - deployFailed = true - return - } - } else { - plan = RunPlan(ctx, b, targetEngine) - } - if logdiag.HasError(ctx) { - deployFailed = true - return - } - - // Seek approval for potentially destructive changes. - haveApproval, err := approvalForDeploy(ctx, b, plan) - if err != nil { - logdiag.LogError(ctx, err) - deployFailed = true - return - } - if !haveApproval { - cmdio.LogString(ctx, "Deployment cancelled!") - return - } - - // Apply the deployment. - deployCoreWithMetadata(ctx, b, plan, targetEngine, svc, deploymentID, versionID) - if logdiag.HasError(ctx) { - deployFailed = true - return - } - - logDeployTelemetry(ctx, b) - bundle.ApplyContext(ctx, b, scripts.Execute(config.ScriptPostDeploy)) -} - -// deployCoreWithMetadata applies the deployment plan and reports operations to -// the metadata service. -func deployCoreWithMetadata(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType, svc *metadataservice.Client, deploymentID, versionID string) { - cmdio.LogString(ctx, "Deploying resources...") - - if targetEngine.IsDirect() { - b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) - } else { - bundle.ApplyContext(ctx, b, terraform.Apply()) - } - - // Push resource state even on failure. - statemgmt.PushResourcesState(ctx, b, targetEngine) - - // Report operations to the metadata service (best-effort). - reportOperations(ctx, b, svc, deploymentID, versionID, plan) - - if logdiag.HasError(ctx) { - return - } - - bundle.ApplySeqContext(ctx, b, - statemgmt.Load(targetEngine), - metadata.Compute(), - metadata.Upload(), - statemgmt.UploadStateForYamlSync(targetEngine), - ) - - if !logdiag.HasError(ctx) { - cmdio.LogString(ctx, "Deployment complete!") - } + return deploymentID, versionID, cleanup, nil } // reportOperations reports each resource operation to the metadata service. // This is best-effort: failures are logged as warnings, not fatal errors. -func reportOperations(ctx context.Context, b *bundle.Bundle, svc *metadataservice.Client, deploymentID, versionID string, plan *deployplan.Plan) { +func reportOperations(ctx context.Context, svc *tempdms.DeploymentMetadataAPI, deploymentID, versionID string, plan *deployplan.Plan) { if plan == nil { return } @@ -216,7 +107,10 @@ func reportOperations(ctx context.Context, b *bundle.Bundle, svc *metadataservic // Fetch existing resources to determine if this is the first time we're // tracking each resource in the metadata service. knownResources := map[string]bool{} - existing, err := svc.ListResources(ctx, deploymentID) + existing, err := svc.ListResources(ctx, tempdms.ListResourcesRequest{ + DeploymentID: deploymentID, + Parent: fmt.Sprintf("deployments/%s", deploymentID), + }) if err != nil { log.Warnf(ctx, "Failed to list existing resources from metadata service, will use INITIAL_REGISTER for all: %v", err) } else { @@ -226,23 +120,27 @@ func reportOperations(ctx context.Context, b *bundle.Bundle, svc *metadataservic } for resourceKey, entry := range plan.Plan { - var actionType metadataservice.OperationActionType + var actionType tempdms.OperationActionType if knownResources[resourceKey] { - // Resource is already tracked; use the plan's action type. actionType = planActionToOperationAction(entry.Action) } else { - // First time tracking this resource in the service. - actionType = metadataservice.OperationActionTypeInitRegister + actionType = tempdms.OperationActionTypeInitRegister } - if actionType == metadataservice.OperationActionTypeUnspecified { + if actionType == tempdms.OperationActionTypeUnspecified { continue } - _, err := svc.CreateOperation(ctx, deploymentID, versionID, resourceKey, &metadataservice.Operation{ - ResourceKey: resourceKey, - Status: metadataservice.OperationStatusSucceeded, - ActionType: actionType, + _, err := svc.CreateOperation(ctx, tempdms.CreateOperationRequest{ + DeploymentID: deploymentID, + VersionID: versionID, + Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + ResourceKey: resourceKey, + Operation: &tempdms.Operation{ + ResourceKey: resourceKey, + Status: tempdms.OperationStatusSucceeded, + ActionType: actionType, + }, }) if err != nil { log.Warnf(ctx, "Failed to report operation for resource %s: %v", resourceKey, err) @@ -250,18 +148,18 @@ func reportOperations(ctx context.Context, b *bundle.Bundle, svc *metadataservic } } -func planActionToOperationAction(action deployplan.ActionType) metadataservice.OperationActionType { +func planActionToOperationAction(action deployplan.ActionType) tempdms.OperationActionType { switch action { case deployplan.Create: - return metadataservice.OperationActionTypeCreate + return tempdms.OperationActionTypeCreate case deployplan.Update: - return metadataservice.OperationActionTypeUpdate + return tempdms.OperationActionTypeUpdate case deployplan.Delete: - return metadataservice.OperationActionTypeDelete + return tempdms.OperationActionTypeDelete case deployplan.Recreate: - return metadataservice.OperationActionTypeRecreate + return tempdms.OperationActionTypeRecreate default: - return metadataservice.OperationActionTypeUnspecified + return tempdms.OperationActionTypeUnspecified } } diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 374b533ca7..81e52a3445 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -3,20 +3,22 @@ package phases import ( "context" "errors" + "fmt" "net/http" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/mutator" - "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" + "github.com/databricks/cli/libs/tempdms" "github.com/databricks/databricks-sdk-go/apierr" ) @@ -95,11 +97,10 @@ func approvalForDestroy(ctx context.Context, b *bundle.Bundle, plan *deployplan. return approved, nil } -func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, engine engine.EngineType) { - if engine.IsDirect() { +func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType) { + if targetEngine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) } else { - // Core destructive mutators for destroy. These require informed user consent. bundle.ApplyContext(ctx, b, terraform.Apply()) } @@ -115,35 +116,54 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e } // The destroy phase deletes artifacts and resources. -func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { - if v, _ := env.DeploymentService(ctx); v == "true" { - destroyWithMetadataService(ctx, b, engine) - return - } +func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineType) { + useMetadataService, _ := env.DeploymentService(ctx) - log.Info(ctx, "Phase: destroy") + if useMetadataService == "true" { + log.Info(ctx, "Phase: destroy (with metadata service)") + } else { + log.Info(ctx, "Phase: destroy") + } ok, err := assertRootPathExists(ctx, b) if err != nil { logdiag.LogError(ctx, err) return } - if !ok { cmdio.LogString(ctx, "No active deployment found to destroy!") return } - bundle.ApplyContext(ctx, b, lock.Acquire()) - if logdiag.HasError(ctx) { - return - } + // Acquire the deployment lock. + var failed bool - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDestroy)) - }() + if useMetadataService == "true" { + svc, svcErr := tempdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) + if svcErr != nil { + logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", svcErr)) + return + } + + _, _, cleanup, lockErr := deployMetadataLock(ctx, b, svc, tempdms.VersionTypeDestroy) + if lockErr != nil { + logdiag.LogError(ctx, lockErr) + return + } + defer func() { + cleanup(failed || logdiag.HasError(ctx)) + }() + } else { + bundle.ApplyContext(ctx, b, lock.Acquire()) + if logdiag.HasError(ctx) { + return + } + defer func() { + bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDestroy)) + }() + } - if !engine.IsDirect() { + if !targetEngine.IsDirect() { bundle.ApplySeqContext(ctx, b, // We need to resolve artifact variable (how we do it in build phase) // because some of the to-be-destroyed resource might use this variable. @@ -158,27 +178,31 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { } if logdiag.HasError(ctx) { + failed = true return } var plan *deployplan.Plan - if engine.IsDirect() { + if targetEngine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) plan, err = b.DeploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(), nil, localPath) if err != nil { logdiag.LogError(ctx, err) + failed = true return } } else { tf := b.Terraform if tf == nil { logdiag.LogError(ctx, errors.New("terraform not initialized")) + failed = true return } plan, err = terraform.ShowPlanFile(ctx, tf, b.TerraformPlanPath) if err != nil { logdiag.LogError(ctx, err) + failed = true return } } @@ -186,11 +210,15 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { hasApproval, err := approvalForDestroy(ctx, b, plan) if err != nil { logdiag.LogError(ctx, err) + failed = true return } if hasApproval { - destroyCore(ctx, b, plan, engine) + destroyCore(ctx, b, plan, targetEngine) + if logdiag.HasError(ctx) { + failed = true + } } else { cmdio.LogString(ctx, "Destroy cancelled!") } diff --git a/bundle/phases/destroy_metadata.go b/bundle/phases/destroy_metadata.go index d7992e72cc..6cfa47ecc0 100644 --- a/bundle/phases/destroy_metadata.go +++ b/bundle/phases/destroy_metadata.go @@ -1,180 +1,3 @@ +// This file is intentionally left minimal. The destroy flow with metadata service +// support has been unified into destroy.go using the deployMetadataLock helper. package phases - -import ( - "context" - "errors" - "fmt" - "net/http" - "time" - - "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/config/engine" - "github.com/databricks/cli/bundle/config/mutator" - "github.com/databricks/cli/bundle/deploy" - "github.com/databricks/cli/bundle/deploy/files" - metadataservice "github.com/databricks/cli/bundle/deploy/metadata/service" - "github.com/databricks/cli/bundle/deploy/terraform" - "github.com/databricks/cli/bundle/deployplan" - "github.com/databricks/cli/bundle/direct" - "github.com/databricks/cli/internal/build" - "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/log" - "github.com/databricks/cli/libs/logdiag" - "github.com/databricks/databricks-sdk-go/apierr" - "github.com/google/uuid" -) - -func destroyWithMetadataService(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineType) { - log.Info(ctx, "Phase: destroy (with metadata service)") - - ok, err := assertRootPathExists(ctx, b) - if err != nil { - logdiag.LogError(ctx, err) - return - } - if !ok { - cmdio.LogString(ctx, "No active deployment found to destroy!") - return - } - - // Create the metadata service client. - svc, err := metadataservice.NewClient(b.WorkspaceClient()) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) - return - } - - // Load local deployment state to get the deployment ID and sequence number. - state, err := deploy.LoadState(ctx, b) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to load deployment state: %w", err)) - return - } - - if state.ID == uuid.Nil { - cmdio.LogString(ctx, "No active deployment found to destroy!") - return - } - deploymentID := state.ID.String() - - // Check that the deployment exists. - _, err = svc.GetDeployment(ctx, deploymentID) - if err != nil { - var apiErr *apierr.APIError - if errors.As(err, &apiErr) && apiErr.StatusCode == http.StatusNotFound { - log.Infof(ctx, "No deployment found in metadata service for %s, nothing to destroy", deploymentID) - cmdio.LogString(ctx, "No active deployment found to destroy!") - return - } - logdiag.LogError(ctx, fmt.Errorf("failed to get deployment: %w", err)) - return - } - - // Create a version to acquire the deployment lock. - versionID := fmt.Sprintf("%d", state.Seq+1) - _, err = svc.CreateVersion(ctx, deploymentID, versionID, &metadataservice.Version{ - CliVersion: build.GetInfo().Version, - VersionType: metadataservice.VersionTypeDestroy, - TargetName: b.Config.Bundle.Target, - }) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to acquire deployment lock: %w", err)) - return - } - - log.Infof(ctx, "Acquired deployment lock for destroy: deployment=%s version=%s", deploymentID, versionID) - - // Start heartbeat to keep the lock alive. - stopHeartbeat := metadataservice.StartHeartbeat(ctx, svc, deploymentID, versionID, metadataservice.DefaultHeartbeatInterval) - - var destroyFailed bool - defer func() { - stopHeartbeat() - - reason := metadataservice.VersionCompleteSuccess - if destroyFailed || logdiag.HasError(ctx) { - reason = metadataservice.VersionCompleteFailure - } - - // Use a separate context for cleanup so the lock is released even if the - // parent context was cancelled (e.g. user hit Ctrl+C). - cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - _, completeErr := svc.CompleteVersion(cleanupCtx, deploymentID, versionID, reason, false) - if completeErr != nil { - log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) - } else { - log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%d", deploymentID, versionID, reason) - } - }() - - // Calculate the destroy plan. - if !targetEngine.IsDirect() { - bundle.ApplySeqContext(ctx, b, - mutator.ResolveVariableReferencesWithoutResources("artifacts"), - mutator.ResolveVariableReferencesOnlyResources("artifacts"), - terraform.Interpolate(), - terraform.Write(), - terraform.Plan(terraform.PlanGoal("destroy")), - ) - } - - if logdiag.HasError(ctx) { - destroyFailed = true - return - } - - var plan *deployplan.Plan - if targetEngine.IsDirect() { - _, localPath := b.StateFilenameDirect(ctx) - plan, err = b.DeploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(), nil, localPath) - if err != nil { - logdiag.LogError(ctx, err) - destroyFailed = true - return - } - } else { - tf := b.Terraform - if tf == nil { - logdiag.LogError(ctx, fmt.Errorf("terraform not initialized")) - destroyFailed = true - return - } - - plan, err = terraform.ShowPlanFile(ctx, tf, b.TerraformPlanPath) - if err != nil { - logdiag.LogError(ctx, err) - destroyFailed = true - return - } - } - - hasApproval, err := approvalForDestroy(ctx, b, plan) - if err != nil { - logdiag.LogError(ctx, err) - destroyFailed = true - return - } - - if hasApproval { - if targetEngine.IsDirect() { - b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) - } else { - bundle.ApplyContext(ctx, b, terraform.Apply()) - } - - if logdiag.HasError(ctx) { - destroyFailed = true - return - } - - bundle.ApplyContext(ctx, b, files.Delete()) - - if !logdiag.HasError(ctx) { - cmdio.LogString(ctx, "Destroy complete!") - } - } else { - cmdio.LogString(ctx, "Destroy cancelled!") - } -} diff --git a/bundle/deploy/metadata/service/heartbeat.go b/bundle/phases/heartbeat.go similarity index 56% rename from bundle/deploy/metadata/service/heartbeat.go rename to bundle/phases/heartbeat.go index d32e0a24f0..1f9b3d41d1 100644 --- a/bundle/deploy/metadata/service/heartbeat.go +++ b/bundle/phases/heartbeat.go @@ -1,17 +1,18 @@ -package service +package phases import ( "context" "time" "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/tempdms" ) -const DefaultHeartbeatInterval = 2 * time.Minute +const defaultHeartbeatInterval = 2 * time.Minute -// StartHeartbeat starts a background goroutine that sends heartbeats to keep +// startHeartbeat starts a background goroutine that sends heartbeats to keep // the deployment lock alive. Returns a cancel function to stop the heartbeat. -func StartHeartbeat(ctx context.Context, client *Client, deploymentID, versionID string, interval time.Duration) context.CancelFunc { +func startHeartbeat(ctx context.Context, svc *tempdms.DeploymentMetadataAPI, deploymentID, versionID string, interval time.Duration) context.CancelFunc { ctx, cancel := context.WithCancel(ctx) go func() { @@ -23,7 +24,10 @@ func StartHeartbeat(ctx context.Context, client *Client, deploymentID, versionID case <-ctx.Done(): return case <-ticker.C: - _, err := client.Heartbeat(ctx, deploymentID, versionID) + _, err := svc.Heartbeat(ctx, tempdms.HeartbeatRequest{ + DeploymentID: deploymentID, + VersionID: versionID, + }) if err != nil { log.Warnf(ctx, "Failed to send deployment heartbeat: %v", err) } else { diff --git a/libs/tempdms/api.go b/libs/tempdms/api.go new file mode 100644 index 0000000000..305633819e --- /dev/null +++ b/libs/tempdms/api.go @@ -0,0 +1,164 @@ +package tempdms + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/databricks/databricks-sdk-go" + "github.com/databricks/databricks-sdk-go/apierr" + "github.com/databricks/databricks-sdk-go/client" +) + +const basePath = "/api/2.0/bundle" + +// DeploymentMetadataAPI is a client for the Deployment Metadata Service. +// +// This is a temporary implementation that will be replaced by the SDK-generated +// client once the proto definitions land in the Go SDK. The method signatures +// and types are designed to match what the SDK will generate, so migration +// should be a straightforward import path change. +type DeploymentMetadataAPI struct { + api *client.DatabricksClient +} + +func NewDeploymentMetadataAPI(w *databricks.WorkspaceClient) (*DeploymentMetadataAPI, error) { + apiClient, err := client.New(w.Config) + if err != nil { + return nil, fmt.Errorf("failed to create deployment metadata API client: %w", err) + } + return &DeploymentMetadataAPI{api: apiClient}, nil +} + +func (a *DeploymentMetadataAPI) CreateDeployment(ctx context.Context, request CreateDeploymentRequest) (*Deployment, error) { + var resp Deployment + path := fmt.Sprintf("%s/deployments", basePath) + err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + if err != nil { + return nil, mapError("create deployment", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) GetDeployment(ctx context.Context, request GetDeploymentRequest) (*Deployment, error) { + var resp Deployment + path := fmt.Sprintf("%s/deployments/%s", basePath, request.DeploymentID) + err := a.api.Do(ctx, http.MethodGet, path, nil, nil, nil, &resp) + if err != nil { + return nil, mapError("get deployment", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) DeleteDeployment(ctx context.Context, request DeleteDeploymentRequest) (*Deployment, error) { + var resp Deployment + path := fmt.Sprintf("%s/deployments/%s", basePath, request.DeploymentID) + err := a.api.Do(ctx, http.MethodDelete, path, nil, nil, nil, &resp) + if err != nil { + return nil, mapError("delete deployment", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) CreateVersion(ctx context.Context, request CreateVersionRequest) (*Version, error) { + var resp Version + path := fmt.Sprintf("%s/deployments/%s/versions", basePath, request.DeploymentID) + err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + if err != nil { + return nil, mapError("create version", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) GetVersion(ctx context.Context, request GetVersionRequest) (*Version, error) { + var resp Version + path := fmt.Sprintf("%s/deployments/%s/versions/%s", basePath, request.DeploymentID, request.VersionID) + err := a.api.Do(ctx, http.MethodGet, path, nil, nil, nil, &resp) + if err != nil { + return nil, mapError("get version", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) Heartbeat(ctx context.Context, request HeartbeatRequest) (*HeartbeatResponse, error) { + var resp HeartbeatResponse + path := fmt.Sprintf("%s/deployments/%s/versions/%s/heartbeat", basePath, request.DeploymentID, request.VersionID) + err := a.api.Do(ctx, http.MethodPost, path, nil, nil, struct{}{}, &resp) + if err != nil { + return nil, mapError("heartbeat", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) CompleteVersion(ctx context.Context, request CompleteVersionRequest) (*Version, error) { + var resp Version + path := fmt.Sprintf("%s/deployments/%s/versions/%s/complete", basePath, request.DeploymentID, request.VersionID) + err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + if err != nil { + return nil, mapError("complete version", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) CreateOperation(ctx context.Context, request CreateOperationRequest) (*Operation, error) { + var resp Operation + path := fmt.Sprintf("%s/deployments/%s/versions/%s/operations", basePath, request.DeploymentID, request.VersionID) + err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + if err != nil { + return nil, mapError("create operation", err) + } + return &resp, nil +} + +func (a *DeploymentMetadataAPI) ListResources(ctx context.Context, request ListResourcesRequest) ([]Resource, error) { + var allResources []Resource + pageToken := "" + + for { + var resp ListResourcesResponse + path := fmt.Sprintf("%s/deployments/%s/resources", basePath, request.DeploymentID) + + q := map[string]any{ + "parent": fmt.Sprintf("deployments/%s", request.DeploymentID), + "page_size": 1000, + } + if pageToken != "" { + q["page_token"] = pageToken + } + + err := a.api.Do(ctx, http.MethodGet, path, nil, q, nil, &resp) + if err != nil { + return nil, mapError("list resources", err) + } + + allResources = append(allResources, resp.Resources...) + if resp.NextPageToken == "" { + break + } + pageToken = resp.NextPageToken + } + + return allResources, nil +} + +// mapError translates API errors into user-friendly messages. +func mapError(operation string, err error) error { + var apiErr *apierr.APIError + if !errors.As(err, &apiErr) { + return fmt.Errorf("%s: %w", operation, err) + } + + switch apiErr.StatusCode { + case http.StatusConflict: + return fmt.Errorf("%s: deployment is locked by another active deployment. "+ + "If the prior deployment failed, the lock will expire automatically after 5 minutes. "+ + "You can also force-acquire the lock by running deploy with the --force-lock flag: %w", operation, err) + case http.StatusNotFound: + return fmt.Errorf("%s: resource not found: %w", operation, err) + case http.StatusBadRequest: + return fmt.Errorf("%s: bad request: %w", operation, err) + default: + return fmt.Errorf("%s: %w", operation, err) + } +} diff --git a/bundle/deploy/metadata/service/types.go b/libs/tempdms/types.go similarity index 58% rename from bundle/deploy/metadata/service/types.go rename to libs/tempdms/types.go index 05e0cf03b1..a5d8c0df92 100644 --- a/bundle/deploy/metadata/service/types.go +++ b/libs/tempdms/types.go @@ -1,4 +1,8 @@ -package service +// Package tempdms is a temporary client library for the Deployment Metadata Service. +// It mirrors the structure that the Databricks Go SDK will eventually generate from +// the service's proto definitions. When the protos land in the SDK, migration should +// be a straightforward import path change. +package tempdms import "time" @@ -60,47 +64,47 @@ const ( ) const ( - ResourceTypeUnspecified DeploymentResourceType = 0 - ResourceTypeJob DeploymentResourceType = 1 - ResourceTypePipeline DeploymentResourceType = 2 - ResourceTypeModel DeploymentResourceType = 4 - ResourceTypeRegisteredModel DeploymentResourceType = 5 - ResourceTypeExperiment DeploymentResourceType = 6 - ResourceTypeServingEndpoint DeploymentResourceType = 7 - ResourceTypeQualityMonitor DeploymentResourceType = 8 - ResourceTypeSchema DeploymentResourceType = 9 - ResourceTypeVolume DeploymentResourceType = 10 - ResourceTypeCluster DeploymentResourceType = 11 - ResourceTypeDashboard DeploymentResourceType = 12 - ResourceTypeApp DeploymentResourceType = 13 - ResourceTypeCatalog DeploymentResourceType = 14 - ResourceTypeExternalLocation DeploymentResourceType = 15 - ResourceTypeSecretScope DeploymentResourceType = 16 - ResourceTypeAlert DeploymentResourceType = 17 - ResourceTypeSQLWarehouse DeploymentResourceType = 18 - ResourceTypeDatabaseInstance DeploymentResourceType = 19 - ResourceTypeDatabaseCatalog DeploymentResourceType = 20 - ResourceTypeSyncedDBTable DeploymentResourceType = 21 - ResourceTypePostgresProject DeploymentResourceType = 22 - ResourceTypePostgresBranch DeploymentResourceType = 23 - ResourceTypePostgresEndpoint DeploymentResourceType = 24 + ResourceTypeUnspecified DeploymentResourceType = 0 + ResourceTypeJob DeploymentResourceType = 1 + ResourceTypePipeline DeploymentResourceType = 2 + ResourceTypeModel DeploymentResourceType = 4 + ResourceTypeRegisteredModel DeploymentResourceType = 5 + ResourceTypeExperiment DeploymentResourceType = 6 + ResourceTypeServingEndpoint DeploymentResourceType = 7 + ResourceTypeQualityMonitor DeploymentResourceType = 8 + ResourceTypeSchema DeploymentResourceType = 9 + ResourceTypeVolume DeploymentResourceType = 10 + ResourceTypeCluster DeploymentResourceType = 11 + ResourceTypeDashboard DeploymentResourceType = 12 + ResourceTypeApp DeploymentResourceType = 13 + ResourceTypeCatalog DeploymentResourceType = 14 + ResourceTypeExternalLocation DeploymentResourceType = 15 + ResourceTypeSecretScope DeploymentResourceType = 16 + ResourceTypeAlert DeploymentResourceType = 17 + ResourceTypeSQLWarehouse DeploymentResourceType = 18 + ResourceTypeDatabaseInstance DeploymentResourceType = 19 + ResourceTypeDatabaseCatalog DeploymentResourceType = 20 + ResourceTypeSyncedDBTable DeploymentResourceType = 21 + ResourceTypePostgresProject DeploymentResourceType = 22 + ResourceTypePostgresBranch DeploymentResourceType = 23 + ResourceTypePostgresEndpoint DeploymentResourceType = 24 ) -// Deployment represents a bundle deployment registered with the control plane. +// Resource types (proto message equivalents). + type Deployment struct { - Name string `json:"name,omitempty"` - DisplayName string `json:"display_name,omitempty"` - TargetName string `json:"target_name,omitempty"` - Status DeploymentStatus `json:"status,omitempty"` - LastVersionID string `json:"last_version_id,omitempty"` - CreatedBy string `json:"created_by,omitempty"` - CreateTime *time.Time `json:"create_time,omitempty"` - UpdateTime *time.Time `json:"update_time,omitempty"` - DestroyTime *time.Time `json:"destroy_time,omitempty"` - DestroyedBy string `json:"destroyed_by,omitempty"` -} - -// Version represents a single invocation of deploy/destroy against a deployment. + Name string `json:"name,omitempty"` + DisplayName string `json:"display_name,omitempty"` + TargetName string `json:"target_name,omitempty"` + Status DeploymentStatus `json:"status,omitempty"` + LastVersionID string `json:"last_version_id,omitempty"` + CreatedBy string `json:"created_by,omitempty"` + CreateTime *time.Time `json:"create_time,omitempty"` + UpdateTime *time.Time `json:"update_time,omitempty"` + DestroyTime *time.Time `json:"destroy_time,omitempty"` + DestroyedBy string `json:"destroyed_by,omitempty"` +} + type Version struct { Name string `json:"name,omitempty"` VersionID string `json:"version_id,omitempty"` @@ -116,7 +120,6 @@ type Version struct { TargetName string `json:"target_name,omitempty"` } -// Operation records the result of applying a resource change. type Operation struct { Name string `json:"name,omitempty"` ResourceKey string `json:"resource_key,omitempty"` @@ -128,7 +131,6 @@ type Operation struct { ErrorMessage string `json:"error_message,omitempty"` } -// Resource represents a resource managed by a deployment. type Resource struct { Name string `json:"name,omitempty"` ResourceKey string `json:"resource_key,omitempty"` @@ -139,43 +141,75 @@ type Resource struct { ResourceType DeploymentResourceType `json:"resource_type,omitempty"` } -// Request/Response types. +// Request types. type CreateDeploymentRequest struct { DeploymentID string `json:"deployment_id"` Deployment *Deployment `json:"deployment"` } -type ListDeploymentsResponse struct { - Deployments []Deployment `json:"deployments"` - NextPageToken string `json:"next_page_token,omitempty"` +type GetDeploymentRequest struct { + DeploymentID string `json:"-"` +} + +type DeleteDeploymentRequest struct { + DeploymentID string `json:"-"` } type CreateVersionRequest struct { - Parent string `json:"parent"` - Version *Version `json:"version"` - VersionID string `json:"version_id"` + DeploymentID string `json:"-"` + Parent string `json:"parent"` + Version *Version `json:"version"` + VersionID string `json:"version_id"` } -type ListVersionsResponse struct { - Versions []Version `json:"versions"` - NextPageToken string `json:"next_page_token,omitempty"` +type GetVersionRequest struct { + DeploymentID string `json:"-"` + VersionID string `json:"-"` } -type HeartbeatResponse struct { - ExpireTime *time.Time `json:"expire_time,omitempty"` +type HeartbeatRequest struct { + DeploymentID string `json:"-"` + VersionID string `json:"-"` } type CompleteVersionRequest struct { + DeploymentID string `json:"-"` + VersionID string `json:"-"` Name string `json:"name"` CompletionReason VersionComplete `json:"completion_reason"` Force bool `json:"force,omitempty"` } type CreateOperationRequest struct { - Parent string `json:"parent"` - ResourceKey string `json:"resource_key"` - Operation *Operation `json:"operation"` + DeploymentID string `json:"-"` + VersionID string `json:"-"` + Parent string `json:"parent"` + ResourceKey string `json:"resource_key"` + Operation *Operation `json:"operation"` +} + +type ListResourcesRequest struct { + DeploymentID string `json:"-"` + Parent string `json:"parent"` + PageSize int `json:"page_size,omitempty"` + PageToken string `json:"page_token,omitempty"` +} + +// Response types. + +type HeartbeatResponse struct { + ExpireTime *time.Time `json:"expire_time,omitempty"` +} + +type ListDeploymentsResponse struct { + Deployments []Deployment `json:"deployments"` + NextPageToken string `json:"next_page_token,omitempty"` +} + +type ListVersionsResponse struct { + Versions []Version `json:"versions"` + NextPageToken string `json:"next_page_token,omitempty"` } type ListOperationsResponse struct { diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go new file mode 100644 index 0000000000..1b0b3e9f90 --- /dev/null +++ b/libs/testserver/deployment_metadata.go @@ -0,0 +1,400 @@ +package testserver + +import ( + "encoding/json" + "fmt" + "net/http" + "strconv" + "strings" + "time" + + "github.com/databricks/cli/libs/tempdms" +) + +// deploymentMetadataState holds in-memory state for the deployment metadata service. +// Stored per-workspace inside FakeWorkspace. +type deploymentMetadataState struct { + // deployments keyed by deployment_id + deployments map[string]tempdms.Deployment + + // versions keyed by "deploymentId/versionId" + versions map[string]tempdms.Version + + // operations keyed by "deploymentId/versionId/resourceKey" + operations map[string]tempdms.Operation + + // resources keyed by "deploymentId/resourceKey" + resources map[string]tempdms.Resource + + // lock state per deployment: which version holds the lock and when it expires + lockHolder map[string]string // deploymentId -> "deployments/{id}/versions/{vid}" + lockExpiry map[string]time.Time // deploymentId -> expiry time +} + +func newDeploymentMetadataState() *deploymentMetadataState { + return &deploymentMetadataState{ + deployments: map[string]tempdms.Deployment{}, + versions: map[string]tempdms.Version{}, + operations: map[string]tempdms.Operation{}, + resources: map[string]tempdms.Resource{}, + lockHolder: map[string]string{}, + lockExpiry: map[string]time.Time{}, + } +} + +const lockDuration = 5 * time.Minute + +func (s *FakeWorkspace) DeploymentMetadataCreateDeployment(req Request) Response { + defer s.LockUnlock()() + + var createReq tempdms.CreateDeploymentRequest + if err := json.Unmarshal(req.Body, &createReq); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } + } + + deploymentID := createReq.DeploymentID + if deploymentID == "" { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "deployment_id is required"}, + } + } + + state := s.deploymentMetadata + if _, exists := state.deployments[deploymentID]; exists { + return Response{ + StatusCode: http.StatusConflict, + Body: map[string]string{"error_code": "ALREADY_EXISTS", "message": fmt.Sprintf("deployment %s already exists", deploymentID)}, + } + } + + now := time.Now().UTC() + deployment := tempdms.Deployment{ + Name: fmt.Sprintf("deployments/%s", deploymentID), + DisplayName: deploymentID, + Status: tempdms.DeploymentStatusActive, + CreatedBy: s.CurrentUser().UserName, + CreateTime: &now, + UpdateTime: &now, + } + if createReq.Deployment != nil { + if createReq.Deployment.TargetName != "" { + deployment.TargetName = createReq.Deployment.TargetName + } + } + + state.deployments[deploymentID] = deployment + return Response{Body: deployment} +} + +func (s *FakeWorkspace) DeploymentMetadataGetDeployment(deploymentID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + deployment, ok := state.deployments[deploymentID] + if !ok { + return Response{ + StatusCode: http.StatusNotFound, + Body: map[string]string{"error_code": "NOT_FOUND", "message": fmt.Sprintf("deployment %s not found", deploymentID)}, + } + } + return Response{Body: deployment} +} + +func (s *FakeWorkspace) DeploymentMetadataDeleteDeployment(deploymentID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + deployment, ok := state.deployments[deploymentID] + if !ok { + return Response{ + StatusCode: http.StatusNotFound, + Body: map[string]string{"error_code": "NOT_FOUND", "message": fmt.Sprintf("deployment %s not found", deploymentID)}, + } + } + + now := time.Now().UTC() + deployment.Status = tempdms.DeploymentStatusDeleted + deployment.DestroyTime = &now + deployment.DestroyedBy = s.CurrentUser().UserName + deployment.UpdateTime = &now + state.deployments[deploymentID] = deployment + + return Response{Body: deployment} +} + +func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + deployment, ok := state.deployments[deploymentID] + if !ok { + return Response{ + StatusCode: http.StatusNotFound, + Body: map[string]string{"error_code": "NOT_FOUND", "message": fmt.Sprintf("deployment %s not found", deploymentID)}, + } + } + + var createReq tempdms.CreateVersionRequest + if err := json.Unmarshal(req.Body, &createReq); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } + } + + versionID := createReq.VersionID + if versionID == "" { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "version_id is required"}, + } + } + + // Validate version_id == last_version_id + 1 (matching server behavior). + var expectedVersionID string + if deployment.LastVersionID == "" { + expectedVersionID = "1" + } else { + lastVersion, err := strconv.ParseInt(deployment.LastVersionID, 10, 64) + if err != nil { + return Response{ + StatusCode: http.StatusInternalServerError, + Body: map[string]string{"error_code": "INTERNAL_ERROR", "message": fmt.Sprintf("stored last_version_id is not a valid number: %s", deployment.LastVersionID)}, + } + } + expectedVersionID = strconv.FormatInt(lastVersion+1, 10) + } + if versionID != expectedVersionID { + return Response{ + StatusCode: http.StatusConflict, + Body: map[string]string{ + "error_code": "ABORTED", + "message": fmt.Sprintf("version_id must be %s (last_version_id + 1), got: %s", expectedVersionID, versionID), + }, + } + } + + // Check lock: if a lock is held and not expired, reject with 409. + now := time.Now().UTC() + if holder, hasLock := state.lockHolder[deploymentID]; hasLock { + if expiry, ok := state.lockExpiry[deploymentID]; ok && expiry.After(now) { + return Response{ + StatusCode: http.StatusConflict, + Body: map[string]string{ + "error_code": "ABORTED", + "message": fmt.Sprintf("deployment is locked by %s until %s", holder, expiry.Format(time.RFC3339)), + }, + } + } + } + + versionKey := deploymentID + "/" + versionID + version := tempdms.Version{ + Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + VersionID: versionID, + CreatedBy: s.CurrentUser().UserName, + CreateTime: &now, + Status: tempdms.VersionStatusInProgress, + } + if createReq.Version != nil { + version.CliVersion = createReq.Version.CliVersion + version.VersionType = createReq.Version.VersionType + } + + state.versions[versionKey] = version + + // Acquire the lock. + lockExpiry := now.Add(lockDuration) + state.lockHolder[deploymentID] = version.Name + state.lockExpiry[deploymentID] = lockExpiry + + // Update the deployment's last_version_id and status. + deployment.LastVersionID = versionID + deployment.Status = tempdms.DeploymentStatusInProgress + deployment.UpdateTime = &now + state.deployments[deploymentID] = deployment + + return Response{Body: version} +} + +func (s *FakeWorkspace) DeploymentMetadataGetVersion(deploymentID, versionID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + versionKey := deploymentID + "/" + versionID + version, ok := state.versions[versionKey] + if !ok { + return Response{ + StatusCode: http.StatusNotFound, + Body: map[string]string{"error_code": "NOT_FOUND", "message": fmt.Sprintf("version %s not found", versionKey)}, + } + } + return Response{Body: version} +} + +func (s *FakeWorkspace) DeploymentMetadataHeartbeat(req Request, deploymentID, versionID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + versionKey := deploymentID + "/" + versionID + version, ok := state.versions[versionKey] + if !ok { + return Response{ + StatusCode: http.StatusNotFound, + Body: map[string]string{"error_code": "NOT_FOUND", "message": fmt.Sprintf("version %s not found", versionKey)}, + } + } + + if version.Status != tempdms.VersionStatusInProgress { + return Response{ + StatusCode: http.StatusConflict, + Body: map[string]string{"error_code": "ABORTED", "message": "version is no longer in progress"}, + } + } + + // Verify this version holds the lock. + expectedHolder := fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID) + if state.lockHolder[deploymentID] != expectedHolder { + return Response{ + StatusCode: http.StatusConflict, + Body: map[string]string{"error_code": "ABORTED", "message": "lock is not held by this version"}, + } + } + + // Renew the lock. + now := time.Now().UTC() + newExpiry := now.Add(lockDuration) + state.lockExpiry[deploymentID] = newExpiry + + return Response{Body: tempdms.HeartbeatResponse{ExpireTime: &newExpiry}} +} + +func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymentID, versionID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + versionKey := deploymentID + "/" + versionID + version, ok := state.versions[versionKey] + if !ok { + return Response{ + StatusCode: http.StatusNotFound, + Body: map[string]string{"error_code": "NOT_FOUND", "message": fmt.Sprintf("version %s not found", versionKey)}, + } + } + + if version.Status != tempdms.VersionStatusInProgress { + return Response{ + StatusCode: http.StatusConflict, + Body: map[string]string{"error_code": "ABORTED", "message": "version is already completed"}, + } + } + + var completeReq tempdms.CompleteVersionRequest + if err := json.Unmarshal(req.Body, &completeReq); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } + } + + now := time.Now().UTC() + version.Status = tempdms.VersionStatusCompleted + version.CompleteTime = &now + version.CompletionReason = completeReq.CompletionReason + version.CompletedBy = s.CurrentUser().UserName + state.versions[versionKey] = version + + // Release the lock. + delete(state.lockHolder, deploymentID) + delete(state.lockExpiry, deploymentID) + + // Update deployment status based on completion reason. + if deployment, ok := state.deployments[deploymentID]; ok { + switch completeReq.CompletionReason { + case tempdms.VersionCompleteSuccess: + deployment.Status = tempdms.DeploymentStatusActive + case tempdms.VersionCompleteFailure, tempdms.VersionCompleteForceAbort, tempdms.VersionCompleteLeaseExpire: + deployment.Status = tempdms.DeploymentStatusFailed + } + deployment.UpdateTime = &now + state.deployments[deploymentID] = deployment + } + + return Response{Body: version} +} + +func (s *FakeWorkspace) DeploymentMetadataCreateOperation(req Request, deploymentID, versionID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + + var createReq tempdms.CreateOperationRequest + if err := json.Unmarshal(req.Body, &createReq); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } + } + + resourceKey := createReq.ResourceKey + if resourceKey == "" { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "resource_key is required"}, + } + } + + now := time.Now().UTC() + opKey := deploymentID + "/" + versionID + "/" + resourceKey + operation := tempdms.Operation{ + Name: fmt.Sprintf("deployments/%s/versions/%s/operations/%s", deploymentID, versionID, resourceKey), + ResourceKey: resourceKey, + CreateTime: &now, + } + if createReq.Operation != nil { + operation.ActionType = createReq.Operation.ActionType + operation.State = createReq.Operation.State + operation.ResourceID = createReq.Operation.ResourceID + operation.Status = createReq.Operation.Status + operation.ErrorMessage = createReq.Operation.ErrorMessage + } + + state.operations[opKey] = operation + + // Upsert the deployment-level resource. + resKey := deploymentID + "/" + resourceKey + resource := tempdms.Resource{ + Name: fmt.Sprintf("deployments/%s/resources/%s", deploymentID, resourceKey), + ResourceKey: resourceKey, + } + if createReq.Operation != nil { + resource.State = createReq.Operation.State + resource.ResourceID = createReq.Operation.ResourceID + resource.LastActionType = createReq.Operation.ActionType + resource.LastVersionID = versionID + } + state.resources[resKey] = resource + + return Response{Body: operation} +} + +func (s *FakeWorkspace) DeploymentMetadataListResources(deploymentID string) Response { + defer s.LockUnlock()() + + state := s.deploymentMetadata + prefix := deploymentID + "/" + var resources []tempdms.Resource + for key, resource := range state.resources { + if strings.HasPrefix(key, prefix) { + resources = append(resources, resource) + } + } + if resources == nil { + resources = []tempdms.Resource{} + } + return Response{Body: tempdms.ListResourcesResponse{Resources: resources}} +} diff --git a/libs/testserver/fake_workspace.go b/libs/testserver/fake_workspace.go index b13aae069a..a2462c4c6d 100644 --- a/libs/testserver/fake_workspace.go +++ b/libs/testserver/fake_workspace.go @@ -173,6 +173,8 @@ type FakeWorkspace struct { // clusterVenvs caches Python venvs per existing cluster ID, // matching cloud behavior where libraries are cached on running clusters. clusterVenvs map[string]*clusterEnv + + deploymentMetadata *deploymentMetadataState } func (s *FakeWorkspace) LockUnlock() func() { @@ -297,6 +299,7 @@ func NewFakeWorkspace(url, token string) *FakeWorkspace { PostgresEndpoints: map[string]postgres.Endpoint{}, PostgresOperations: map[string]postgres.Operation{}, clusterVenvs: map[string]*clusterEnv{}, + deploymentMetadata: newDeploymentMetadataState(), Alerts: map[string]sql.AlertV2{}, Experiments: map[string]ml.GetExperimentResponse{}, ModelRegistryModels: map[string]ml.Model{}, diff --git a/libs/testserver/handlers.go b/libs/testserver/handlers.go index 9e30cb5f0c..904284ed51 100644 --- a/libs/testserver/handlers.go +++ b/libs/testserver/handlers.go @@ -905,4 +905,42 @@ func AddDefaultHandlers(server *Server) { }, } }) + + // Deployment Metadata Service: + + server.Handle("POST", "/api/2.0/bundle/deployments", func(req Request) any { + return req.Workspace.DeploymentMetadataCreateDeployment(req) + }) + + server.Handle("GET", "/api/2.0/bundle/deployments/{deployment_id}", func(req Request) any { + return req.Workspace.DeploymentMetadataGetDeployment(req.Vars["deployment_id"]) + }) + + server.Handle("DELETE", "/api/2.0/bundle/deployments/{deployment_id}", func(req Request) any { + return req.Workspace.DeploymentMetadataDeleteDeployment(req.Vars["deployment_id"]) + }) + + server.Handle("POST", "/api/2.0/bundle/deployments/{deployment_id}/versions", func(req Request) any { + return req.Workspace.DeploymentMetadataCreateVersion(req, req.Vars["deployment_id"]) + }) + + server.Handle("GET", "/api/2.0/bundle/deployments/{deployment_id}/versions/{version_id}", func(req Request) any { + return req.Workspace.DeploymentMetadataGetVersion(req.Vars["deployment_id"], req.Vars["version_id"]) + }) + + server.Handle("POST", "/api/2.0/bundle/deployments/{deployment_id}/versions/{version_id}/heartbeat", func(req Request) any { + return req.Workspace.DeploymentMetadataHeartbeat(req, req.Vars["deployment_id"], req.Vars["version_id"]) + }) + + server.Handle("POST", "/api/2.0/bundle/deployments/{deployment_id}/versions/{version_id}/complete", func(req Request) any { + return req.Workspace.DeploymentMetadataCompleteVersion(req, req.Vars["deployment_id"], req.Vars["version_id"]) + }) + + server.Handle("POST", "/api/2.0/bundle/deployments/{deployment_id}/versions/{version_id}/operations", func(req Request) any { + return req.Workspace.DeploymentMetadataCreateOperation(req, req.Vars["deployment_id"], req.Vars["version_id"]) + }) + + server.Handle("GET", "/api/2.0/bundle/deployments/{deployment_id}/resources", func(req Request) any { + return req.Workspace.DeploymentMetadataListResources(req.Vars["deployment_id"]) + }) } From 29f567007c9efaca805e8482545f32e5fee48b67 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 31 Mar 2026 21:03:26 +0000 Subject: [PATCH 04/25] Fix query parameter handling for deployment metadata service API The proto HTTP bindings use `body: "deployment"`, `body: "version"`, and `body: "operation"` for Create endpoints, which means only the sub-message goes in the request body. The identifier fields (deployment_id, version_id, resource_key) must be passed as query parameters. Previously these fields were incorrectly included in the request body, which would cause "required field missing" errors against the real service. Also updates the test server to read these fields from query parameters instead of the body, so acceptance tests validate the real API contract. Co-authored-by: Isaac --- libs/tempdms/api.go | 9 ++- libs/testserver/deployment_metadata.go | 104 +++++++++++++------------ 2 files changed, 59 insertions(+), 54 deletions(-) diff --git a/libs/tempdms/api.go b/libs/tempdms/api.go index 305633819e..005e704923 100644 --- a/libs/tempdms/api.go +++ b/libs/tempdms/api.go @@ -34,7 +34,8 @@ func NewDeploymentMetadataAPI(w *databricks.WorkspaceClient) (*DeploymentMetadat func (a *DeploymentMetadataAPI) CreateDeployment(ctx context.Context, request CreateDeploymentRequest) (*Deployment, error) { var resp Deployment path := fmt.Sprintf("%s/deployments", basePath) - err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + query := map[string]string{"deployment_id": request.DeploymentID} + err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Deployment, &resp) if err != nil { return nil, mapError("create deployment", err) } @@ -64,7 +65,8 @@ func (a *DeploymentMetadataAPI) DeleteDeployment(ctx context.Context, request De func (a *DeploymentMetadataAPI) CreateVersion(ctx context.Context, request CreateVersionRequest) (*Version, error) { var resp Version path := fmt.Sprintf("%s/deployments/%s/versions", basePath, request.DeploymentID) - err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + query := map[string]string{"version_id": request.VersionID} + err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Version, &resp) if err != nil { return nil, mapError("create version", err) } @@ -104,7 +106,8 @@ func (a *DeploymentMetadataAPI) CompleteVersion(ctx context.Context, request Com func (a *DeploymentMetadataAPI) CreateOperation(ctx context.Context, request CreateOperationRequest) (*Operation, error) { var resp Operation path := fmt.Sprintf("%s/deployments/%s/versions/%s/operations", basePath, request.DeploymentID, request.VersionID) - err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) + query := map[string]string{"resource_key": request.ResourceKey} + err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Operation, &resp) if err != nil { return nil, mapError("create operation", err) } diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go index 1b0b3e9f90..48cf7f5caa 100644 --- a/libs/testserver/deployment_metadata.go +++ b/libs/testserver/deployment_metadata.go @@ -47,19 +47,23 @@ const lockDuration = 5 * time.Minute func (s *FakeWorkspace) DeploymentMetadataCreateDeployment(req Request) Response { defer s.LockUnlock()() - var createReq tempdms.CreateDeploymentRequest - if err := json.Unmarshal(req.Body, &createReq); err != nil { + // deployment_id is a query parameter, not in the body. + deploymentID := req.URL.Query().Get("deployment_id") + if deploymentID == "" { return Response{ StatusCode: http.StatusBadRequest, - Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "deployment_id is required"}, } } - deploymentID := createReq.DeploymentID - if deploymentID == "" { - return Response{ - StatusCode: http.StatusBadRequest, - Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "deployment_id is required"}, + // The body maps to the Deployment sub-message. + var bodyDeployment tempdms.Deployment + if len(req.Body) > 0 { + if err := json.Unmarshal(req.Body, &bodyDeployment); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } } } @@ -75,16 +79,12 @@ func (s *FakeWorkspace) DeploymentMetadataCreateDeployment(req Request) Response deployment := tempdms.Deployment{ Name: fmt.Sprintf("deployments/%s", deploymentID), DisplayName: deploymentID, + TargetName: bodyDeployment.TargetName, Status: tempdms.DeploymentStatusActive, CreatedBy: s.CurrentUser().UserName, CreateTime: &now, UpdateTime: &now, } - if createReq.Deployment != nil { - if createReq.Deployment.TargetName != "" { - deployment.TargetName = createReq.Deployment.TargetName - } - } state.deployments[deploymentID] = deployment return Response{Body: deployment} @@ -138,19 +138,23 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI } } - var createReq tempdms.CreateVersionRequest - if err := json.Unmarshal(req.Body, &createReq); err != nil { + // version_id is a query parameter, not in the body. + versionID := req.URL.Query().Get("version_id") + if versionID == "" { return Response{ StatusCode: http.StatusBadRequest, - Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "version_id is required"}, } } - versionID := createReq.VersionID - if versionID == "" { - return Response{ - StatusCode: http.StatusBadRequest, - Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "version_id is required"}, + // The body maps to the Version sub-message. + var bodyVersion tempdms.Version + if len(req.Body) > 0 { + if err := json.Unmarshal(req.Body, &bodyVersion); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } } } @@ -200,10 +204,8 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI CreateTime: &now, Status: tempdms.VersionStatusInProgress, } - if createReq.Version != nil { - version.CliVersion = createReq.Version.CliVersion - version.VersionType = createReq.Version.VersionType - } + version.CliVersion = bodyVersion.CliVersion + version.VersionType = bodyVersion.VersionType state.versions[versionKey] = version @@ -332,35 +334,37 @@ func (s *FakeWorkspace) DeploymentMetadataCreateOperation(req Request, deploymen state := s.deploymentMetadata - var createReq tempdms.CreateOperationRequest - if err := json.Unmarshal(req.Body, &createReq); err != nil { + // resource_key is a query parameter, not in the body. + resourceKey := req.URL.Query().Get("resource_key") + if resourceKey == "" { return Response{ StatusCode: http.StatusBadRequest, - Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "resource_key is required"}, } } - resourceKey := createReq.ResourceKey - if resourceKey == "" { - return Response{ - StatusCode: http.StatusBadRequest, - Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": "resource_key is required"}, + // The body maps to the Operation sub-message. + var bodyOperation tempdms.Operation + if len(req.Body) > 0 { + if err := json.Unmarshal(req.Body, &bodyOperation); err != nil { + return Response{ + StatusCode: http.StatusBadRequest, + Body: map[string]string{"error_code": "INVALID_PARAMETER_VALUE", "message": fmt.Sprintf("invalid request: %s", err)}, + } } } now := time.Now().UTC() opKey := deploymentID + "/" + versionID + "/" + resourceKey operation := tempdms.Operation{ - Name: fmt.Sprintf("deployments/%s/versions/%s/operations/%s", deploymentID, versionID, resourceKey), - ResourceKey: resourceKey, - CreateTime: &now, - } - if createReq.Operation != nil { - operation.ActionType = createReq.Operation.ActionType - operation.State = createReq.Operation.State - operation.ResourceID = createReq.Operation.ResourceID - operation.Status = createReq.Operation.Status - operation.ErrorMessage = createReq.Operation.ErrorMessage + Name: fmt.Sprintf("deployments/%s/versions/%s/operations/%s", deploymentID, versionID, resourceKey), + ResourceKey: resourceKey, + CreateTime: &now, + ActionType: bodyOperation.ActionType, + State: bodyOperation.State, + ResourceID: bodyOperation.ResourceID, + Status: bodyOperation.Status, + ErrorMessage: bodyOperation.ErrorMessage, } state.operations[opKey] = operation @@ -368,14 +372,12 @@ func (s *FakeWorkspace) DeploymentMetadataCreateOperation(req Request, deploymen // Upsert the deployment-level resource. resKey := deploymentID + "/" + resourceKey resource := tempdms.Resource{ - Name: fmt.Sprintf("deployments/%s/resources/%s", deploymentID, resourceKey), - ResourceKey: resourceKey, - } - if createReq.Operation != nil { - resource.State = createReq.Operation.State - resource.ResourceID = createReq.Operation.ResourceID - resource.LastActionType = createReq.Operation.ActionType - resource.LastVersionID = versionID + Name: fmt.Sprintf("deployments/%s/resources/%s", deploymentID, resourceKey), + ResourceKey: resourceKey, + State: bodyOperation.State, + ResourceID: bodyOperation.ResourceID, + LastActionType: bodyOperation.ActionType, + LastVersionID: versionID, } state.resources[resKey] = resource From aa11d7c8fbde2d3a06ce3fc3ca2b009949f10fc7 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 31 Mar 2026 21:15:48 +0000 Subject: [PATCH 05/25] Fix remaining issues: enum naming, redundant param, add acceptance test - Rename VersionCompleteLeaseExpire to VersionCompleteLeaseExpired to match proto enum VERSION_COMPLETE_LEASE_EXPIRED. - Remove redundant "parent" query parameter from ListResources (the deployment ID is already in the URL path). - Add acceptance test for the deployment metadata service integration that validates the correct API call sequence during deploy and destroy. Co-authored-by: Isaac --- .../bundle/deploy/metadata-service/databricks.yml | 7 +++++++ acceptance/bundle/deploy/metadata-service/script | 15 +++++++++++++++ .../bundle/deploy/metadata-service/test.toml | 3 +++ libs/tempdms/api.go | 1 - libs/tempdms/types.go | 2 +- libs/testserver/deployment_metadata.go | 2 +- 6 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 acceptance/bundle/deploy/metadata-service/databricks.yml create mode 100644 acceptance/bundle/deploy/metadata-service/script create mode 100644 acceptance/bundle/deploy/metadata-service/test.toml diff --git a/acceptance/bundle/deploy/metadata-service/databricks.yml b/acceptance/bundle/deploy/metadata-service/databricks.yml new file mode 100644 index 0000000000..c21c8a9392 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/databricks.yml @@ -0,0 +1,7 @@ +bundle: + name: metadata-service-test + +resources: + jobs: + test_job: + name: test-job diff --git a/acceptance/bundle/deploy/metadata-service/script b/acceptance/bundle/deploy/metadata-service/script new file mode 100644 index 0000000000..3f2006ac8e --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/script @@ -0,0 +1,15 @@ +# Deploy with the metadata service enabled. Verify the correct API calls are +# made: CreateDeployment, CreateVersion, resource CRUD, CreateOperation, +# CompleteVersion. +trace $CLI bundle deploy +trace print_requests.py --keep --get //bundle | contains.py "POST" "deployments" "versions" "complete" + +# Verify deployment_id is sent as a query parameter (not in the body). +trace print_requests.py --keep --get //bundle/deployments "^//bundle/deployments/" | contains.py "POST" "deployment_id" + +# Verify version_id is sent as a query parameter. +trace print_requests.py --keep --get //versions "^//versions/" | contains.py "POST" "version_id" + +# Destroy with the metadata service enabled. +trace $CLI bundle destroy --auto-approve +trace print_requests.py --get //bundle | contains.py "POST" "deployments" "versions" "complete" diff --git a/acceptance/bundle/deploy/metadata-service/test.toml b/acceptance/bundle/deploy/metadata-service/test.toml new file mode 100644 index 0000000000..33c8f80dd4 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/test.toml @@ -0,0 +1,3 @@ +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_DEPLOYMENT_SERVICE = ["true"] +RecordRequests = true diff --git a/libs/tempdms/api.go b/libs/tempdms/api.go index 005e704923..005425fa68 100644 --- a/libs/tempdms/api.go +++ b/libs/tempdms/api.go @@ -123,7 +123,6 @@ func (a *DeploymentMetadataAPI) ListResources(ctx context.Context, request ListR path := fmt.Sprintf("%s/deployments/%s/resources", basePath, request.DeploymentID) q := map[string]any{ - "parent": fmt.Sprintf("deployments/%s", request.DeploymentID), "page_size": 1000, } if pageToken != "" { diff --git a/libs/tempdms/types.go b/libs/tempdms/types.go index a5d8c0df92..b36386b438 100644 --- a/libs/tempdms/types.go +++ b/libs/tempdms/types.go @@ -35,7 +35,7 @@ const ( VersionCompleteSuccess VersionComplete = 1 VersionCompleteFailure VersionComplete = 2 VersionCompleteForceAbort VersionComplete = 3 - VersionCompleteLeaseExpire VersionComplete = 4 + VersionCompleteLeaseExpired VersionComplete = 4 ) const ( diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go index 48cf7f5caa..a1968c8638 100644 --- a/libs/testserver/deployment_metadata.go +++ b/libs/testserver/deployment_metadata.go @@ -319,7 +319,7 @@ func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymen switch completeReq.CompletionReason { case tempdms.VersionCompleteSuccess: deployment.Status = tempdms.DeploymentStatusActive - case tempdms.VersionCompleteFailure, tempdms.VersionCompleteForceAbort, tempdms.VersionCompleteLeaseExpire: + case tempdms.VersionCompleteFailure, tempdms.VersionCompleteForceAbort, tempdms.VersionCompleteLeaseExpired: deployment.Status = tempdms.DeploymentStatusFailed } deployment.UpdateTime = &now From c21aee6b6394ed399560c72aea6ab15082155c68 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 31 Mar 2026 21:21:21 +0000 Subject: [PATCH 06/25] Update acceptance test to print all metadata service requests Use print_requests.py to print all requests to /bundle endpoints at each stage (deploy and destroy) for clear visibility into the API call sequence. Co-authored-by: Isaac --- acceptance/bundle/deploy/metadata-service/script | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/acceptance/bundle/deploy/metadata-service/script b/acceptance/bundle/deploy/metadata-service/script index 3f2006ac8e..fa4f54bd57 100644 --- a/acceptance/bundle/deploy/metadata-service/script +++ b/acceptance/bundle/deploy/metadata-service/script @@ -1,15 +1,11 @@ -# Deploy with the metadata service enabled. Verify the correct API calls are -# made: CreateDeployment, CreateVersion, resource CRUD, CreateOperation, -# CompleteVersion. +# Deploy with the metadata service enabled. trace $CLI bundle deploy -trace print_requests.py --keep --get //bundle | contains.py "POST" "deployments" "versions" "complete" -# Verify deployment_id is sent as a query parameter (not in the body). -trace print_requests.py --keep --get //bundle/deployments "^//bundle/deployments/" | contains.py "POST" "deployment_id" - -# Verify version_id is sent as a query parameter. -trace print_requests.py --keep --get //versions "^//versions/" | contains.py "POST" "version_id" +# Print all metadata service requests made during deploy. +trace print_requests.py --get //bundle # Destroy with the metadata service enabled. trace $CLI bundle destroy --auto-approve -trace print_requests.py --get //bundle | contains.py "POST" "deployments" "versions" "complete" + +# Print all metadata service requests made during destroy. +trace print_requests.py --get //bundle From 1621bc26ff492a51b267a56f4ac4d1b8cd2c949f Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 31 Mar 2026 21:44:40 +0000 Subject: [PATCH 07/25] Fix error masking and input validation from self-review - Rename libs/tempdms package to libs/tmpdms - Rename env var to DATABRICKS_BUNDLE_MANAGED_STATE - Use lineage from resources.json as deployment ID - Write _deployment_id file to state directory - Remove postApplyHook, add inline OperationReporter - Set heartbeat interval to 30 seconds Co-authored-by: Isaac --- .../bundle/deploy/metadata-service/test.toml | 2 +- bundle/direct/bundle_apply.go | 19 +++ bundle/direct/pkg.go | 9 ++ bundle/env/deployment_metadata.go | 14 +- bundle/phases/deploy.go | 37 ++--- bundle/phases/deploy_metadata.go | 143 +++++++++--------- bundle/phases/destroy.go | 12 +- bundle/phases/heartbeat.go | 8 +- libs/testserver/deployment_metadata.go | 64 ++++---- libs/{tempdms => tmpdms}/api.go | 2 +- libs/{tempdms => tmpdms}/types.go | 4 +- 11 files changed, 168 insertions(+), 146 deletions(-) rename libs/{tempdms => tmpdms}/api.go (99%) rename libs/{tempdms => tmpdms}/types.go (98%) diff --git a/acceptance/bundle/deploy/metadata-service/test.toml b/acceptance/bundle/deploy/metadata-service/test.toml index 33c8f80dd4..4cebdfc83a 100644 --- a/acceptance/bundle/deploy/metadata-service/test.toml +++ b/acceptance/bundle/deploy/metadata-service/test.toml @@ -1,3 +1,3 @@ EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] -EnvMatrix.DATABRICKS_BUNDLE_DEPLOYMENT_SERVICE = ["true"] +EnvMatrix.DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] RecordRequests = true diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index ea3f615f7f..68b5672257 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -84,11 +84,24 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa logdiag.LogError(ctx, fmt.Errorf("%s: Unexpected delete action during migration", errorPrefix)) return false } + + // Capture the resource ID before deletion for operation reporting. + var deleteResourceID string + if b.OperationReporter != nil { + if dbentry, ok := b.StateDB.GetResourceEntry(resourceKey); ok { + deleteResourceID = dbentry.ID + } + } + err = d.Destroy(ctx, &b.StateDB) if err != nil { logdiag.LogError(ctx, fmt.Errorf("%s: %w", errorPrefix, err)) return false } + + if b.OperationReporter != nil { + b.OperationReporter(ctx, resourceKey, deleteResourceID, action) + } return true } @@ -128,6 +141,12 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa logdiag.LogError(ctx, fmt.Errorf("%s: %w", errorPrefix, err)) return false } + + // Report the operation inline to the metadata service. + if b.OperationReporter != nil && !migrateMode { + dbentry, _ := b.StateDB.GetResourceEntry(resourceKey) + b.OperationReporter(ctx, resourceKey, dbentry.ID, action) + } } // TODO: Note, we only really need remote state if there are remote references. diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 74e72e79b0..7932c040eb 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -37,6 +37,11 @@ type DeploymentUnit struct { DependsOn []deployplan.DependsOnEntry } +// OperationReporter is called after each successful resource operation to report +// it to the deployment metadata service. It is best-effort: failures are logged +// as warnings by the caller. +type OperationReporter func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType) + // DeploymentBundle holds everything needed to deploy a bundle type DeploymentBundle struct { StateDB dstate.DeploymentState @@ -44,6 +49,10 @@ type DeploymentBundle struct { Plan *deployplan.Plan RemoteStateCache sync.Map StateCache structvar.Cache + + // OperationReporter, when set, is called inline after each successful + // resource Create/Update/Delete to report the operation to the metadata service. + OperationReporter OperationReporter } // SetRemoteState updates the remote state with type validation and marks as fresh. diff --git a/bundle/env/deployment_metadata.go b/bundle/env/deployment_metadata.go index 60e896c045..a4d08c7cd0 100644 --- a/bundle/env/deployment_metadata.go +++ b/bundle/env/deployment_metadata.go @@ -2,14 +2,14 @@ package env import "context" -// deploymentServiceVariable names the environment variable that controls whether the -// deployment metadata service is used for locking and resource state management. -const deploymentServiceVariable = "DATABRICKS_BUNDLE_DEPLOYMENT_SERVICE" +// managedStateVariable names the environment variable that controls whether +// server-managed state is used for locking and resource state management. +const managedStateVariable = "DATABRICKS_BUNDLE_MANAGED_STATE" -// DeploymentService returns the environment variable that controls whether the -// deployment metadata service is used for locking and resource state management. -func DeploymentService(ctx context.Context) (string, bool) { +// ManagedState returns the environment variable that controls whether +// server-managed state is used for locking and resource state management. +func ManagedState(ctx context.Context) (string, bool) { return get(ctx, []string{ - deploymentServiceVariable, + managedStateVariable, }) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 5ca8745f06..9c067bcff4 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -26,7 +26,7 @@ import ( "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" "github.com/databricks/cli/libs/sync" - "github.com/databricks/cli/libs/tempdms" + "github.com/databricks/cli/libs/tmpdms" ) func approvalForDeploy(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) (bool, error) { @@ -100,11 +100,7 @@ func approvalForDeploy(ctx context.Context, b *bundle.Bundle, plan *deployplan.P return approved, nil } -// postApplyHook is called after the deployment plan is applied (terraform/direct Apply). -// It can be used for additional state reporting (e.g. to the metadata service). -type postApplyHook func(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) - -func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType, hook postApplyHook) { +func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType) { cmdio.LogString(ctx, "Deploying resources...") if targetEngine.IsDirect() { @@ -116,11 +112,6 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta // Even if deployment failed, there might be updates in states that we need to upload. statemgmt.PushResourcesState(ctx, b, targetEngine) - // Run any additional post-apply logic (e.g. metadata service operation reporting). - if hook != nil { - hook(ctx, b, plan) - } - if logdiag.HasError(ctx) { return } @@ -150,7 +141,7 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li // The deploy phase deploys artifacts and resources. // If readPlanPath is provided, the plan is loaded from that file instead of being calculated. func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, targetEngine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { - useMetadataService, _ := env.DeploymentService(ctx) + useMetadataService, _ := env.ManagedState(ctx) if useMetadataService == "true" { log.Info(ctx, "Phase: deploy (with metadata service)") @@ -164,20 +155,16 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand } // Acquire the deployment lock. - var svc *tempdms.DeploymentMetadataAPI - var deploymentID, versionID string var failed bool if useMetadataService == "true" { - var err error - svc, err = tempdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) + svc, err := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) if err != nil { logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) return } - var cleanup func(failed bool) - deploymentID, versionID, cleanup, err = deployMetadataLock(ctx, b, svc, tempdms.VersionTypeDeploy) + deploymentID, versionID, cleanup, err := deployMetadataLock(ctx, b, svc, tmpdms.VersionTypeDeploy) if err != nil { logdiag.LogError(ctx, err) return @@ -185,6 +172,10 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand defer func() { cleanup(failed || logdiag.HasError(ctx)) }() + + if targetEngine.IsDirect() { + b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) + } } else { bundle.ApplyContext(ctx, b, lock.Acquire()) if logdiag.HasError(ctx) { @@ -242,15 +233,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - // Build the post-apply hook for metadata service reporting (nil for file-based). - var hook postApplyHook - if useMetadataService == "true" { - hook = func(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) { - reportOperations(ctx, svc, deploymentID, versionID, plan) - } - } - - deployCore(ctx, b, plan, targetEngine, hook) + deployCore(ctx, b, plan, targetEngine) if logdiag.HasError(ctx) { failed = true return diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go index 40d1d7d620..85d7290805 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/phases/deploy_metadata.go @@ -5,15 +5,18 @@ import ( "errors" "fmt" "net/http" + "os" + "path/filepath" + "strconv" "time" "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/deploy" "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/internal/build" "github.com/databricks/cli/libs/log" - "github.com/databricks/cli/libs/logdiag" - "github.com/databricks/cli/libs/tempdms" + "github.com/databricks/cli/libs/tmpdms" "github.com/databricks/databricks-sdk-go/apierr" "github.com/google/uuid" ) @@ -23,23 +26,33 @@ import ( // // It returns a cleanup function that must be deferred by the caller to release // the lock and stop the heartbeat, as well as any error from acquiring the lock. -func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tempdms.DeploymentMetadataAPI, versionType tempdms.VersionType) (deploymentID, versionID string, cleanup func(failed bool), err error) { - // Load local deployment state to get the deployment ID and sequence number. - state, loadErr := deploy.LoadState(ctx, b) - if loadErr != nil { - return "", "", nil, fmt.Errorf("failed to load deployment state: %w", loadErr) +func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID, versionID string, cleanup func(failed bool), err error) { + // Read the lineage from resources.json (direct engine state) for the deployment ID. + _, localPath := b.StateFilenameDirect(ctx) + var stateDB dstate.DeploymentState + if openErr := stateDB.Open(localPath); openErr != nil { + return "", "", nil, fmt.Errorf("failed to open resources state: %w", openErr) } - // Generate a deployment ID if one doesn't exist yet. - if state.ID == uuid.Nil { - state.ID = uuid.New() + deploymentID = stateDB.Data.Lineage + if deploymentID == "" { + deploymentID = uuid.New().String() + } + + // Write the deployment ID to _deployment_id for external tooling. + stateDir := filepath.Dir(localPath) + if mkdirErr := os.MkdirAll(stateDir, 0o755); mkdirErr != nil { + return "", "", nil, fmt.Errorf("failed to create state directory: %w", mkdirErr) + } + deploymentIDPath := filepath.Join(stateDir, "_deployment_id") + if writeErr := os.WriteFile(deploymentIDPath, []byte(deploymentID), 0o600); writeErr != nil { + return "", "", nil, fmt.Errorf("failed to write deployment ID: %w", writeErr) } - deploymentID = state.ID.String() // Ensure the deployment exists in the metadata service. - _, createErr := svc.CreateDeployment(ctx, tempdms.CreateDeploymentRequest{ + _, createErr := svc.CreateDeployment(ctx, tmpdms.CreateDeploymentRequest{ DeploymentID: deploymentID, - Deployment: &tempdms.Deployment{ + Deployment: &tmpdms.Deployment{ TargetName: b.Config.Bundle.Target, }, }) @@ -47,13 +60,30 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tempdms.Depl return "", "", nil, fmt.Errorf("failed to create deployment: %w", createErr) } + // Get the deployment to determine the next version ID. + dep, getErr := svc.GetDeployment(ctx, tmpdms.GetDeploymentRequest{ + DeploymentID: deploymentID, + }) + if getErr != nil { + return "", "", nil, fmt.Errorf("failed to get deployment: %w", getErr) + } + + if dep.LastVersionID == "" { + versionID = "1" + } else { + lastVersion, parseErr := strconv.ParseInt(dep.LastVersionID, 10, 64) + if parseErr != nil { + return "", "", nil, fmt.Errorf("failed to parse last_version_id %q: %w", dep.LastVersionID, parseErr) + } + versionID = strconv.FormatInt(lastVersion+1, 10) + } + // Create a version to acquire the deployment lock. - versionID = fmt.Sprintf("%d", state.Seq+1) - version, versionErr := svc.CreateVersion(ctx, tempdms.CreateVersionRequest{ + version, versionErr := svc.CreateVersion(ctx, tmpdms.CreateVersionRequest{ DeploymentID: deploymentID, Parent: fmt.Sprintf("deployments/%s", deploymentID), VersionID: versionID, - Version: &tempdms.Version{ + Version: &tmpdms.Version{ CliVersion: build.GetInfo().Version, VersionType: versionType, TargetName: b.Config.Bundle.Target, @@ -71,9 +101,9 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tempdms.Depl cleanup = func(failed bool) { stopHeartbeat() - reason := tempdms.VersionCompleteSuccess + reason := tmpdms.VersionCompleteSuccess if failed { - reason = tempdms.VersionCompleteFailure + reason = tmpdms.VersionCompleteFailure } // Use a separate context for cleanup so the lock is released even if the @@ -81,7 +111,7 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tempdms.Depl cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - _, completeErr := svc.CompleteVersion(cleanupCtx, tempdms.CompleteVersionRequest{ + _, completeErr := svc.CompleteVersion(cleanupCtx, tmpdms.CompleteVersionRequest{ DeploymentID: deploymentID, VersionID: versionID, Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), @@ -97,48 +127,40 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tempdms.Depl return deploymentID, versionID, cleanup, nil } -// reportOperations reports each resource operation to the metadata service. -// This is best-effort: failures are logged as warnings, not fatal errors. -func reportOperations(ctx context.Context, svc *tempdms.DeploymentMetadataAPI, deploymentID, versionID string, plan *deployplan.Plan) { - if plan == nil { - return - } - - // Fetch existing resources to determine if this is the first time we're - // tracking each resource in the metadata service. - knownResources := map[string]bool{} - existing, err := svc.ListResources(ctx, tempdms.ListResourcesRequest{ - DeploymentID: deploymentID, - Parent: fmt.Sprintf("deployments/%s", deploymentID), - }) - if err != nil { - log.Warnf(ctx, "Failed to list existing resources from metadata service, will use INITIAL_REGISTER for all: %v", err) - } else { - for _, r := range existing { - knownResources[r.ResourceKey] = true - } +// planActionToOperationAction maps a deploy plan action to a metadata service operation action type. +func planActionToOperationAction(action deployplan.ActionType) tmpdms.OperationActionType { + switch action { + case deployplan.Create: + return tmpdms.OperationActionTypeCreate + case deployplan.Update: + return tmpdms.OperationActionTypeUpdate + case deployplan.Delete: + return tmpdms.OperationActionTypeDelete + case deployplan.Recreate: + return tmpdms.OperationActionTypeRecreate + default: + return tmpdms.OperationActionTypeUnspecified } +} - for resourceKey, entry := range plan.Plan { - var actionType tempdms.OperationActionType - if knownResources[resourceKey] { - actionType = planActionToOperationAction(entry.Action) - } else { - actionType = tempdms.OperationActionTypeInitRegister - } - - if actionType == tempdms.OperationActionTypeUnspecified { - continue +// makeOperationReporter returns an OperationReporter that reports each resource +// operation to the metadata service. Failures are logged as warnings. +func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.OperationReporter { + return func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType) { + actionType := planActionToOperationAction(action) + if actionType == tmpdms.OperationActionTypeUnspecified { + return } - _, err := svc.CreateOperation(ctx, tempdms.CreateOperationRequest{ + _, err := svc.CreateOperation(ctx, tmpdms.CreateOperationRequest{ DeploymentID: deploymentID, VersionID: versionID, Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), ResourceKey: resourceKey, - Operation: &tempdms.Operation{ + Operation: &tmpdms.Operation{ ResourceKey: resourceKey, - Status: tempdms.OperationStatusSucceeded, + ResourceID: resourceID, + Status: tmpdms.OperationStatusSucceeded, ActionType: actionType, }, }) @@ -148,21 +170,6 @@ func reportOperations(ctx context.Context, svc *tempdms.DeploymentMetadataAPI, d } } -func planActionToOperationAction(action deployplan.ActionType) tempdms.OperationActionType { - switch action { - case deployplan.Create: - return tempdms.OperationActionTypeCreate - case deployplan.Update: - return tempdms.OperationActionTypeUpdate - case deployplan.Delete: - return tempdms.OperationActionTypeDelete - case deployplan.Recreate: - return tempdms.OperationActionTypeRecreate - default: - return tempdms.OperationActionTypeUnspecified - } -} - // isAlreadyExists checks if an error indicates the resource already exists (HTTP 409). func isAlreadyExists(err error) bool { var apiErr *apierr.APIError diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 81e52a3445..320b8e75dd 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -18,7 +18,7 @@ import ( "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" - "github.com/databricks/cli/libs/tempdms" + "github.com/databricks/cli/libs/tmpdms" "github.com/databricks/databricks-sdk-go/apierr" ) @@ -117,7 +117,7 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, t // The destroy phase deletes artifacts and resources. func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineType) { - useMetadataService, _ := env.DeploymentService(ctx) + useMetadataService, _ := env.ManagedState(ctx) if useMetadataService == "true" { log.Info(ctx, "Phase: destroy (with metadata service)") @@ -139,13 +139,13 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy var failed bool if useMetadataService == "true" { - svc, svcErr := tempdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) + svc, svcErr := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) if svcErr != nil { logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", svcErr)) return } - _, _, cleanup, lockErr := deployMetadataLock(ctx, b, svc, tempdms.VersionTypeDestroy) + deploymentID, versionID, cleanup, lockErr := deployMetadataLock(ctx, b, svc, tmpdms.VersionTypeDestroy) if lockErr != nil { logdiag.LogError(ctx, lockErr) return @@ -153,6 +153,10 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy defer func() { cleanup(failed || logdiag.HasError(ctx)) }() + + if targetEngine.IsDirect() { + b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) + } } else { bundle.ApplyContext(ctx, b, lock.Acquire()) if logdiag.HasError(ctx) { diff --git a/bundle/phases/heartbeat.go b/bundle/phases/heartbeat.go index 1f9b3d41d1..925c53193c 100644 --- a/bundle/phases/heartbeat.go +++ b/bundle/phases/heartbeat.go @@ -5,14 +5,14 @@ import ( "time" "github.com/databricks/cli/libs/log" - "github.com/databricks/cli/libs/tempdms" + "github.com/databricks/cli/libs/tmpdms" ) -const defaultHeartbeatInterval = 2 * time.Minute +const defaultHeartbeatInterval = 30 * time.Second // startHeartbeat starts a background goroutine that sends heartbeats to keep // the deployment lock alive. Returns a cancel function to stop the heartbeat. -func startHeartbeat(ctx context.Context, svc *tempdms.DeploymentMetadataAPI, deploymentID, versionID string, interval time.Duration) context.CancelFunc { +func startHeartbeat(ctx context.Context, svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string, interval time.Duration) context.CancelFunc { ctx, cancel := context.WithCancel(ctx) go func() { @@ -24,7 +24,7 @@ func startHeartbeat(ctx context.Context, svc *tempdms.DeploymentMetadataAPI, dep case <-ctx.Done(): return case <-ticker.C: - _, err := svc.Heartbeat(ctx, tempdms.HeartbeatRequest{ + _, err := svc.Heartbeat(ctx, tmpdms.HeartbeatRequest{ DeploymentID: deploymentID, VersionID: versionID, }) diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go index a1968c8638..eaed985156 100644 --- a/libs/testserver/deployment_metadata.go +++ b/libs/testserver/deployment_metadata.go @@ -8,23 +8,23 @@ import ( "strings" "time" - "github.com/databricks/cli/libs/tempdms" + "github.com/databricks/cli/libs/tmpdms" ) // deploymentMetadataState holds in-memory state for the deployment metadata service. // Stored per-workspace inside FakeWorkspace. type deploymentMetadataState struct { // deployments keyed by deployment_id - deployments map[string]tempdms.Deployment + deployments map[string]tmpdms.Deployment // versions keyed by "deploymentId/versionId" - versions map[string]tempdms.Version + versions map[string]tmpdms.Version // operations keyed by "deploymentId/versionId/resourceKey" - operations map[string]tempdms.Operation + operations map[string]tmpdms.Operation // resources keyed by "deploymentId/resourceKey" - resources map[string]tempdms.Resource + resources map[string]tmpdms.Resource // lock state per deployment: which version holds the lock and when it expires lockHolder map[string]string // deploymentId -> "deployments/{id}/versions/{vid}" @@ -33,10 +33,10 @@ type deploymentMetadataState struct { func newDeploymentMetadataState() *deploymentMetadataState { return &deploymentMetadataState{ - deployments: map[string]tempdms.Deployment{}, - versions: map[string]tempdms.Version{}, - operations: map[string]tempdms.Operation{}, - resources: map[string]tempdms.Resource{}, + deployments: map[string]tmpdms.Deployment{}, + versions: map[string]tmpdms.Version{}, + operations: map[string]tmpdms.Operation{}, + resources: map[string]tmpdms.Resource{}, lockHolder: map[string]string{}, lockExpiry: map[string]time.Time{}, } @@ -57,7 +57,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateDeployment(req Request) Response } // The body maps to the Deployment sub-message. - var bodyDeployment tempdms.Deployment + var bodyDeployment tmpdms.Deployment if len(req.Body) > 0 { if err := json.Unmarshal(req.Body, &bodyDeployment); err != nil { return Response{ @@ -76,11 +76,11 @@ func (s *FakeWorkspace) DeploymentMetadataCreateDeployment(req Request) Response } now := time.Now().UTC() - deployment := tempdms.Deployment{ + deployment := tmpdms.Deployment{ Name: fmt.Sprintf("deployments/%s", deploymentID), DisplayName: deploymentID, TargetName: bodyDeployment.TargetName, - Status: tempdms.DeploymentStatusActive, + Status: tmpdms.DeploymentStatusActive, CreatedBy: s.CurrentUser().UserName, CreateTime: &now, UpdateTime: &now, @@ -117,7 +117,7 @@ func (s *FakeWorkspace) DeploymentMetadataDeleteDeployment(deploymentID string) } now := time.Now().UTC() - deployment.Status = tempdms.DeploymentStatusDeleted + deployment.Status = tmpdms.DeploymentStatusDeleted deployment.DestroyTime = &now deployment.DestroyedBy = s.CurrentUser().UserName deployment.UpdateTime = &now @@ -148,7 +148,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI } // The body maps to the Version sub-message. - var bodyVersion tempdms.Version + var bodyVersion tmpdms.Version if len(req.Body) > 0 { if err := json.Unmarshal(req.Body, &bodyVersion); err != nil { return Response{ @@ -197,12 +197,12 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI } versionKey := deploymentID + "/" + versionID - version := tempdms.Version{ + version := tmpdms.Version{ Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), VersionID: versionID, CreatedBy: s.CurrentUser().UserName, CreateTime: &now, - Status: tempdms.VersionStatusInProgress, + Status: tmpdms.VersionStatusInProgress, } version.CliVersion = bodyVersion.CliVersion version.VersionType = bodyVersion.VersionType @@ -216,7 +216,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI // Update the deployment's last_version_id and status. deployment.LastVersionID = versionID - deployment.Status = tempdms.DeploymentStatusInProgress + deployment.Status = tmpdms.DeploymentStatusInProgress deployment.UpdateTime = &now state.deployments[deploymentID] = deployment @@ -251,7 +251,7 @@ func (s *FakeWorkspace) DeploymentMetadataHeartbeat(req Request, deploymentID, v } } - if version.Status != tempdms.VersionStatusInProgress { + if version.Status != tmpdms.VersionStatusInProgress { return Response{ StatusCode: http.StatusConflict, Body: map[string]string{"error_code": "ABORTED", "message": "version is no longer in progress"}, @@ -272,7 +272,7 @@ func (s *FakeWorkspace) DeploymentMetadataHeartbeat(req Request, deploymentID, v newExpiry := now.Add(lockDuration) state.lockExpiry[deploymentID] = newExpiry - return Response{Body: tempdms.HeartbeatResponse{ExpireTime: &newExpiry}} + return Response{Body: tmpdms.HeartbeatResponse{ExpireTime: &newExpiry}} } func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymentID, versionID string) Response { @@ -288,14 +288,14 @@ func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymen } } - if version.Status != tempdms.VersionStatusInProgress { + if version.Status != tmpdms.VersionStatusInProgress { return Response{ StatusCode: http.StatusConflict, Body: map[string]string{"error_code": "ABORTED", "message": "version is already completed"}, } } - var completeReq tempdms.CompleteVersionRequest + var completeReq tmpdms.CompleteVersionRequest if err := json.Unmarshal(req.Body, &completeReq); err != nil { return Response{ StatusCode: http.StatusBadRequest, @@ -304,7 +304,7 @@ func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymen } now := time.Now().UTC() - version.Status = tempdms.VersionStatusCompleted + version.Status = tmpdms.VersionStatusCompleted version.CompleteTime = &now version.CompletionReason = completeReq.CompletionReason version.CompletedBy = s.CurrentUser().UserName @@ -317,10 +317,10 @@ func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymen // Update deployment status based on completion reason. if deployment, ok := state.deployments[deploymentID]; ok { switch completeReq.CompletionReason { - case tempdms.VersionCompleteSuccess: - deployment.Status = tempdms.DeploymentStatusActive - case tempdms.VersionCompleteFailure, tempdms.VersionCompleteForceAbort, tempdms.VersionCompleteLeaseExpired: - deployment.Status = tempdms.DeploymentStatusFailed + case tmpdms.VersionCompleteSuccess: + deployment.Status = tmpdms.DeploymentStatusActive + case tmpdms.VersionCompleteFailure, tmpdms.VersionCompleteForceAbort, tmpdms.VersionCompleteLeaseExpired: + deployment.Status = tmpdms.DeploymentStatusFailed } deployment.UpdateTime = &now state.deployments[deploymentID] = deployment @@ -344,7 +344,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateOperation(req Request, deploymen } // The body maps to the Operation sub-message. - var bodyOperation tempdms.Operation + var bodyOperation tmpdms.Operation if len(req.Body) > 0 { if err := json.Unmarshal(req.Body, &bodyOperation); err != nil { return Response{ @@ -356,7 +356,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateOperation(req Request, deploymen now := time.Now().UTC() opKey := deploymentID + "/" + versionID + "/" + resourceKey - operation := tempdms.Operation{ + operation := tmpdms.Operation{ Name: fmt.Sprintf("deployments/%s/versions/%s/operations/%s", deploymentID, versionID, resourceKey), ResourceKey: resourceKey, CreateTime: &now, @@ -371,7 +371,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateOperation(req Request, deploymen // Upsert the deployment-level resource. resKey := deploymentID + "/" + resourceKey - resource := tempdms.Resource{ + resource := tmpdms.Resource{ Name: fmt.Sprintf("deployments/%s/resources/%s", deploymentID, resourceKey), ResourceKey: resourceKey, State: bodyOperation.State, @@ -389,14 +389,14 @@ func (s *FakeWorkspace) DeploymentMetadataListResources(deploymentID string) Res state := s.deploymentMetadata prefix := deploymentID + "/" - var resources []tempdms.Resource + var resources []tmpdms.Resource for key, resource := range state.resources { if strings.HasPrefix(key, prefix) { resources = append(resources, resource) } } if resources == nil { - resources = []tempdms.Resource{} + resources = []tmpdms.Resource{} } - return Response{Body: tempdms.ListResourcesResponse{Resources: resources}} + return Response{Body: tmpdms.ListResourcesResponse{Resources: resources}} } diff --git a/libs/tempdms/api.go b/libs/tmpdms/api.go similarity index 99% rename from libs/tempdms/api.go rename to libs/tmpdms/api.go index 005425fa68..a2bbd5857f 100644 --- a/libs/tempdms/api.go +++ b/libs/tmpdms/api.go @@ -1,4 +1,4 @@ -package tempdms +package tmpdms import ( "context" diff --git a/libs/tempdms/types.go b/libs/tmpdms/types.go similarity index 98% rename from libs/tempdms/types.go rename to libs/tmpdms/types.go index b36386b438..ba1ca723c2 100644 --- a/libs/tempdms/types.go +++ b/libs/tmpdms/types.go @@ -1,8 +1,8 @@ -// Package tempdms is a temporary client library for the Deployment Metadata Service. +// Package tmpdms is a temporary client library for the Deployment Metadata Service. // It mirrors the structure that the Databricks Go SDK will eventually generate from // the service's proto definitions. When the protos land in the SDK, migration should // be a straightforward import path change. -package tempdms +package tmpdms import "time" From b1a9a0a3f05eb70d5125ead3ad937fd6196ef6d0 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 31 Mar 2026 22:24:48 +0000 Subject: [PATCH 08/25] Add acceptance test golden files and fix SDK compatibility Fix map[string]string -> map[string]any in tmpdms API client for SDK v0.126.0 compatibility. Generate golden files for metadata-service acceptance test showing the full deploy/destroy request flow. Co-authored-by: Isaac --- .../deploy/metadata-service/out.test.toml | 6 + .../bundle/deploy/metadata-service/output.txt | 113 ++++++++++++++++++ libs/tmpdms/api.go | 6 +- 3 files changed, 122 insertions(+), 3 deletions(-) create mode 100644 acceptance/bundle/deploy/metadata-service/out.test.toml create mode 100644 acceptance/bundle/deploy/metadata-service/output.txt diff --git a/acceptance/bundle/deploy/metadata-service/out.test.toml b/acceptance/bundle/deploy/metadata-service/out.test.toml new file mode 100644 index 0000000000..6ce208a048 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/out.test.toml @@ -0,0 +1,6 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] + DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] diff --git a/acceptance/bundle/deploy/metadata-service/output.txt b/acceptance/bundle/deploy/metadata-service/output.txt new file mode 100644 index 0000000000..e988794ab6 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/output.txt @@ -0,0 +1,113 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/metadata-service-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> print_requests.py --get //bundle +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "1" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": 1, + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "q": { + "resource_key": "resources.jobs.test_job" + }, + "body": { + "resource_key": "resources.jobs.test_job", + "action_type": 4, + "resource_id": "[NUMID]", + "status": 1 + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "body": { + "name": "deployments/[UUID]/versions/1", + "completion_reason": 1 + } +} + +>>> [CLI] bundle destroy --auto-approve +The following resources will be deleted: + delete resources.jobs.test_job + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/metadata-service-test/default + +Deleting files... +Destroy complete! + +>>> print_requests.py --get //bundle +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "1" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": 2, + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "q": { + "resource_key": "resources.jobs.test_job" + }, + "body": { + "resource_key": "resources.jobs.test_job", + "action_type": 6, + "resource_id": "[NUMID]", + "status": 1 + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "body": { + "name": "deployments/[UUID]/versions/1", + "completion_reason": 1 + } +} diff --git a/libs/tmpdms/api.go b/libs/tmpdms/api.go index a2bbd5857f..3f6fcf1957 100644 --- a/libs/tmpdms/api.go +++ b/libs/tmpdms/api.go @@ -34,7 +34,7 @@ func NewDeploymentMetadataAPI(w *databricks.WorkspaceClient) (*DeploymentMetadat func (a *DeploymentMetadataAPI) CreateDeployment(ctx context.Context, request CreateDeploymentRequest) (*Deployment, error) { var resp Deployment path := fmt.Sprintf("%s/deployments", basePath) - query := map[string]string{"deployment_id": request.DeploymentID} + query := map[string]any{"deployment_id": request.DeploymentID} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Deployment, &resp) if err != nil { return nil, mapError("create deployment", err) @@ -65,7 +65,7 @@ func (a *DeploymentMetadataAPI) DeleteDeployment(ctx context.Context, request De func (a *DeploymentMetadataAPI) CreateVersion(ctx context.Context, request CreateVersionRequest) (*Version, error) { var resp Version path := fmt.Sprintf("%s/deployments/%s/versions", basePath, request.DeploymentID) - query := map[string]string{"version_id": request.VersionID} + query := map[string]any{"version_id": request.VersionID} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Version, &resp) if err != nil { return nil, mapError("create version", err) @@ -106,7 +106,7 @@ func (a *DeploymentMetadataAPI) CompleteVersion(ctx context.Context, request Com func (a *DeploymentMetadataAPI) CreateOperation(ctx context.Context, request CreateOperationRequest) (*Operation, error) { var resp Operation path := fmt.Sprintf("%s/deployments/%s/versions/%s/operations", basePath, request.DeploymentID, request.VersionID) - query := map[string]string{"resource_key": request.ResourceKey} + query := map[string]any{"resource_key": request.ResourceKey} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Operation, &resp) if err != nil { return nil, mapError("create operation", err) From 8756548c98fb5af628072d09ceeb58d588f1d2c5 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 31 Mar 2026 23:11:36 +0000 Subject: [PATCH 09/25] Use string enums, report failed operations, and refactor lock acquisition - Change all enum types from int to string using proto enum name strings (e.g. "OPERATION_ACTION_TYPE_CREATE" instead of 4), matching proto-over-HTTP serialization format. - Report failed operations to the metadata service with error messages, not just successful ones. - Enforce direct deployment engine for managed state (early return). - Extract acquireMetadataLock helper to deduplicate deploy/destroy lock blocks. - Add deploy-error acceptance test verifying failed operation reporting. Co-authored-by: Isaac --- .../deploy-error/databricks.yml | 7 + .../deploy-error/out.test.toml | 6 + .../metadata-service/deploy-error/output.txt | 61 +++++++++ .../metadata-service/deploy-error/script | 5 + .../metadata-service/deploy-error/test.toml | 8 ++ .../bundle/deploy/metadata-service/output.txt | 16 +-- bundle/direct/bundle_apply.go | 22 ++-- bundle/direct/pkg.go | 9 +- bundle/phases/deploy.go | 14 +- bundle/phases/deploy_metadata.go | 41 +++++- bundle/phases/destroy.go | 12 +- libs/tmpdms/types.go | 121 +++++++++--------- 12 files changed, 213 insertions(+), 109 deletions(-) create mode 100644 acceptance/bundle/deploy/metadata-service/deploy-error/databricks.yml create mode 100644 acceptance/bundle/deploy/metadata-service/deploy-error/out.test.toml create mode 100644 acceptance/bundle/deploy/metadata-service/deploy-error/output.txt create mode 100644 acceptance/bundle/deploy/metadata-service/deploy-error/script create mode 100644 acceptance/bundle/deploy/metadata-service/deploy-error/test.toml diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/databricks.yml b/acceptance/bundle/deploy/metadata-service/deploy-error/databricks.yml new file mode 100644 index 0000000000..4786eeddf7 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/deploy-error/databricks.yml @@ -0,0 +1,7 @@ +bundle: + name: metadata-service-error-test + +resources: + jobs: + test_job: + name: test-job diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/out.test.toml b/acceptance/bundle/deploy/metadata-service/deploy-error/out.test.toml new file mode 100644 index 0000000000..6ce208a048 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/deploy-error/out.test.toml @@ -0,0 +1,6 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] + DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/output.txt b/acceptance/bundle/deploy/metadata-service/deploy-error/output.txt new file mode 100644 index 0000000000..db0a3d43e1 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/deploy-error/output.txt @@ -0,0 +1,61 @@ + +>>> musterr [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/metadata-service-error-test/default/files... +Deploying resources... +Error: cannot create resources.jobs.test_job: Invalid job configuration. (400 INVALID_PARAMETER_VALUE) + +Endpoint: POST [DATABRICKS_URL]/api/2.2/jobs/create +HTTP Status: 400 Bad Request +API error_code: INVALID_PARAMETER_VALUE +API message: Invalid job configuration. + +Updating deployment state... + +>>> print_requests.py --get //bundle +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "1" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": "VERSION_TYPE_DEPLOY", + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "q": { + "resource_key": "resources.jobs.test_job" + }, + "body": { + "resource_key": "resources.jobs.test_job", + "action_type": "OPERATION_ACTION_TYPE_CREATE", + "status": "OPERATION_STATUS_FAILED", + "error_message": "Invalid job configuration." + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "body": { + "name": "deployments/[UUID]/versions/1", + "completion_reason": "VERSION_COMPLETE_FAILURE" + } +} diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/script b/acceptance/bundle/deploy/metadata-service/deploy-error/script new file mode 100644 index 0000000000..806beae3de --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/deploy-error/script @@ -0,0 +1,5 @@ +# Deploy with the metadata service enabled, expecting a resource creation failure. +trace musterr $CLI bundle deploy + +# Print the metadata service requests to verify the failed operation is reported. +trace print_requests.py --get //bundle diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/test.toml b/acceptance/bundle/deploy/metadata-service/deploy-error/test.toml new file mode 100644 index 0000000000..9d7f2c1348 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/deploy-error/test.toml @@ -0,0 +1,8 @@ +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] +RecordRequests = true + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.StatusCode = 400 +Response.Body = '{"error_code": "INVALID_PARAMETER_VALUE", "message": "Invalid job configuration."}' diff --git a/acceptance/bundle/deploy/metadata-service/output.txt b/acceptance/bundle/deploy/metadata-service/output.txt index e988794ab6..9006297906 100644 --- a/acceptance/bundle/deploy/metadata-service/output.txt +++ b/acceptance/bundle/deploy/metadata-service/output.txt @@ -28,7 +28,7 @@ Deployment complete! }, "body": { "cli_version": "[DEV_VERSION]", - "version_type": 1, + "version_type": "VERSION_TYPE_DEPLOY", "target_name": "default" } } @@ -40,9 +40,9 @@ Deployment complete! }, "body": { "resource_key": "resources.jobs.test_job", - "action_type": 4, + "action_type": "OPERATION_ACTION_TYPE_CREATE", "resource_id": "[NUMID]", - "status": 1 + "status": "OPERATION_STATUS_SUCCEEDED" } } { @@ -50,7 +50,7 @@ Deployment complete! "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", "body": { "name": "deployments/[UUID]/versions/1", - "completion_reason": 1 + "completion_reason": "VERSION_COMPLETE_SUCCESS" } } @@ -86,7 +86,7 @@ Destroy complete! }, "body": { "cli_version": "[DEV_VERSION]", - "version_type": 2, + "version_type": "VERSION_TYPE_DESTROY", "target_name": "default" } } @@ -98,9 +98,9 @@ Destroy complete! }, "body": { "resource_key": "resources.jobs.test_job", - "action_type": 6, + "action_type": "OPERATION_ACTION_TYPE_DELETE", "resource_id": "[NUMID]", - "status": 1 + "status": "OPERATION_STATUS_SUCCEEDED" } } { @@ -108,6 +108,6 @@ Destroy complete! "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", "body": { "name": "deployments/[UUID]/versions/1", - "completion_reason": 1 + "completion_reason": "VERSION_COMPLETE_SUCCESS" } } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 68b5672257..95cf32f416 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -94,14 +94,13 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } err = d.Destroy(ctx, &b.StateDB) + if b.OperationReporter != nil { + b.OperationReporter(ctx, resourceKey, deleteResourceID, action, err) + } if err != nil { logdiag.LogError(ctx, fmt.Errorf("%s: %w", errorPrefix, err)) return false } - - if b.OperationReporter != nil { - b.OperationReporter(ctx, resourceKey, deleteResourceID, action) - } return true } @@ -137,16 +136,19 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa err = d.Deploy(ctx, &b.StateDB, sv.Value, action, entry) } + // Report the operation inline to the metadata service. + if b.OperationReporter != nil && !migrateMode { + var resourceID string + if dbentry, ok := b.StateDB.GetResourceEntry(resourceKey); ok { + resourceID = dbentry.ID + } + b.OperationReporter(ctx, resourceKey, resourceID, action, err) + } + if err != nil { logdiag.LogError(ctx, fmt.Errorf("%s: %w", errorPrefix, err)) return false } - - // Report the operation inline to the metadata service. - if b.OperationReporter != nil && !migrateMode { - dbentry, _ := b.StateDB.GetResourceEntry(resourceKey) - b.OperationReporter(ctx, resourceKey, dbentry.ID, action) - } } // TODO: Note, we only really need remote state if there are remote references. diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 7932c040eb..aee2265eba 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -37,10 +37,11 @@ type DeploymentUnit struct { DependsOn []deployplan.DependsOnEntry } -// OperationReporter is called after each successful resource operation to report -// it to the deployment metadata service. It is best-effort: failures are logged -// as warnings by the caller. -type OperationReporter func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType) +// OperationReporter is called after each resource operation (success or failure) +// to report it to the deployment metadata service. If operationErr is non-nil the +// operation is recorded as failed with the error message. It is best-effort: +// reporting failures are logged as warnings by the caller. +type OperationReporter func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType, operationErr error) // DeploymentBundle holds everything needed to deploy a bundle type DeploymentBundle struct { diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 9c067bcff4..b2d272dac3 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -3,7 +3,6 @@ package phases import ( "context" "errors" - "fmt" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/artifacts" @@ -109,9 +108,7 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } - // Even if deployment failed, there might be updates in states that we need to upload. statemgmt.PushResourcesState(ctx, b, targetEngine) - if logdiag.HasError(ctx) { return } @@ -158,13 +155,12 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand var failed bool if useMetadataService == "true" { - svc, err := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) - if err != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", err)) + if !targetEngine.IsDirect() { + logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) return } - deploymentID, versionID, cleanup, err := deployMetadataLock(ctx, b, svc, tmpdms.VersionTypeDeploy) + cleanup, err := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDeploy) if err != nil { logdiag.LogError(ctx, err) return @@ -172,10 +168,6 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand defer func() { cleanup(failed || logdiag.HasError(ctx)) }() - - if targetEngine.IsDirect() { - b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) - } } else { bundle.ApplyContext(ctx, b, lock.Acquire()) if logdiag.HasError(ctx) { diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go index 85d7290805..b77c025cde 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/phases/deploy_metadata.go @@ -21,6 +21,24 @@ import ( "github.com/google/uuid" ) +// acquireMetadataLock creates the metadata service client, acquires the deployment +// lock, and sets up the operation reporter on the bundle. It returns a cleanup +// function that releases the lock, or an error if the lock could not be acquired. +func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpdms.VersionType) (cleanup func(failed bool), err error) { + svc, err := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) + if err != nil { + return nil, fmt.Errorf("failed to create metadata service client: %w", err) + } + + deploymentID, versionID, cleanup, err := deployMetadataLock(ctx, b, svc, versionType) + if err != nil { + return nil, err + } + + b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) + return cleanup, nil +} + // deployMetadataLock implements the lock acquire/release lifecycle using the // deployment metadata service (CreateVersion / CompleteVersion). // @@ -120,7 +138,7 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo if completeErr != nil { log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) } else { - log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%d", deploymentID, versionID, reason) + log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", deploymentID, versionID, reason) } } @@ -144,24 +162,33 @@ func planActionToOperationAction(action deployplan.ActionType) tmpdms.OperationA } // makeOperationReporter returns an OperationReporter that reports each resource -// operation to the metadata service. Failures are logged as warnings. +// operation (success or failure) to the metadata service. Reporting failures are +// logged as warnings and do not affect the deploy outcome. func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.OperationReporter { - return func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType) { + return func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType, operationErr error) { actionType := planActionToOperationAction(action) if actionType == tmpdms.OperationActionTypeUnspecified { return } + status := tmpdms.OperationStatusSucceeded + var errorMessage string + if operationErr != nil { + status = tmpdms.OperationStatusFailed + errorMessage = operationErr.Error() + } + _, err := svc.CreateOperation(ctx, tmpdms.CreateOperationRequest{ DeploymentID: deploymentID, VersionID: versionID, Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), ResourceKey: resourceKey, Operation: &tmpdms.Operation{ - ResourceKey: resourceKey, - ResourceID: resourceID, - Status: tmpdms.OperationStatusSucceeded, - ActionType: actionType, + ResourceKey: resourceKey, + ResourceID: resourceID, + Status: status, + ActionType: actionType, + ErrorMessage: errorMessage, }, }) if err != nil { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 320b8e75dd..652126f198 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -3,7 +3,6 @@ package phases import ( "context" "errors" - "fmt" "net/http" "github.com/databricks/cli/bundle" @@ -139,13 +138,12 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy var failed bool if useMetadataService == "true" { - svc, svcErr := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) - if svcErr != nil { - logdiag.LogError(ctx, fmt.Errorf("failed to create metadata service client: %w", svcErr)) + if !targetEngine.IsDirect() { + logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) return } - deploymentID, versionID, cleanup, lockErr := deployMetadataLock(ctx, b, svc, tmpdms.VersionTypeDestroy) + cleanup, lockErr := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDestroy) if lockErr != nil { logdiag.LogError(ctx, lockErr) return @@ -153,10 +151,6 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy defer func() { cleanup(failed || logdiag.HasError(ctx)) }() - - if targetEngine.IsDirect() { - b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) - } } else { bundle.ApplyContext(ctx, b, lock.Acquire()) if logdiag.HasError(ctx) { diff --git a/libs/tmpdms/types.go b/libs/tmpdms/types.go index ba1ca723c2..320823d00e 100644 --- a/libs/tmpdms/types.go +++ b/libs/tmpdms/types.go @@ -7,87 +7,88 @@ package tmpdms import "time" // Enum types matching the proto definitions. +// Values are the proto enum name strings, which is how proto-over-HTTP serializes enums. -type DeploymentStatus int -type VersionStatus int -type VersionComplete int -type VersionType int -type OperationStatus int -type OperationActionType int -type DeploymentResourceType int +type DeploymentStatus string +type VersionStatus string +type VersionComplete string +type VersionType string +type OperationStatus string +type OperationActionType string +type DeploymentResourceType string const ( - DeploymentStatusUnspecified DeploymentStatus = 0 - DeploymentStatusActive DeploymentStatus = 1 - DeploymentStatusFailed DeploymentStatus = 2 - DeploymentStatusInProgress DeploymentStatus = 3 - DeploymentStatusDeleted DeploymentStatus = 4 + DeploymentStatusUnspecified DeploymentStatus = "DEPLOYMENT_STATUS_UNSPECIFIED" + DeploymentStatusActive DeploymentStatus = "DEPLOYMENT_STATUS_ACTIVE" + DeploymentStatusFailed DeploymentStatus = "DEPLOYMENT_STATUS_FAILED" + DeploymentStatusInProgress DeploymentStatus = "DEPLOYMENT_STATUS_IN_PROGRESS" + DeploymentStatusDeleted DeploymentStatus = "DEPLOYMENT_STATUS_DELETED" ) const ( - VersionStatusUnspecified VersionStatus = 0 - VersionStatusInProgress VersionStatus = 1 - VersionStatusCompleted VersionStatus = 2 + VersionStatusUnspecified VersionStatus = "VERSION_STATUS_UNSPECIFIED" + VersionStatusInProgress VersionStatus = "VERSION_STATUS_IN_PROGRESS" + VersionStatusCompleted VersionStatus = "VERSION_STATUS_COMPLETED" ) const ( - VersionCompleteUnspecified VersionComplete = 0 - VersionCompleteSuccess VersionComplete = 1 - VersionCompleteFailure VersionComplete = 2 - VersionCompleteForceAbort VersionComplete = 3 - VersionCompleteLeaseExpired VersionComplete = 4 + VersionCompleteUnspecified VersionComplete = "VERSION_COMPLETE_UNSPECIFIED" + VersionCompleteSuccess VersionComplete = "VERSION_COMPLETE_SUCCESS" + VersionCompleteFailure VersionComplete = "VERSION_COMPLETE_FAILURE" + VersionCompleteForceAbort VersionComplete = "VERSION_COMPLETE_FORCE_ABORT" + VersionCompleteLeaseExpired VersionComplete = "VERSION_COMPLETE_LEASE_EXPIRED" ) const ( - VersionTypeUnspecified VersionType = 0 - VersionTypeDeploy VersionType = 1 - VersionTypeDestroy VersionType = 2 + VersionTypeUnspecified VersionType = "VERSION_TYPE_UNSPECIFIED" + VersionTypeDeploy VersionType = "VERSION_TYPE_DEPLOY" + VersionTypeDestroy VersionType = "VERSION_TYPE_DESTROY" ) const ( - OperationStatusUnspecified OperationStatus = 0 - OperationStatusSucceeded OperationStatus = 1 - OperationStatusFailed OperationStatus = 2 + OperationStatusUnspecified OperationStatus = "OPERATION_STATUS_UNSPECIFIED" + OperationStatusSucceeded OperationStatus = "OPERATION_STATUS_SUCCEEDED" + OperationStatusFailed OperationStatus = "OPERATION_STATUS_FAILED" ) const ( - OperationActionTypeUnspecified OperationActionType = 0 - OperationActionTypeResize OperationActionType = 1 - OperationActionTypeUpdate OperationActionType = 2 - OperationActionTypeUpdateWithID OperationActionType = 3 - OperationActionTypeCreate OperationActionType = 4 - OperationActionTypeRecreate OperationActionType = 5 - OperationActionTypeDelete OperationActionType = 6 - OperationActionTypeBind OperationActionType = 7 - OperationActionTypeBindAndUpdate OperationActionType = 8 - OperationActionTypeInitRegister OperationActionType = 9 + OperationActionTypeUnspecified OperationActionType = "OPERATION_ACTION_TYPE_UNSPECIFIED" + OperationActionTypeResize OperationActionType = "OPERATION_ACTION_TYPE_RESIZE" + OperationActionTypeUpdate OperationActionType = "OPERATION_ACTION_TYPE_UPDATE" + OperationActionTypeUpdateWithID OperationActionType = "OPERATION_ACTION_TYPE_UPDATE_WITH_ID" + OperationActionTypeCreate OperationActionType = "OPERATION_ACTION_TYPE_CREATE" + OperationActionTypeRecreate OperationActionType = "OPERATION_ACTION_TYPE_RECREATE" + OperationActionTypeDelete OperationActionType = "OPERATION_ACTION_TYPE_DELETE" + OperationActionTypeBind OperationActionType = "OPERATION_ACTION_TYPE_BIND" + OperationActionTypeBindAndUpdate OperationActionType = "OPERATION_ACTION_TYPE_BIND_AND_UPDATE" + OperationActionTypeInitRegister OperationActionType = "OPERATION_ACTION_TYPE_INITIAL_REGISTER" ) const ( - ResourceTypeUnspecified DeploymentResourceType = 0 - ResourceTypeJob DeploymentResourceType = 1 - ResourceTypePipeline DeploymentResourceType = 2 - ResourceTypeModel DeploymentResourceType = 4 - ResourceTypeRegisteredModel DeploymentResourceType = 5 - ResourceTypeExperiment DeploymentResourceType = 6 - ResourceTypeServingEndpoint DeploymentResourceType = 7 - ResourceTypeQualityMonitor DeploymentResourceType = 8 - ResourceTypeSchema DeploymentResourceType = 9 - ResourceTypeVolume DeploymentResourceType = 10 - ResourceTypeCluster DeploymentResourceType = 11 - ResourceTypeDashboard DeploymentResourceType = 12 - ResourceTypeApp DeploymentResourceType = 13 - ResourceTypeCatalog DeploymentResourceType = 14 - ResourceTypeExternalLocation DeploymentResourceType = 15 - ResourceTypeSecretScope DeploymentResourceType = 16 - ResourceTypeAlert DeploymentResourceType = 17 - ResourceTypeSQLWarehouse DeploymentResourceType = 18 - ResourceTypeDatabaseInstance DeploymentResourceType = 19 - ResourceTypeDatabaseCatalog DeploymentResourceType = 20 - ResourceTypeSyncedDBTable DeploymentResourceType = 21 - ResourceTypePostgresProject DeploymentResourceType = 22 - ResourceTypePostgresBranch DeploymentResourceType = 23 - ResourceTypePostgresEndpoint DeploymentResourceType = 24 + ResourceTypeUnspecified DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_UNSPECIFIED" + ResourceTypeJob DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_JOB" + ResourceTypePipeline DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_PIPELINE" + ResourceTypeModel DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_MODEL" + ResourceTypeRegisteredModel DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_REGISTERED_MODEL" + ResourceTypeExperiment DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_EXPERIMENT" + ResourceTypeServingEndpoint DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_MODEL_SERVING_ENDPOINT" + ResourceTypeQualityMonitor DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_QUALITY_MONITOR" + ResourceTypeSchema DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_SCHEMA" + ResourceTypeVolume DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_VOLUME" + ResourceTypeCluster DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_CLUSTER" + ResourceTypeDashboard DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_DASHBOARD" + ResourceTypeApp DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_APP" + ResourceTypeCatalog DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_CATALOG" + ResourceTypeExternalLocation DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_EXTERNAL_LOCATION" + ResourceTypeSecretScope DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_SECRET_SCOPE" + ResourceTypeAlert DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_ALERT" + ResourceTypeSQLWarehouse DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_SQL_WAREHOUSE" + ResourceTypeDatabaseInstance DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_DATABASE_INSTANCE" + ResourceTypeDatabaseCatalog DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_DATABASE_CATALOG" + ResourceTypeSyncedDBTable DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_SYNCED_DATABASE_TABLE" + ResourceTypePostgresProject DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_POSTGRES_PROJECT" + ResourceTypePostgresBranch DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_POSTGRES_BRANCH" + ResourceTypePostgresEndpoint DeploymentResourceType = "DEPLOYMENT_RESOURCE_TYPE_POSTGRES_ENDPOINT" ) // Resource types (proto message equivalents). From 8117d818e6d47e2d469843e45d51b4d598ead5d7 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 1 Apr 2026 00:23:16 +0000 Subject: [PATCH 10/25] Inject liteswap traffic ID header when DATABRICKS_LITESWAP_ID is set When the DATABRICKS_LITESWAP_ID environment variable is set, wrap the SDK HTTP transport to inject the x-databricks-traffic-id header on all API requests. This routes traffic to the liteswap service instance for E2E testing against dev deployments. Usage: DATABRICKS_LITESWAP_ID=my-env databricks bundle deploy Co-authored-by: Isaac --- bundle/config/workspace.go | 26 ++++++++++++++++++++++++++ bundle/env/liteswap.go | 15 +++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 bundle/env/liteswap.go diff --git a/bundle/config/workspace.go b/bundle/config/workspace.go index 32e2fdd38a..bd9cd871ed 100644 --- a/bundle/config/workspace.go +++ b/bundle/config/workspace.go @@ -1,6 +1,7 @@ package config import ( + "net/http" "os" "path/filepath" @@ -169,9 +170,34 @@ func (w *Workspace) Client() (*databricks.WorkspaceClient, error) { } } + // If DATABRICKS_LITESWAP_ID is set, wrap the transport to inject the + // x-databricks-traffic-id header for routing to the liteswap instance. + if liteswapID := os.Getenv("DATABRICKS_LITESWAP_ID"); liteswapID != "" { + inner := cfg.HTTPTransport + if inner == nil { + inner = http.DefaultTransport + } + cfg.HTTPTransport = &liteswapTransport{ + inner: inner, + trafficID: "testenv://liteswap/" + liteswapID, + } + } + return databricks.NewWorkspaceClient((*databricks.Config)(cfg)) } +// liteswapTransport injects the x-databricks-traffic-id header to route +// requests to a liteswap service instance. +type liteswapTransport struct { + inner http.RoundTripper + trafficID string +} + +func (t *liteswapTransport) RoundTrip(req *http.Request) (*http.Response, error) { + req.Header.Set("x-databricks-traffic-id", t.trafficID) + return t.inner.RoundTrip(req) +} + func init() { arg0 := os.Args[0] diff --git a/bundle/env/liteswap.go b/bundle/env/liteswap.go new file mode 100644 index 0000000000..1bdb6fc7c0 --- /dev/null +++ b/bundle/env/liteswap.go @@ -0,0 +1,15 @@ +package env + +import "context" + +// liteswapVariable names the environment variable that holds the liteswap +// environment name. When set, the CLI injects the x-databricks-traffic-id +// header on all API requests to route traffic to the liteswap service instance. +const liteswapVariable = "DATABRICKS_LITESWAP_ID" + +// LiteswapID returns the liteswap environment name if set. +func LiteswapID(ctx context.Context) (string, bool) { + return get(ctx, []string{ + liteswapVariable, + }) +} From d648a9e682fb2416c3076b78d947d8a32167fec5 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 1 Apr 2026 09:15:56 +0000 Subject: [PATCH 11/25] Remove unused liteswap env helper The LiteswapID() function was never called; workspace.go reads DATABRICKS_LITESWAP_ID via os.Getenv directly. Co-authored-by: Isaac --- bundle/env/liteswap.go | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 bundle/env/liteswap.go diff --git a/bundle/env/liteswap.go b/bundle/env/liteswap.go deleted file mode 100644 index 1bdb6fc7c0..0000000000 --- a/bundle/env/liteswap.go +++ /dev/null @@ -1,15 +0,0 @@ -package env - -import "context" - -// liteswapVariable names the environment variable that holds the liteswap -// environment name. When set, the CLI injects the x-databricks-traffic-id -// header on all API requests to route traffic to the liteswap service instance. -const liteswapVariable = "DATABRICKS_LITESWAP_ID" - -// LiteswapID returns the liteswap environment name if set. -func LiteswapID(ctx context.Context) (string, bool) { - return get(ctx, []string{ - liteswapVariable, - }) -} From 3108d851b81afaaefcdad67a2aa5c03aebfbaa53 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 11:31:11 +0000 Subject: [PATCH 12/25] Delete deployment on destroy, use env.Get for liteswap, remove unused LoadState - Call DeleteDeployment after successful bundle destroy to clean up the deployment record in the metadata service. - Replace os.Getenv with env.Get for DATABRICKS_LITESWAP_ID so it respects context-based env overrides. - Remove unused deploy.LoadState function. Co-authored-by: Isaac --- .../bundle/deploy/metadata-service/output.txt | 4 ++++ bundle/bundle.go | 6 ++--- bundle/bundle_test.go | 7 +++--- bundle/config/workspace.go | 6 +++-- bundle/config/workspace_test.go | 18 +++++++-------- bundle/deploy/state_update.go | 6 ----- bundle/phases/deploy.go | 4 ++-- bundle/phases/deploy_metadata.go | 22 +++++++++++++++---- bundle/phases/destroy.go | 18 +++++++++++++-- cmd/root/auth.go | 2 +- cmd/root/bundle.go | 4 ++-- 11 files changed, 63 insertions(+), 34 deletions(-) diff --git a/acceptance/bundle/deploy/metadata-service/output.txt b/acceptance/bundle/deploy/metadata-service/output.txt index 9006297906..1c25636f46 100644 --- a/acceptance/bundle/deploy/metadata-service/output.txt +++ b/acceptance/bundle/deploy/metadata-service/output.txt @@ -111,3 +111,7 @@ Destroy complete! "completion_reason": "VERSION_COMPLETE_SUCCESS" } } +{ + "method": "DELETE", + "path": "/api/2.0/bundle/deployments/[UUID]" +} diff --git a/bundle/bundle.go b/bundle/bundle.go index 97824eb839..2d6e691886 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -225,10 +225,10 @@ func TryLoad(ctx context.Context) *Bundle { return b } -func (b *Bundle) WorkspaceClientE() (*databricks.WorkspaceClient, error) { +func (b *Bundle) WorkspaceClientE(ctx context.Context) (*databricks.WorkspaceClient, error) { b.clientOnce.Do(func() { var err error - b.client, err = b.Config.Workspace.Client() + b.client, err = b.Config.Workspace.Client(ctx) if err != nil { b.clientErr = fmt.Errorf("cannot resolve bundle auth configuration: %w", err) } @@ -238,7 +238,7 @@ func (b *Bundle) WorkspaceClientE() (*databricks.WorkspaceClient, error) { } func (b *Bundle) WorkspaceClient() *databricks.WorkspaceClient { - client, err := b.WorkspaceClientE() + client, err := b.WorkspaceClientE(context.TODO()) if err != nil { panic(err) } diff --git a/bundle/bundle_test.go b/bundle/bundle_test.go index fa8282f343..b571cf8e16 100644 --- a/bundle/bundle_test.go +++ b/bundle/bundle_test.go @@ -184,12 +184,13 @@ func TestClearWorkspaceClient(t *testing.T) { b.Config.Workspace.Host = "https://nonexistent.example.com" b.Config.Workspace.Profile = "profile-A" - _, err1 := b.WorkspaceClientE() + ctx := t.Context() + _, err1 := b.WorkspaceClientE(ctx) require.Error(t, err1) assert.Contains(t, err1.Error(), "profile-A") // Without retry, second call returns the same cached error (same object). - _, err1b := b.WorkspaceClientE() + _, err1b := b.WorkspaceClientE(ctx) assert.Same(t, err1, err1b, "expected same cached error without retry") // After retry, change the profile to "profile-B" and call again. @@ -197,7 +198,7 @@ func TestClearWorkspaceClient(t *testing.T) { b.ClearWorkspaceClient() b.Config.Workspace.Profile = "profile-B" - _, err2 := b.WorkspaceClientE() + _, err2 := b.WorkspaceClientE(ctx) require.Error(t, err2) assert.Contains(t, err2.Error(), "profile-B", "expected re-execution to pick up new profile") assert.NotContains(t, err2.Error(), "profile-A", "stale cached error should not appear") diff --git a/bundle/config/workspace.go b/bundle/config/workspace.go index a9ee064e6a..608c8aab63 100644 --- a/bundle/config/workspace.go +++ b/bundle/config/workspace.go @@ -1,12 +1,14 @@ package config import ( + "context" "net/http" "os" "path/filepath" "github.com/databricks/cli/libs/auth" "github.com/databricks/cli/libs/databrickscfg" + "github.com/databricks/cli/libs/env" "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/config" "github.com/databricks/databricks-sdk-go/marshal" @@ -157,7 +159,7 @@ func (w *Workspace) NormalizeHostURL() { } } -func (w *Workspace) Client() (*databricks.WorkspaceClient, error) { +func (w *Workspace) Client(ctx context.Context) (*databricks.WorkspaceClient, error) { // Extract query parameters (?o=, ?a=) from the host URL before building // the SDK config. This ensures workspace_id and account_id are available // for profile resolution during EnsureResolved(). @@ -196,7 +198,7 @@ func (w *Workspace) Client() (*databricks.WorkspaceClient, error) { // If DATABRICKS_LITESWAP_ID is set, wrap the transport to inject the // x-databricks-traffic-id header for routing to the liteswap instance. - if liteswapID := os.Getenv("DATABRICKS_LITESWAP_ID"); liteswapID != "" { + if liteswapID := env.Get(ctx, "DATABRICKS_LITESWAP_ID"); liteswapID != "" { inner := cfg.HTTPTransport if inner == nil { inner = http.DefaultTransport diff --git a/bundle/config/workspace_test.go b/bundle/config/workspace_test.go index 4181d17170..4d87503a03 100644 --- a/bundle/config/workspace_test.go +++ b/bundle/config/workspace_test.go @@ -34,7 +34,7 @@ func TestWorkspaceResolveProfileFromHost(t *testing.T) { t.Run("no config file", func(t *testing.T) { setupWorkspaceTest(t) - _, err := w.Client() + _, err := w.Client(t.Context()) assert.NoError(t, err) }) @@ -49,7 +49,7 @@ func TestWorkspaceResolveProfileFromHost(t *testing.T) { }) require.NoError(t, err) - client, err := w.Client() + client, err := w.Client(t.Context()) assert.NoError(t, err) assert.Equal(t, "default", client.Config.Profile) }) @@ -67,7 +67,7 @@ func TestWorkspaceResolveProfileFromHost(t *testing.T) { require.NoError(t, err) t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(home, "customcfg")) - client, err := w.Client() + client, err := w.Client(t.Context()) assert.NoError(t, err) assert.Equal(t, "custom", client.Config.Profile) }) @@ -149,7 +149,7 @@ func TestWorkspaceClientNormalizesHostBeforeProfileResolution(t *testing.T) { w := Workspace{ Host: "https://spog.databricks.com/?o=222", } - client, err := w.Client() + client, err := w.Client(t.Context()) require.NoError(t, err) assert.Equal(t, "ws2", client.Config.Profile) } @@ -165,7 +165,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { t.Run("no config file", func(t *testing.T) { setupWorkspaceTest(t) - _, err := w.Client() + _, err := w.Client(t.Context()) assert.ErrorIs(t, err, fs.ErrNotExist) }) @@ -179,7 +179,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { }) require.NoError(t, err) - _, err = w.Client() + _, err = w.Client(t.Context()) assert.NoError(t, err) }) @@ -193,7 +193,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { }) require.NoError(t, err) - _, err = w.Client() + _, err = w.Client(t.Context()) assert.ErrorContains(t, err, "doesn’t match the host configured in the bundle") }) @@ -209,7 +209,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { require.NoError(t, err) t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(home, "customcfg")) - _, err = w.Client() + _, err = w.Client(t.Context()) assert.NoError(t, err) }) @@ -225,7 +225,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { require.NoError(t, err) t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(home, "customcfg")) - _, err = w.Client() + _, err = w.Client(t.Context()) assert.ErrorContains(t, err, "doesn’t match the host configured in the bundle") }) } diff --git a/bundle/deploy/state_update.go b/bundle/deploy/state_update.go index 06326c8a93..55cf2393bf 100644 --- a/bundle/deploy/state_update.go +++ b/bundle/deploy/state_update.go @@ -81,12 +81,6 @@ func StateUpdate() bundle.Mutator { return &stateUpdate{} } -// LoadState loads the deployment state from the local cache directory. -// If no state file exists, a new default DeploymentState is returned. -func LoadState(ctx context.Context, b *bundle.Bundle) (*DeploymentState, error) { - return load(ctx, b) -} - func load(ctx context.Context, b *bundle.Bundle) (*DeploymentState, error) { // If the file does not exist, return a new DeploymentState. statePath, err := getPathToStateFile(ctx, b) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index b2d272dac3..2232eb93fd 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -160,13 +160,13 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - cleanup, err := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDeploy) + lockResult, err := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDeploy) if err != nil { logdiag.LogError(ctx, err) return } defer func() { - cleanup(failed || logdiag.HasError(ctx)) + lockResult.cleanup(failed || logdiag.HasError(ctx)) }() } else { bundle.ApplyContext(ctx, b, lock.Acquire()) diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go index b77c025cde..7fd980c99d 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/phases/deploy_metadata.go @@ -21,10 +21,20 @@ import ( "github.com/google/uuid" ) +// metadataLockResult holds the state needed after acquiring a metadata lock, +// including references to the service client and deployment ID for post-lock +// operations like deleting the deployment. +type metadataLockResult struct { + svc *tmpdms.DeploymentMetadataAPI + deploymentID string + cleanup func(failed bool) +} + // acquireMetadataLock creates the metadata service client, acquires the deployment -// lock, and sets up the operation reporter on the bundle. It returns a cleanup -// function that releases the lock, or an error if the lock could not be acquired. -func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpdms.VersionType) (cleanup func(failed bool), err error) { +// lock, and sets up the operation reporter on the bundle. It returns the lock +// result containing the cleanup function and service references, or an error if +// the lock could not be acquired. +func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpdms.VersionType) (*metadataLockResult, error) { svc, err := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) if err != nil { return nil, fmt.Errorf("failed to create metadata service client: %w", err) @@ -36,7 +46,11 @@ func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpd } b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) - return cleanup, nil + return &metadataLockResult{ + svc: svc, + deploymentID: deploymentID, + cleanup: cleanup, + }, nil } // deployMetadataLock implements the lock acquire/release lifecycle using the diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 652126f198..9982a45910 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -136,6 +136,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy // Acquire the deployment lock. var failed bool + var lockResult *metadataLockResult if useMetadataService == "true" { if !targetEngine.IsDirect() { @@ -143,13 +144,14 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy return } - cleanup, lockErr := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDestroy) + var lockErr error + lockResult, lockErr = acquireMetadataLock(ctx, b, tmpdms.VersionTypeDestroy) if lockErr != nil { logdiag.LogError(ctx, lockErr) return } defer func() { - cleanup(failed || logdiag.HasError(ctx)) + lockResult.cleanup(failed || logdiag.HasError(ctx)) }() } else { bundle.ApplyContext(ctx, b, lock.Acquire()) @@ -216,6 +218,18 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy destroyCore(ctx, b, plan, targetEngine) if logdiag.HasError(ctx) { failed = true + return + } + + // Delete the deployment record from the metadata service after a + // successful destroy. + if lockResult != nil { + _, err := lockResult.svc.DeleteDeployment(ctx, tmpdms.DeleteDeploymentRequest{ + DeploymentID: lockResult.deploymentID, + }) + if err != nil { + log.Warnf(ctx, "Failed to delete deployment: %v", err) + } } } else { cmdio.LogString(ctx, "Destroy cancelled!") diff --git a/cmd/root/auth.go b/cmd/root/auth.go index 477db14337..8446b9b525 100644 --- a/cmd/root/auth.go +++ b/cmd/root/auth.go @@ -261,7 +261,7 @@ func MustWorkspaceClient(cmd *cobra.Command, args []string) error { if b != nil { ctx = cmdctx.SetConfigUsed(ctx, b.Config.Workspace.Config()) cmd.SetContext(ctx) - client, err := b.WorkspaceClientE() + client, err := b.WorkspaceClientE(ctx) if err != nil { return err } diff --git a/cmd/root/bundle.go b/cmd/root/bundle.go index 0b2ba1cfc6..602710d722 100644 --- a/cmd/root/bundle.go +++ b/cmd/root/bundle.go @@ -161,7 +161,7 @@ func configureBundle(cmd *cobra.Command, b *bundle.Bundle) { // // Note that just initializing a workspace client and loading auth configuration // is a fast operation. It does not perform network I/O or invoke processes (for example the Azure CLI). - client, err := b.WorkspaceClientE() + client, err := b.WorkspaceClientE(ctx) if err != nil { names, isMulti := databrickscfg.AsMultipleProfiles(err) if !isMulti { @@ -177,7 +177,7 @@ func configureBundle(cmd *cobra.Command, b *bundle.Bundle) { b.Config.Workspace.Profile = selected b.ClearWorkspaceClient() - client, err = b.WorkspaceClientE() + client, err = b.WorkspaceClientE(ctx) if err != nil { logdiag.LogError(ctx, err) return From aad93ec9df7550982d6aba3173784713d4b9e5ad Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 11:35:02 +0000 Subject: [PATCH 13/25] Move DeleteDeployment into cleanup closure, expand operation action mapping - Move DeleteDeployment call into the cleanup closure in deployMetadataLock so it runs after CompleteVersion (correct ordering). This removes the need to expose svc/deploymentID via a struct. - Add Resize, UpdateWithID mappings to planActionToOperationAction. - Return error for unsupported action types instead of silently ignoring. - Silently skip no-op actions like Skip. Co-authored-by: Isaac --- bundle/phases/deploy.go | 4 +- bundle/phases/deploy_metadata.go | 63 ++++++++++++++++++-------------- bundle/phases/destroy.go | 18 +-------- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 2232eb93fd..b2d272dac3 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -160,13 +160,13 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - lockResult, err := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDeploy) + cleanup, err := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDeploy) if err != nil { logdiag.LogError(ctx, err) return } defer func() { - lockResult.cleanup(failed || logdiag.HasError(ctx)) + cleanup(failed || logdiag.HasError(ctx)) }() } else { bundle.ApplyContext(ctx, b, lock.Acquire()) diff --git a/bundle/phases/deploy_metadata.go b/bundle/phases/deploy_metadata.go index 7fd980c99d..ce7b9d0d96 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/phases/deploy_metadata.go @@ -21,20 +21,10 @@ import ( "github.com/google/uuid" ) -// metadataLockResult holds the state needed after acquiring a metadata lock, -// including references to the service client and deployment ID for post-lock -// operations like deleting the deployment. -type metadataLockResult struct { - svc *tmpdms.DeploymentMetadataAPI - deploymentID string - cleanup func(failed bool) -} - // acquireMetadataLock creates the metadata service client, acquires the deployment -// lock, and sets up the operation reporter on the bundle. It returns the lock -// result containing the cleanup function and service references, or an error if -// the lock could not be acquired. -func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpdms.VersionType) (*metadataLockResult, error) { +// lock, and sets up the operation reporter on the bundle. It returns a cleanup +// function that releases the lock, or an error if the lock could not be acquired. +func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpdms.VersionType) (cleanup func(failed bool), err error) { svc, err := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) if err != nil { return nil, fmt.Errorf("failed to create metadata service client: %w", err) @@ -46,11 +36,7 @@ func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpd } b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) - return &metadataLockResult{ - svc: svc, - deploymentID: deploymentID, - cleanup: cleanup, - }, nil + return cleanup, nil } // deployMetadataLock implements the lock acquire/release lifecycle using the @@ -154,24 +140,43 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo } else { log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", deploymentID, versionID, reason) } + + // For destroy operations, delete the deployment record after + // successfully releasing the lock. + if !failed && versionType == tmpdms.VersionTypeDestroy { + _, deleteErr := svc.DeleteDeployment(cleanupCtx, tmpdms.DeleteDeploymentRequest{ + DeploymentID: deploymentID, + }) + if deleteErr != nil { + log.Warnf(ctx, "Failed to delete deployment: %v", deleteErr) + } + } } return deploymentID, versionID, cleanup, nil } -// planActionToOperationAction maps a deploy plan action to a metadata service operation action type. -func planActionToOperationAction(action deployplan.ActionType) tmpdms.OperationActionType { +// planActionToOperationAction maps a deploy plan action to a metadata service +// operation action type. It returns an error for actions that are not supported +// by the backend. No-op actions like Skip return ("", nil) and should be ignored. +func planActionToOperationAction(action deployplan.ActionType) (tmpdms.OperationActionType, error) { switch action { + case deployplan.Skip: + return "", nil case deployplan.Create: - return tmpdms.OperationActionTypeCreate + return tmpdms.OperationActionTypeCreate, nil case deployplan.Update: - return tmpdms.OperationActionTypeUpdate + return tmpdms.OperationActionTypeUpdate, nil + case deployplan.UpdateWithID: + return tmpdms.OperationActionTypeUpdateWithID, nil case deployplan.Delete: - return tmpdms.OperationActionTypeDelete + return tmpdms.OperationActionTypeDelete, nil case deployplan.Recreate: - return tmpdms.OperationActionTypeRecreate + return tmpdms.OperationActionTypeRecreate, nil + case deployplan.Resize: + return tmpdms.OperationActionTypeResize, nil default: - return tmpdms.OperationActionTypeUnspecified + return "", fmt.Errorf("unsupported operation action type: %s", action) } } @@ -180,8 +185,12 @@ func planActionToOperationAction(action deployplan.ActionType) tmpdms.OperationA // logged as warnings and do not affect the deploy outcome. func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.OperationReporter { return func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType, operationErr error) { - actionType := planActionToOperationAction(action) - if actionType == tmpdms.OperationActionTypeUnspecified { + actionType, mapErr := planActionToOperationAction(action) + if mapErr != nil { + log.Warnf(ctx, "Skipping operation report for resource %s: %v", resourceKey, mapErr) + return + } + if actionType == "" { return } diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 9982a45910..652126f198 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -136,7 +136,6 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy // Acquire the deployment lock. var failed bool - var lockResult *metadataLockResult if useMetadataService == "true" { if !targetEngine.IsDirect() { @@ -144,14 +143,13 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy return } - var lockErr error - lockResult, lockErr = acquireMetadataLock(ctx, b, tmpdms.VersionTypeDestroy) + cleanup, lockErr := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDestroy) if lockErr != nil { logdiag.LogError(ctx, lockErr) return } defer func() { - lockResult.cleanup(failed || logdiag.HasError(ctx)) + cleanup(failed || logdiag.HasError(ctx)) }() } else { bundle.ApplyContext(ctx, b, lock.Acquire()) @@ -218,18 +216,6 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy destroyCore(ctx, b, plan, targetEngine) if logdiag.HasError(ctx) { failed = true - return - } - - // Delete the deployment record from the metadata service after a - // successful destroy. - if lockResult != nil { - _, err := lockResult.svc.DeleteDeployment(ctx, tmpdms.DeleteDeploymentRequest{ - DeploymentID: lockResult.deploymentID, - }) - if err != nil { - log.Warnf(ctx, "Failed to delete deployment: %v", err) - } } } else { cmdio.LogString(ctx, "Destroy cancelled!") From 92925973748540a846d1f1c09aec15fd63e4b50a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 12:07:50 +0000 Subject: [PATCH 14/25] Add DeploymentLock and ResourceState interfaces for backend-agnostic locking and state management Introduce two interfaces to enable seamless transition between legacy filesystem-based deployments and the new deployment metadata service: - DeploymentLock: Acquire/Release with factory that selects filesystem or metadata service lock based on DATABRICKS_BUNDLE_MANAGED_STATE env var. - ResourceState Manager: Read/Push with direct and terraform implementations, consolidating the engine-type branching into a single factory. Move metadata service lock code from phases/ to deploy/lock/ package, delete old acquire/release mutators, and update all consumers (deploy, destroy, bind, unbind) to use the new interfaces. Co-authored-by: Isaac --- bundle/deploy/lock/acquire.go | 69 --------- .../lock/acquire_metadata.go} | 135 +++++------------- bundle/deploy/lock/filesystem.go | 66 +++++++++ bundle/{phases => deploy/lock}/heartbeat.go | 6 +- bundle/deploy/lock/lock.go | 62 ++++++++ bundle/deploy/lock/metadata_service.go | 87 +++++++++++ bundle/deploy/lock/release.go | 58 -------- bundle/phases/bind.go | 20 ++- bundle/phases/deploy.go | 38 ++--- bundle/phases/destroy.go | 38 ++--- bundle/statemgmt/check_running_resources.go | 19 +-- bundle/statemgmt/resourcestate/manager.go | 12 ++ bundle/statemgmt/state_load.go | 20 +-- bundle/statemgmt/state_manager.go | 74 ++++++++++ bundle/statemgmt/state_push.go | 36 +---- 15 files changed, 388 insertions(+), 352 deletions(-) delete mode 100644 bundle/deploy/lock/acquire.go rename bundle/{phases/deploy_metadata.go => deploy/lock/acquire_metadata.go} (59%) create mode 100644 bundle/deploy/lock/filesystem.go rename bundle/{phases => deploy/lock}/heartbeat.go (86%) create mode 100644 bundle/deploy/lock/lock.go create mode 100644 bundle/deploy/lock/metadata_service.go delete mode 100644 bundle/deploy/lock/release.go create mode 100644 bundle/statemgmt/resourcestate/manager.go create mode 100644 bundle/statemgmt/state_manager.go diff --git a/bundle/deploy/lock/acquire.go b/bundle/deploy/lock/acquire.go deleted file mode 100644 index d4f788c3ca..0000000000 --- a/bundle/deploy/lock/acquire.go +++ /dev/null @@ -1,69 +0,0 @@ -package lock - -import ( - "context" - "errors" - "io/fs" - - "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/permissions" - "github.com/databricks/cli/libs/diag" - "github.com/databricks/cli/libs/locker" - "github.com/databricks/cli/libs/log" -) - -type acquire struct{} - -func Acquire() bundle.Mutator { - return &acquire{} -} - -func (m *acquire) Name() string { - return "lock:acquire" -} - -func (m *acquire) init(b *bundle.Bundle) error { - user := b.Config.Workspace.CurrentUser.UserName - dir := b.Config.Workspace.StatePath - l, err := locker.CreateLocker(user, dir, b.WorkspaceClient()) - if err != nil { - return err - } - - b.Locker = l - return nil -} - -func (m *acquire) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - // Return early if locking is disabled. - if !b.Config.Bundle.Deployment.Lock.IsEnabled() { - log.Infof(ctx, "Skipping; locking is disabled") - return nil - } - - err := m.init(b) - if err != nil { - return diag.FromErr(err) - } - - force := b.Config.Bundle.Deployment.Lock.Force - log.Infof(ctx, "Acquiring deployment lock (force: %v)", force) - err = b.Locker.Lock(ctx, force) - if err != nil { - log.Errorf(ctx, "Failed to acquire deployment lock: %v", err) - - if errors.Is(err, fs.ErrPermission) { - return permissions.ReportPossiblePermissionDenied(ctx, b, b.Config.Workspace.StatePath) - } - - if errors.Is(err, fs.ErrNotExist) { - // If we get a "doesn't exist" error from the API this indicates - // we either don't have permissions or the path is invalid. - return permissions.ReportPossiblePermissionDenied(ctx, b, b.Config.Workspace.StatePath) - } - - return diag.FromErr(err) - } - - return nil -} diff --git a/bundle/phases/deploy_metadata.go b/bundle/deploy/lock/acquire_metadata.go similarity index 59% rename from bundle/phases/deploy_metadata.go rename to bundle/deploy/lock/acquire_metadata.go index ce7b9d0d96..e0992e2340 100644 --- a/bundle/phases/deploy_metadata.go +++ b/bundle/deploy/lock/acquire_metadata.go @@ -1,4 +1,4 @@ -package phases +package lock import ( "context" @@ -8,7 +8,6 @@ import ( "os" "path/filepath" "strconv" - "time" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/deployplan" @@ -21,35 +20,14 @@ import ( "github.com/google/uuid" ) -// acquireMetadataLock creates the metadata service client, acquires the deployment -// lock, and sets up the operation reporter on the bundle. It returns a cleanup -// function that releases the lock, or an error if the lock could not be acquired. -func acquireMetadataLock(ctx context.Context, b *bundle.Bundle, versionType tmpdms.VersionType) (cleanup func(failed bool), err error) { - svc, err := tmpdms.NewDeploymentMetadataAPI(b.WorkspaceClient()) - if err != nil { - return nil, fmt.Errorf("failed to create metadata service client: %w", err) - } - - deploymentID, versionID, cleanup, err := deployMetadataLock(ctx, b, svc, versionType) - if err != nil { - return nil, err - } - - b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) - return cleanup, nil -} - -// deployMetadataLock implements the lock acquire/release lifecycle using the -// deployment metadata service (CreateVersion / CompleteVersion). -// -// It returns a cleanup function that must be deferred by the caller to release -// the lock and stop the heartbeat, as well as any error from acquiring the lock. -func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID, versionID string, cleanup func(failed bool), err error) { +// acquireLock implements the lock acquisition protocol using the deployment +// metadata service: read lineage, ensure deployment, create version. +func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID, versionID string, err error) { // Read the lineage from resources.json (direct engine state) for the deployment ID. _, localPath := b.StateFilenameDirect(ctx) var stateDB dstate.DeploymentState if openErr := stateDB.Open(localPath); openErr != nil { - return "", "", nil, fmt.Errorf("failed to open resources state: %w", openErr) + return "", "", fmt.Errorf("failed to open resources state: %w", openErr) } deploymentID = stateDB.Data.Lineage @@ -60,11 +38,11 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo // Write the deployment ID to _deployment_id for external tooling. stateDir := filepath.Dir(localPath) if mkdirErr := os.MkdirAll(stateDir, 0o755); mkdirErr != nil { - return "", "", nil, fmt.Errorf("failed to create state directory: %w", mkdirErr) + return "", "", fmt.Errorf("failed to create state directory: %w", mkdirErr) } deploymentIDPath := filepath.Join(stateDir, "_deployment_id") if writeErr := os.WriteFile(deploymentIDPath, []byte(deploymentID), 0o600); writeErr != nil { - return "", "", nil, fmt.Errorf("failed to write deployment ID: %w", writeErr) + return "", "", fmt.Errorf("failed to write deployment ID: %w", writeErr) } // Ensure the deployment exists in the metadata service. @@ -75,7 +53,7 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo }, }) if createErr != nil && !isAlreadyExists(createErr) { - return "", "", nil, fmt.Errorf("failed to create deployment: %w", createErr) + return "", "", fmt.Errorf("failed to create deployment: %w", createErr) } // Get the deployment to determine the next version ID. @@ -83,7 +61,7 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo DeploymentID: deploymentID, }) if getErr != nil { - return "", "", nil, fmt.Errorf("failed to get deployment: %w", getErr) + return "", "", fmt.Errorf("failed to get deployment: %w", getErr) } if dep.LastVersionID == "" { @@ -91,7 +69,7 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo } else { lastVersion, parseErr := strconv.ParseInt(dep.LastVersionID, 10, 64) if parseErr != nil { - return "", "", nil, fmt.Errorf("failed to parse last_version_id %q: %w", dep.LastVersionID, parseErr) + return "", "", fmt.Errorf("failed to parse last_version_id %q: %w", dep.LastVersionID, parseErr) } versionID = strconv.FormatInt(lastVersion+1, 10) } @@ -108,76 +86,11 @@ func deployMetadataLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.Deplo }, }) if versionErr != nil { - return "", "", nil, fmt.Errorf("failed to acquire deployment lock: %w", versionErr) + return "", "", fmt.Errorf("failed to acquire deployment lock: %w", versionErr) } log.Infof(ctx, "Acquired deployment lock: deployment=%s version=%s", deploymentID, version.VersionID) - - // Start heartbeat to keep the lock alive. - stopHeartbeat := startHeartbeat(ctx, svc, deploymentID, versionID, defaultHeartbeatInterval) - - cleanup = func(failed bool) { - stopHeartbeat() - - reason := tmpdms.VersionCompleteSuccess - if failed { - reason = tmpdms.VersionCompleteFailure - } - - // Use a separate context for cleanup so the lock is released even if the - // parent context was cancelled (e.g. user hit Ctrl+C). - cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - _, completeErr := svc.CompleteVersion(cleanupCtx, tmpdms.CompleteVersionRequest{ - DeploymentID: deploymentID, - VersionID: versionID, - Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), - CompletionReason: reason, - }) - if completeErr != nil { - log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) - } else { - log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", deploymentID, versionID, reason) - } - - // For destroy operations, delete the deployment record after - // successfully releasing the lock. - if !failed && versionType == tmpdms.VersionTypeDestroy { - _, deleteErr := svc.DeleteDeployment(cleanupCtx, tmpdms.DeleteDeploymentRequest{ - DeploymentID: deploymentID, - }) - if deleteErr != nil { - log.Warnf(ctx, "Failed to delete deployment: %v", deleteErr) - } - } - } - - return deploymentID, versionID, cleanup, nil -} - -// planActionToOperationAction maps a deploy plan action to a metadata service -// operation action type. It returns an error for actions that are not supported -// by the backend. No-op actions like Skip return ("", nil) and should be ignored. -func planActionToOperationAction(action deployplan.ActionType) (tmpdms.OperationActionType, error) { - switch action { - case deployplan.Skip: - return "", nil - case deployplan.Create: - return tmpdms.OperationActionTypeCreate, nil - case deployplan.Update: - return tmpdms.OperationActionTypeUpdate, nil - case deployplan.UpdateWithID: - return tmpdms.OperationActionTypeUpdateWithID, nil - case deployplan.Delete: - return tmpdms.OperationActionTypeDelete, nil - case deployplan.Recreate: - return tmpdms.OperationActionTypeRecreate, nil - case deployplan.Resize: - return tmpdms.OperationActionTypeResize, nil - default: - return "", fmt.Errorf("unsupported operation action type: %s", action) - } + return deploymentID, versionID, nil } // makeOperationReporter returns an OperationReporter that reports each resource @@ -220,6 +133,30 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers } } +// planActionToOperationAction maps a deploy plan action to a metadata service +// operation action type. It returns an error for actions that are not supported +// by the backend. No-op actions like Skip return ("", nil) and should be ignored. +func planActionToOperationAction(action deployplan.ActionType) (tmpdms.OperationActionType, error) { + switch action { + case deployplan.Skip: + return "", nil + case deployplan.Create: + return tmpdms.OperationActionTypeCreate, nil + case deployplan.Update: + return tmpdms.OperationActionTypeUpdate, nil + case deployplan.UpdateWithID: + return tmpdms.OperationActionTypeUpdateWithID, nil + case deployplan.Delete: + return tmpdms.OperationActionTypeDelete, nil + case deployplan.Recreate: + return tmpdms.OperationActionTypeRecreate, nil + case deployplan.Resize: + return tmpdms.OperationActionTypeResize, nil + default: + return "", fmt.Errorf("unsupported operation action type: %s", action) + } +} + // isAlreadyExists checks if an error indicates the resource already exists (HTTP 409). func isAlreadyExists(err error) bool { var apiErr *apierr.APIError diff --git a/bundle/deploy/lock/filesystem.go b/bundle/deploy/lock/filesystem.go new file mode 100644 index 0000000000..7f81451ea7 --- /dev/null +++ b/bundle/deploy/lock/filesystem.go @@ -0,0 +1,66 @@ +package lock + +import ( + "context" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/locker" + "github.com/databricks/cli/libs/log" +) + +type filesystemLock struct { + b *bundle.Bundle + goal Goal +} + +func newFilesystemLock(b *bundle.Bundle, goal Goal) *filesystemLock { + return &filesystemLock{b: b, goal: goal} +} + +func (l *filesystemLock) Acquire(ctx context.Context) error { + b := l.b + + if !b.Config.Bundle.Deployment.Lock.IsEnabled() { + log.Infof(ctx, "Skipping; locking is disabled") + return nil + } + + user := b.Config.Workspace.CurrentUser.UserName + dir := b.Config.Workspace.StatePath + lk, err := locker.CreateLocker(user, dir, b.WorkspaceClient()) + if err != nil { + return err + } + + b.Locker = lk + + force := b.Config.Bundle.Deployment.Lock.Force + log.Infof(ctx, "Acquiring deployment lock (force: %v)", force) + err = lk.Lock(ctx, force) + if err != nil { + log.Errorf(ctx, "Failed to acquire deployment lock: %v", err) + return err + } + + return nil +} + +func (l *filesystemLock) Release(ctx context.Context, _ DeploymentStatus) error { + b := l.b + + if !b.Config.Bundle.Deployment.Lock.IsEnabled() { + log.Infof(ctx, "Skipping; locking is disabled") + return nil + } + + if b.Locker == nil { + log.Warnf(ctx, "Unable to release lock if locker is not configured") + return nil + } + + log.Infof(ctx, "Releasing deployment lock") + if l.goal == GoalDestroy { + return b.Locker.Unlock(ctx, locker.AllowLockFileNotExist) + } + return b.Locker.Unlock(ctx) +} diff --git a/bundle/phases/heartbeat.go b/bundle/deploy/lock/heartbeat.go similarity index 86% rename from bundle/phases/heartbeat.go rename to bundle/deploy/lock/heartbeat.go index 925c53193c..4199abd0f5 100644 --- a/bundle/phases/heartbeat.go +++ b/bundle/deploy/lock/heartbeat.go @@ -1,4 +1,4 @@ -package phases +package lock import ( "context" @@ -12,11 +12,11 @@ const defaultHeartbeatInterval = 30 * time.Second // startHeartbeat starts a background goroutine that sends heartbeats to keep // the deployment lock alive. Returns a cancel function to stop the heartbeat. -func startHeartbeat(ctx context.Context, svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string, interval time.Duration) context.CancelFunc { +func startHeartbeat(ctx context.Context, svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) context.CancelFunc { ctx, cancel := context.WithCancel(ctx) go func() { - ticker := time.NewTicker(interval) + ticker := time.NewTicker(defaultHeartbeatInterval) defer ticker.Stop() for { diff --git a/bundle/deploy/lock/lock.go b/bundle/deploy/lock/lock.go new file mode 100644 index 0000000000..687e2d947f --- /dev/null +++ b/bundle/deploy/lock/lock.go @@ -0,0 +1,62 @@ +package lock + +import ( + "context" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/env" + "github.com/databricks/cli/libs/tmpdms" +) + +// Goal describes the purpose of a deployment operation. +type Goal string + +const ( + GoalBind = Goal("bind") + GoalUnbind = Goal("unbind") + GoalDeploy = Goal("deploy") + GoalDestroy = Goal("destroy") +) + +// DeploymentStatus indicates whether the deployment operation succeeded or failed. +type DeploymentStatus int + +const ( + DeploymentSuccess DeploymentStatus = iota + DeploymentFailure +) + +// DeploymentLock manages the deployment lock lifecycle. +type DeploymentLock interface { + // Acquire acquires the deployment lock. + Acquire(ctx context.Context) error + + // Release releases the deployment lock with the given deployment status. + Release(ctx context.Context, status DeploymentStatus) error +} + +// NewDeploymentLock returns a DeploymentLock implementation based on the +// current environment. If managed state is enabled and the goal maps to a +// supported version type, a metadata service lock is returned. Otherwise, +// a filesystem lock is returned. +func NewDeploymentLock(ctx context.Context, b *bundle.Bundle, goal Goal) DeploymentLock { + useManagedState, _ := env.ManagedState(ctx) + if useManagedState == "true" { + versionType, ok := goalToVersionType(goal) + if ok { + return newMetadataServiceLock(b, versionType) + } + } + return newFilesystemLock(b, goal) +} + +func goalToVersionType(goal Goal) (tmpdms.VersionType, bool) { + switch goal { + case GoalDeploy: + return tmpdms.VersionTypeDeploy, true + case GoalDestroy: + return tmpdms.VersionTypeDestroy, true + default: + return "", false + } +} diff --git a/bundle/deploy/lock/metadata_service.go b/bundle/deploy/lock/metadata_service.go new file mode 100644 index 0000000000..59836af87f --- /dev/null +++ b/bundle/deploy/lock/metadata_service.go @@ -0,0 +1,87 @@ +package lock + +import ( + "context" + "fmt" + "time" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/tmpdms" +) + +type metadataServiceLock struct { + b *bundle.Bundle + versionType tmpdms.VersionType + + svc *tmpdms.DeploymentMetadataAPI + deploymentID string + versionID string + + stopHeartbeat func() +} + +func newMetadataServiceLock(b *bundle.Bundle, versionType tmpdms.VersionType) *metadataServiceLock { + return &metadataServiceLock{b: b, versionType: versionType} +} + +func (l *metadataServiceLock) Acquire(ctx context.Context) error { + svc, err := tmpdms.NewDeploymentMetadataAPI(l.b.WorkspaceClient()) + if err != nil { + return fmt.Errorf("failed to create metadata service client: %w", err) + } + l.svc = svc + + deploymentID, versionID, err := acquireLock(ctx, l.b, svc, l.versionType) + if err != nil { + return err + } + + l.deploymentID = deploymentID + l.versionID = versionID + l.stopHeartbeat = startHeartbeat(ctx, svc, deploymentID, versionID) + + l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) + return nil +} + +func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStatus) error { + if l.stopHeartbeat != nil { + l.stopHeartbeat() + } + + reason := tmpdms.VersionCompleteSuccess + if status == DeploymentFailure { + reason = tmpdms.VersionCompleteFailure + } + + // Use a separate context for cleanup so the lock is released even if the + // parent context was cancelled (e.g. user hit Ctrl+C). + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + _, completeErr := l.svc.CompleteVersion(cleanupCtx, tmpdms.CompleteVersionRequest{ + DeploymentID: l.deploymentID, + VersionID: l.versionID, + Name: fmt.Sprintf("deployments/%s/versions/%s", l.deploymentID, l.versionID), + CompletionReason: reason, + }) + if completeErr != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) + } else { + log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) + } + + // For destroy operations, delete the deployment record after + // successfully releasing the lock. + if status == DeploymentSuccess && l.versionType == tmpdms.VersionTypeDestroy { + _, deleteErr := l.svc.DeleteDeployment(cleanupCtx, tmpdms.DeleteDeploymentRequest{ + DeploymentID: l.deploymentID, + }) + if deleteErr != nil { + log.Warnf(ctx, "Failed to delete deployment: %v", deleteErr) + } + } + + return completeErr +} diff --git a/bundle/deploy/lock/release.go b/bundle/deploy/lock/release.go deleted file mode 100644 index 26f95edfc9..0000000000 --- a/bundle/deploy/lock/release.go +++ /dev/null @@ -1,58 +0,0 @@ -package lock - -import ( - "context" - - "github.com/databricks/cli/bundle" - "github.com/databricks/cli/libs/diag" - "github.com/databricks/cli/libs/locker" - "github.com/databricks/cli/libs/log" -) - -type Goal string - -const ( - GoalBind = Goal("bind") - GoalUnbind = Goal("unbind") - GoalDeploy = Goal("deploy") - GoalDestroy = Goal("destroy") -) - -type release struct { - goal Goal -} - -func Release(goal Goal) bundle.Mutator { - return &release{goal} -} - -func (m *release) Name() string { - return "lock:release" -} - -func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - // Return early if locking is disabled. - if !b.Config.Bundle.Deployment.Lock.IsEnabled() { - log.Infof(ctx, "Skipping; locking is disabled") - return nil - } - - // Return early if the locker is not set. - // It is likely an error occurred prior to initialization of the locker instance. - if b.Locker == nil { - log.Warnf(ctx, "Unable to release lock if locker is not configured") - return nil - } - - log.Infof(ctx, "Releasing deployment lock") - switch m.goal { - case GoalDeploy: - return diag.FromErr(b.Locker.Unlock(ctx)) - case GoalBind, GoalUnbind: - return diag.FromErr(b.Locker.Unlock(ctx)) - case GoalDestroy: - return diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist)) - default: - return diag.Errorf("unknown goal for lock release: %s", m.goal) - } -} diff --git a/bundle/phases/bind.go b/bundle/phases/bind.go index a8f99b28e8..5050d548c6 100644 --- a/bundle/phases/bind.go +++ b/bundle/phases/bind.go @@ -27,14 +27,12 @@ func Bind(ctx context.Context, b *bundle.Bundle, opts *terraform.BindOptions) { return } - bundle.ApplyContext(ctx, b, lock.Acquire()) - if logdiag.HasError(ctx) { + dl := lock.NewDeploymentLock(ctx, b, lock.GoalBind) + if err := dl.Acquire(ctx); err != nil { + logdiag.LogError(ctx, err) return } - - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalBind)) - }() + defer dl.Release(ctx, lock.DeploymentSuccess) if engine.IsDirect() { // Direct engine: import into temp state, run plan, check for changes @@ -129,14 +127,12 @@ func Unbind(ctx context.Context, b *bundle.Bundle, bundleType, tfResourceType, r return } - bundle.ApplyContext(ctx, b, lock.Acquire()) - if logdiag.HasError(ctx) { + dl := lock.NewDeploymentLock(ctx, b, lock.GoalUnbind) + if err := dl.Acquire(ctx); err != nil { + logdiag.LogError(ctx, err) return } - - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalUnbind)) - }() + defer dl.Release(ctx, lock.DeploymentSuccess) if engine.IsDirect() { groupName, ok := terraform.TerraformToGroupName[tfResourceType] diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index b2d272dac3..e016c028a5 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -25,7 +25,6 @@ import ( "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" "github.com/databricks/cli/libs/sync" - "github.com/databricks/cli/libs/tmpdms" ) func approvalForDeploy(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan) (bool, error) { @@ -142,6 +141,10 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand if useMetadataService == "true" { log.Info(ctx, "Phase: deploy (with metadata service)") + if !targetEngine.IsDirect() { + logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) + return + } } else { log.Info(ctx, "Phase: deploy") } @@ -154,29 +157,18 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand // Acquire the deployment lock. var failed bool - if useMetadataService == "true" { - if !targetEngine.IsDirect() { - logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) - return - } - - cleanup, err := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDeploy) - if err != nil { - logdiag.LogError(ctx, err) - return - } - defer func() { - cleanup(failed || logdiag.HasError(ctx)) - }() - } else { - bundle.ApplyContext(ctx, b, lock.Acquire()) - if logdiag.HasError(ctx) { - return - } - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) - }() + dl := lock.NewDeploymentLock(ctx, b, lock.GoalDeploy) + if err := dl.Acquire(ctx); err != nil { + logdiag.LogError(ctx, err) + return } + defer func() { + status := lock.DeploymentSuccess + if failed || logdiag.HasError(ctx) { + status = lock.DeploymentFailure + } + dl.Release(ctx, status) + }() uploadLibraries(ctx, b, libs) if logdiag.HasError(ctx) { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 652126f198..96ea91cbe1 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -17,7 +17,6 @@ import ( "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" - "github.com/databricks/cli/libs/tmpdms" "github.com/databricks/databricks-sdk-go/apierr" ) @@ -120,6 +119,10 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy if useMetadataService == "true" { log.Info(ctx, "Phase: destroy (with metadata service)") + if !targetEngine.IsDirect() { + logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) + return + } } else { log.Info(ctx, "Phase: destroy") } @@ -137,29 +140,18 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy // Acquire the deployment lock. var failed bool - if useMetadataService == "true" { - if !targetEngine.IsDirect() { - logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) - return - } - - cleanup, lockErr := acquireMetadataLock(ctx, b, tmpdms.VersionTypeDestroy) - if lockErr != nil { - logdiag.LogError(ctx, lockErr) - return - } - defer func() { - cleanup(failed || logdiag.HasError(ctx)) - }() - } else { - bundle.ApplyContext(ctx, b, lock.Acquire()) - if logdiag.HasError(ctx) { - return - } - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDestroy)) - }() + dl := lock.NewDeploymentLock(ctx, b, lock.GoalDestroy) + if err := dl.Acquire(ctx); err != nil { + logdiag.LogError(ctx, err) + return } + defer func() { + status := lock.DeploymentSuccess + if failed || logdiag.HasError(ctx) { + status = lock.DeploymentFailure + } + dl.Release(ctx, status) + }() if !targetEngine.IsDirect() { bundle.ApplySeqContext(ctx, b, diff --git a/bundle/statemgmt/check_running_resources.go b/bundle/statemgmt/check_running_resources.go index 7108c9b542..af80bf42cc 100644 --- a/bundle/statemgmt/check_running_resources.go +++ b/bundle/statemgmt/check_running_resources.go @@ -8,7 +8,6 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/libs/diag" "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/service/jobs" @@ -38,20 +37,10 @@ func (l *checkRunningResources) Apply(ctx context.Context, b *bundle.Bundle) dia return nil } - var err error - var state ExportedResourcesMap - - if l.engine.IsDirect() { - _, fullPathDirect := b.StateFilenameDirect(ctx) - state, err = b.DeploymentBundle.ExportState(ctx, fullPathDirect) - if err != nil { - return diag.FromErr(err) - } - } else { - state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - return diag.FromErr(err) - } + mgr := NewStateManager(b, l.engine) + state, err := mgr.Read(ctx) + if err != nil { + return diag.FromErr(err) } w := b.WorkspaceClient() diff --git a/bundle/statemgmt/resourcestate/manager.go b/bundle/statemgmt/resourcestate/manager.go new file mode 100644 index 0000000000..d12e2937ca --- /dev/null +++ b/bundle/statemgmt/resourcestate/manager.go @@ -0,0 +1,12 @@ +package resourcestate + +import "context" + +// Manager provides read and write access to deployment resource state. +type Manager interface { + // Read returns the current resource state as a map of resource keys to their state. + Read(ctx context.Context) (ExportedResourcesMap, error) + + // Push uploads local state to the remote workspace location. + Push(ctx context.Context) error +} diff --git a/bundle/statemgmt/state_load.go b/bundle/statemgmt/state_load.go index ef9e1f829d..12360b14e4 100644 --- a/bundle/statemgmt/state_load.go +++ b/bundle/statemgmt/state_load.go @@ -11,7 +11,6 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" @@ -35,21 +34,10 @@ func (l *load) Name() string { } func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - var err error - var state ExportedResourcesMap - - if l.engine.IsDirect() { - _, fullPathDirect := b.StateFilenameDirect(ctx) - state, err = b.DeploymentBundle.ExportState(ctx, fullPathDirect) - if err != nil { - return diag.FromErr(err) - } - } else { - var err error - state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - return diag.FromErr(err) - } + mgr := NewStateManager(b, l.engine) + state, err := mgr.Read(ctx) + if err != nil { + return diag.FromErr(err) } err = l.validateState(state) diff --git a/bundle/statemgmt/state_manager.go b/bundle/statemgmt/state_manager.go new file mode 100644 index 0000000000..b1a23fb262 --- /dev/null +++ b/bundle/statemgmt/state_manager.go @@ -0,0 +1,74 @@ +package statemgmt + +import ( + "context" + "errors" + "io/fs" + "os" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/deploy" + "github.com/databricks/cli/bundle/deploy/terraform" + "github.com/databricks/cli/bundle/statemgmt/resourcestate" + "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/filer" + "github.com/databricks/cli/libs/log" +) + +// NewStateManager returns a Manager implementation for the given engine type. +func NewStateManager(b *bundle.Bundle, e engine.EngineType) resourcestate.Manager { + if e.IsDirect() { + return &directStateManager{b: b} + } + return &terraformStateManager{b: b} +} + +type directStateManager struct { + b *bundle.Bundle +} + +func (m *directStateManager) Read(ctx context.Context) (ExportedResourcesMap, error) { + _, fullPath := m.b.StateFilenameDirect(ctx) + return m.b.DeploymentBundle.ExportState(ctx, fullPath) +} + +func (m *directStateManager) Push(ctx context.Context) error { + remotePath, localPath := m.b.StateFilenameDirect(ctx) + return pushLocalState(ctx, m.b, remotePath, localPath) +} + +type terraformStateManager struct { + b *bundle.Bundle +} + +func (m *terraformStateManager) Read(ctx context.Context) (ExportedResourcesMap, error) { + return terraform.ParseResourcesState(ctx, m.b) +} + +func (m *terraformStateManager) Push(ctx context.Context) error { + remotePath, localPath := m.b.StateFilenameTerraform(ctx) + return pushLocalState(ctx, m.b, remotePath, localPath) +} + +func pushLocalState(ctx context.Context, b *bundle.Bundle, remotePath, localPath string) error { + f, err := deploy.StateFiler(b) + if err != nil { + return err + } + + local, err := os.Open(localPath) + if errors.Is(err, fs.ErrNotExist) { + // The state file can be absent if terraform apply is skipped because + // there are no changes to apply in the plan. + log.Debugf(ctx, "Local state file does not exist: %s", localPath) + return nil + } + if err != nil { + return err + } + defer local.Close() + + cmdio.LogString(ctx, "Updating deployment state...") + return f.Write(ctx, remotePath, local, filer.CreateParentDirectories, filer.OverwriteIfExists) +} diff --git a/bundle/statemgmt/state_push.go b/bundle/statemgmt/state_push.go index b2da9f893c..5774a0b5fe 100644 --- a/bundle/statemgmt/state_push.go +++ b/bundle/statemgmt/state_push.go @@ -4,50 +4,18 @@ import ( "context" "errors" "io/fs" - "os" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/deploy" - "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/filer" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" ) // PushResourcesState uploads the local state file to the remote location. func PushResourcesState(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { - f, err := deploy.StateFiler(b) - if err != nil { - logdiag.LogError(ctx, err) - return - } - - var remotePath, localPath string - - if engine.IsDirect() { - remotePath, localPath = b.StateFilenameDirect(ctx) - } else { - remotePath, localPath = b.StateFilenameTerraform(ctx) - } - - local, err := os.Open(localPath) - if errors.Is(err, fs.ErrNotExist) { - // The state file can be absent if terraform apply is skipped because - // there are no changes to apply in the plan. - log.Debugf(ctx, "Local state file does not exist: %s", localPath) - return - } - if err != nil { - logdiag.LogError(ctx, err) - return - } - defer local.Close() - - // Upload state file from local cache directory to filer. - cmdio.LogString(ctx, "Updating deployment state...") - err = f.Write(ctx, remotePath, local, filer.CreateParentDirectories, filer.OverwriteIfExists) - if err != nil { + mgr := NewStateManager(b, engine) + if err := mgr.Push(ctx); err != nil { logdiag.LogError(ctx, err) } } From 2f9275d933fab0d650b08b422da6d02088bee084 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 17:56:08 +0000 Subject: [PATCH 15/25] Fix lint issues: errcheck, gofmt, gofumpt, perfsprint, exhaustive, gocritic Co-authored-by: Isaac --- bundle/deploy/lock/acquire_metadata.go | 10 +++++-- bundle/deploy/lock/metadata_service.go | 2 +- bundle/direct/pkg.go | 8 +++++- bundle/phases/bind.go | 4 +-- bundle/phases/deploy.go | 2 +- bundle/phases/destroy.go | 2 +- libs/testserver/deployment_metadata.go | 14 +++++---- libs/tmpdms/api.go | 2 +- libs/tmpdms/types.go | 40 ++++++++++++++------------ 9 files changed, 50 insertions(+), 34 deletions(-) diff --git a/bundle/deploy/lock/acquire_metadata.go b/bundle/deploy/lock/acquire_metadata.go index e0992e2340..4b86d16d74 100644 --- a/bundle/deploy/lock/acquire_metadata.go +++ b/bundle/deploy/lock/acquire_metadata.go @@ -77,7 +77,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe // Create a version to acquire the deployment lock. version, versionErr := svc.CreateVersion(ctx, tmpdms.CreateVersionRequest{ DeploymentID: deploymentID, - Parent: fmt.Sprintf("deployments/%s", deploymentID), + Parent: "deployments/" + deploymentID, VersionID: versionID, Version: &tmpdms.Version{ CliVersion: build.GetInfo().Version, @@ -97,7 +97,13 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe // operation (success or failure) to the metadata service. Reporting failures are // logged as warnings and do not affect the deploy outcome. func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.OperationReporter { - return func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType, operationErr error) { + return func( + ctx context.Context, + resourceKey string, + resourceID string, + action deployplan.ActionType, + operationErr error, + ) { actionType, mapErr := planActionToOperationAction(action) if mapErr != nil { log.Warnf(ctx, "Skipping operation report for resource %s: %v", resourceKey, mapErr) diff --git a/bundle/deploy/lock/metadata_service.go b/bundle/deploy/lock/metadata_service.go index 59836af87f..33d7be0a83 100644 --- a/bundle/deploy/lock/metadata_service.go +++ b/bundle/deploy/lock/metadata_service.go @@ -57,7 +57,7 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat // Use a separate context for cleanup so the lock is released even if the // parent context was cancelled (e.g. user hit Ctrl+C). - cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) //nolint:gocritic // Intentional: cleanup must survive parent cancellation defer cancel() _, completeErr := l.svc.CompleteVersion(cleanupCtx, tmpdms.CompleteVersionRequest{ diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index aee2265eba..97ef49eedc 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -41,7 +41,13 @@ type DeploymentUnit struct { // to report it to the deployment metadata service. If operationErr is non-nil the // operation is recorded as failed with the error message. It is best-effort: // reporting failures are logged as warnings by the caller. -type OperationReporter func(ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType, operationErr error) +type OperationReporter func( + ctx context.Context, + resourceKey string, + resourceID string, + action deployplan.ActionType, + operationErr error, +) // DeploymentBundle holds everything needed to deploy a bundle type DeploymentBundle struct { diff --git a/bundle/phases/bind.go b/bundle/phases/bind.go index 5050d548c6..aa30415c86 100644 --- a/bundle/phases/bind.go +++ b/bundle/phases/bind.go @@ -32,7 +32,7 @@ func Bind(ctx context.Context, b *bundle.Bundle, opts *terraform.BindOptions) { logdiag.LogError(ctx, err) return } - defer dl.Release(ctx, lock.DeploymentSuccess) + defer func() { _ = dl.Release(ctx, lock.DeploymentSuccess) }() if engine.IsDirect() { // Direct engine: import into temp state, run plan, check for changes @@ -132,7 +132,7 @@ func Unbind(ctx context.Context, b *bundle.Bundle, bundleType, tfResourceType, r logdiag.LogError(ctx, err) return } - defer dl.Release(ctx, lock.DeploymentSuccess) + defer func() { _ = dl.Release(ctx, lock.DeploymentSuccess) }() if engine.IsDirect() { groupName, ok := terraform.TerraformToGroupName[tfResourceType] diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index e016c028a5..76e5a5847c 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -167,7 +167,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand if failed || logdiag.HasError(ctx) { status = lock.DeploymentFailure } - dl.Release(ctx, status) + _ = dl.Release(ctx, status) }() uploadLibraries(ctx, b, libs) diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 96ea91cbe1..3109f6fc27 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -150,7 +150,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy if failed || logdiag.HasError(ctx) { status = lock.DeploymentFailure } - dl.Release(ctx, status) + _ = dl.Release(ctx, status) }() if !targetEngine.IsDirect() { diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go index eaed985156..f3a43863c3 100644 --- a/libs/testserver/deployment_metadata.go +++ b/libs/testserver/deployment_metadata.go @@ -77,7 +77,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateDeployment(req Request) Response now := time.Now().UTC() deployment := tmpdms.Deployment{ - Name: fmt.Sprintf("deployments/%s", deploymentID), + Name: "deployments/" + deploymentID, DisplayName: deploymentID, TargetName: bodyDeployment.TargetName, Status: tmpdms.DeploymentStatusActive, @@ -167,7 +167,7 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI if err != nil { return Response{ StatusCode: http.StatusInternalServerError, - Body: map[string]string{"error_code": "INTERNAL_ERROR", "message": fmt.Sprintf("stored last_version_id is not a valid number: %s", deployment.LastVersionID)}, + Body: map[string]string{"error_code": "INTERNAL_ERROR", "message": "stored last_version_id is not a valid number: " + deployment.LastVersionID}, } } expectedVersionID = strconv.FormatInt(lastVersion+1, 10) @@ -198,11 +198,11 @@ func (s *FakeWorkspace) DeploymentMetadataCreateVersion(req Request, deploymentI versionKey := deploymentID + "/" + versionID version := tmpdms.Version{ - Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), - VersionID: versionID, - CreatedBy: s.CurrentUser().UserName, + Name: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + VersionID: versionID, + CreatedBy: s.CurrentUser().UserName, CreateTime: &now, - Status: tmpdms.VersionStatusInProgress, + Status: tmpdms.VersionStatusInProgress, } version.CliVersion = bodyVersion.CliVersion version.VersionType = bodyVersion.VersionType @@ -321,6 +321,8 @@ func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymen deployment.Status = tmpdms.DeploymentStatusActive case tmpdms.VersionCompleteFailure, tmpdms.VersionCompleteForceAbort, tmpdms.VersionCompleteLeaseExpired: deployment.Status = tmpdms.DeploymentStatusFailed + case tmpdms.VersionCompleteUnspecified: + // No status change for unspecified completion reason. } deployment.UpdateTime = &now state.deployments[deploymentID] = deployment diff --git a/libs/tmpdms/api.go b/libs/tmpdms/api.go index 3f6fcf1957..fb233f15ae 100644 --- a/libs/tmpdms/api.go +++ b/libs/tmpdms/api.go @@ -33,7 +33,7 @@ func NewDeploymentMetadataAPI(w *databricks.WorkspaceClient) (*DeploymentMetadat func (a *DeploymentMetadataAPI) CreateDeployment(ctx context.Context, request CreateDeploymentRequest) (*Deployment, error) { var resp Deployment - path := fmt.Sprintf("%s/deployments", basePath) + path := basePath + "/deployments" query := map[string]any{"deployment_id": request.DeploymentID} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Deployment, &resp) if err != nil { diff --git a/libs/tmpdms/types.go b/libs/tmpdms/types.go index 320823d00e..8dd6cdbfb6 100644 --- a/libs/tmpdms/types.go +++ b/libs/tmpdms/types.go @@ -9,20 +9,22 @@ import "time" // Enum types matching the proto definitions. // Values are the proto enum name strings, which is how proto-over-HTTP serializes enums. -type DeploymentStatus string -type VersionStatus string -type VersionComplete string -type VersionType string -type OperationStatus string -type OperationActionType string -type DeploymentResourceType string +type ( + DeploymentStatus string + VersionStatus string + VersionComplete string + VersionType string + OperationStatus string + OperationActionType string + DeploymentResourceType string +) const ( DeploymentStatusUnspecified DeploymentStatus = "DEPLOYMENT_STATUS_UNSPECIFIED" - DeploymentStatusActive DeploymentStatus = "DEPLOYMENT_STATUS_ACTIVE" - DeploymentStatusFailed DeploymentStatus = "DEPLOYMENT_STATUS_FAILED" - DeploymentStatusInProgress DeploymentStatus = "DEPLOYMENT_STATUS_IN_PROGRESS" - DeploymentStatusDeleted DeploymentStatus = "DEPLOYMENT_STATUS_DELETED" + DeploymentStatusActive DeploymentStatus = "DEPLOYMENT_STATUS_ACTIVE" + DeploymentStatusFailed DeploymentStatus = "DEPLOYMENT_STATUS_FAILED" + DeploymentStatusInProgress DeploymentStatus = "DEPLOYMENT_STATUS_IN_PROGRESS" + DeploymentStatusDeleted DeploymentStatus = "DEPLOYMENT_STATUS_DELETED" ) const ( @@ -33,26 +35,26 @@ const ( const ( VersionCompleteUnspecified VersionComplete = "VERSION_COMPLETE_UNSPECIFIED" - VersionCompleteSuccess VersionComplete = "VERSION_COMPLETE_SUCCESS" - VersionCompleteFailure VersionComplete = "VERSION_COMPLETE_FAILURE" - VersionCompleteForceAbort VersionComplete = "VERSION_COMPLETE_FORCE_ABORT" + VersionCompleteSuccess VersionComplete = "VERSION_COMPLETE_SUCCESS" + VersionCompleteFailure VersionComplete = "VERSION_COMPLETE_FAILURE" + VersionCompleteForceAbort VersionComplete = "VERSION_COMPLETE_FORCE_ABORT" VersionCompleteLeaseExpired VersionComplete = "VERSION_COMPLETE_LEASE_EXPIRED" ) const ( VersionTypeUnspecified VersionType = "VERSION_TYPE_UNSPECIFIED" - VersionTypeDeploy VersionType = "VERSION_TYPE_DEPLOY" - VersionTypeDestroy VersionType = "VERSION_TYPE_DESTROY" + VersionTypeDeploy VersionType = "VERSION_TYPE_DEPLOY" + VersionTypeDestroy VersionType = "VERSION_TYPE_DESTROY" ) const ( OperationStatusUnspecified OperationStatus = "OPERATION_STATUS_UNSPECIFIED" - OperationStatusSucceeded OperationStatus = "OPERATION_STATUS_SUCCEEDED" - OperationStatusFailed OperationStatus = "OPERATION_STATUS_FAILED" + OperationStatusSucceeded OperationStatus = "OPERATION_STATUS_SUCCEEDED" + OperationStatusFailed OperationStatus = "OPERATION_STATUS_FAILED" ) const ( - OperationActionTypeUnspecified OperationActionType = "OPERATION_ACTION_TYPE_UNSPECIFIED" + OperationActionTypeUnspecified OperationActionType = "OPERATION_ACTION_TYPE_UNSPECIFIED" OperationActionTypeResize OperationActionType = "OPERATION_ACTION_TYPE_RESIZE" OperationActionTypeUpdate OperationActionType = "OPERATION_ACTION_TYPE_UPDATE" OperationActionTypeUpdateWithID OperationActionType = "OPERATION_ACTION_TYPE_UPDATE_WITH_ID" From 37d61ff62aaf0b7bf275b1c5b2b3702c17f4e1b1 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 18:08:10 +0000 Subject: [PATCH 16/25] Rename and consolidate lock package files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - filesystem.go → workspace_filesystem.go - Merge metadata_service.go, acquire_metadata.go, heartbeat.go → deployment_metadata_service.go Co-authored-by: Isaac --- ...data.go => deployment_metadata_service.go} | 109 ++++++++++++++++++ bundle/deploy/lock/heartbeat.go | 41 ------- bundle/deploy/lock/metadata_service.go | 87 -------------- ...{filesystem.go => workspace_filesystem.go} | 0 4 files changed, 109 insertions(+), 128 deletions(-) rename bundle/deploy/lock/{acquire_metadata.go => deployment_metadata_service.go} (63%) delete mode 100644 bundle/deploy/lock/heartbeat.go delete mode 100644 bundle/deploy/lock/metadata_service.go rename bundle/deploy/lock/{filesystem.go => workspace_filesystem.go} (100%) diff --git a/bundle/deploy/lock/acquire_metadata.go b/bundle/deploy/lock/deployment_metadata_service.go similarity index 63% rename from bundle/deploy/lock/acquire_metadata.go rename to bundle/deploy/lock/deployment_metadata_service.go index 4b86d16d74..1e9e9765b2 100644 --- a/bundle/deploy/lock/acquire_metadata.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "strconv" + "time" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/deployplan" @@ -20,6 +21,84 @@ import ( "github.com/google/uuid" ) +const defaultHeartbeatInterval = 30 * time.Second + +type metadataServiceLock struct { + b *bundle.Bundle + versionType tmpdms.VersionType + + svc *tmpdms.DeploymentMetadataAPI + deploymentID string + versionID string + + stopHeartbeat func() +} + +func newMetadataServiceLock(b *bundle.Bundle, versionType tmpdms.VersionType) *metadataServiceLock { + return &metadataServiceLock{b: b, versionType: versionType} +} + +func (l *metadataServiceLock) Acquire(ctx context.Context) error { + svc, err := tmpdms.NewDeploymentMetadataAPI(l.b.WorkspaceClient()) + if err != nil { + return fmt.Errorf("failed to create metadata service client: %w", err) + } + l.svc = svc + + deploymentID, versionID, err := acquireLock(ctx, l.b, svc, l.versionType) + if err != nil { + return err + } + + l.deploymentID = deploymentID + l.versionID = versionID + l.stopHeartbeat = startHeartbeat(ctx, svc, deploymentID, versionID) + + l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) + return nil +} + +func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStatus) error { + if l.stopHeartbeat != nil { + l.stopHeartbeat() + } + + reason := tmpdms.VersionCompleteSuccess + if status == DeploymentFailure { + reason = tmpdms.VersionCompleteFailure + } + + // Use a separate context for cleanup so the lock is released even if the + // parent context was cancelled (e.g. user hit Ctrl+C). + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) //nolint:gocritic // Intentional: cleanup must survive parent cancellation + defer cancel() + + _, completeErr := l.svc.CompleteVersion(cleanupCtx, tmpdms.CompleteVersionRequest{ + DeploymentID: l.deploymentID, + VersionID: l.versionID, + Name: fmt.Sprintf("deployments/%s/versions/%s", l.deploymentID, l.versionID), + CompletionReason: reason, + }) + if completeErr != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) + } else { + log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) + } + + // For destroy operations, delete the deployment record after + // successfully releasing the lock. + if status == DeploymentSuccess && l.versionType == tmpdms.VersionTypeDestroy { + _, deleteErr := l.svc.DeleteDeployment(cleanupCtx, tmpdms.DeleteDeploymentRequest{ + DeploymentID: l.deploymentID, + }) + if deleteErr != nil { + log.Warnf(ctx, "Failed to delete deployment: %v", deleteErr) + } + } + + return completeErr +} + // acquireLock implements the lock acquisition protocol using the deployment // metadata service: read lineage, ensure deployment, create version. func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID, versionID string, err error) { @@ -163,6 +242,36 @@ func planActionToOperationAction(action deployplan.ActionType) (tmpdms.Operation } } +// startHeartbeat starts a background goroutine that sends heartbeats to keep +// the deployment lock alive. Returns a cancel function to stop the heartbeat. +func startHeartbeat(ctx context.Context, svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) context.CancelFunc { + ctx, cancel := context.WithCancel(ctx) + + go func() { + ticker := time.NewTicker(defaultHeartbeatInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + _, err := svc.Heartbeat(ctx, tmpdms.HeartbeatRequest{ + DeploymentID: deploymentID, + VersionID: versionID, + }) + if err != nil { + log.Warnf(ctx, "Failed to send deployment heartbeat: %v", err) + } else { + log.Debugf(ctx, "Deployment heartbeat sent for deployment=%s version=%s", deploymentID, versionID) + } + } + } + }() + + return cancel +} + // isAlreadyExists checks if an error indicates the resource already exists (HTTP 409). func isAlreadyExists(err error) bool { var apiErr *apierr.APIError diff --git a/bundle/deploy/lock/heartbeat.go b/bundle/deploy/lock/heartbeat.go deleted file mode 100644 index 4199abd0f5..0000000000 --- a/bundle/deploy/lock/heartbeat.go +++ /dev/null @@ -1,41 +0,0 @@ -package lock - -import ( - "context" - "time" - - "github.com/databricks/cli/libs/log" - "github.com/databricks/cli/libs/tmpdms" -) - -const defaultHeartbeatInterval = 30 * time.Second - -// startHeartbeat starts a background goroutine that sends heartbeats to keep -// the deployment lock alive. Returns a cancel function to stop the heartbeat. -func startHeartbeat(ctx context.Context, svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) context.CancelFunc { - ctx, cancel := context.WithCancel(ctx) - - go func() { - ticker := time.NewTicker(defaultHeartbeatInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - _, err := svc.Heartbeat(ctx, tmpdms.HeartbeatRequest{ - DeploymentID: deploymentID, - VersionID: versionID, - }) - if err != nil { - log.Warnf(ctx, "Failed to send deployment heartbeat: %v", err) - } else { - log.Debugf(ctx, "Deployment heartbeat sent for deployment=%s version=%s", deploymentID, versionID) - } - } - } - }() - - return cancel -} diff --git a/bundle/deploy/lock/metadata_service.go b/bundle/deploy/lock/metadata_service.go deleted file mode 100644 index 33d7be0a83..0000000000 --- a/bundle/deploy/lock/metadata_service.go +++ /dev/null @@ -1,87 +0,0 @@ -package lock - -import ( - "context" - "fmt" - "time" - - "github.com/databricks/cli/bundle" - "github.com/databricks/cli/libs/log" - "github.com/databricks/cli/libs/tmpdms" -) - -type metadataServiceLock struct { - b *bundle.Bundle - versionType tmpdms.VersionType - - svc *tmpdms.DeploymentMetadataAPI - deploymentID string - versionID string - - stopHeartbeat func() -} - -func newMetadataServiceLock(b *bundle.Bundle, versionType tmpdms.VersionType) *metadataServiceLock { - return &metadataServiceLock{b: b, versionType: versionType} -} - -func (l *metadataServiceLock) Acquire(ctx context.Context) error { - svc, err := tmpdms.NewDeploymentMetadataAPI(l.b.WorkspaceClient()) - if err != nil { - return fmt.Errorf("failed to create metadata service client: %w", err) - } - l.svc = svc - - deploymentID, versionID, err := acquireLock(ctx, l.b, svc, l.versionType) - if err != nil { - return err - } - - l.deploymentID = deploymentID - l.versionID = versionID - l.stopHeartbeat = startHeartbeat(ctx, svc, deploymentID, versionID) - - l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) - return nil -} - -func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStatus) error { - if l.stopHeartbeat != nil { - l.stopHeartbeat() - } - - reason := tmpdms.VersionCompleteSuccess - if status == DeploymentFailure { - reason = tmpdms.VersionCompleteFailure - } - - // Use a separate context for cleanup so the lock is released even if the - // parent context was cancelled (e.g. user hit Ctrl+C). - cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) //nolint:gocritic // Intentional: cleanup must survive parent cancellation - defer cancel() - - _, completeErr := l.svc.CompleteVersion(cleanupCtx, tmpdms.CompleteVersionRequest{ - DeploymentID: l.deploymentID, - VersionID: l.versionID, - Name: fmt.Sprintf("deployments/%s/versions/%s", l.deploymentID, l.versionID), - CompletionReason: reason, - }) - if completeErr != nil { - log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) - } else { - log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) - } - - // For destroy operations, delete the deployment record after - // successfully releasing the lock. - if status == DeploymentSuccess && l.versionType == tmpdms.VersionTypeDestroy { - _, deleteErr := l.svc.DeleteDeployment(cleanupCtx, tmpdms.DeleteDeploymentRequest{ - DeploymentID: l.deploymentID, - }) - if deleteErr != nil { - log.Warnf(ctx, "Failed to delete deployment: %v", deleteErr) - } - } - - return completeErr -} diff --git a/bundle/deploy/lock/filesystem.go b/bundle/deploy/lock/workspace_filesystem.go similarity index 100% rename from bundle/deploy/lock/filesystem.go rename to bundle/deploy/lock/workspace_filesystem.go From 89e02c1bda9078617a3d5b94acec0778c4f2dc91 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 18:10:49 +0000 Subject: [PATCH 17/25] Report INITIAL_REGISTER operations for existing resources on first DMS deployment When the metadata service is used for the first time (version ID is "1"), resources with Skip actions are reported as INITIAL_REGISTER operations instead of being silently ignored. This registers all pre-existing resources in the metadata service on the initial deployment. Co-authored-by: Isaac --- bundle/deploy/lock/deployment_metadata_service.go | 12 ++++++++---- bundle/direct/bundle_apply.go | 10 ++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/bundle/deploy/lock/deployment_metadata_service.go b/bundle/deploy/lock/deployment_metadata_service.go index 1e9e9765b2..922540cea2 100644 --- a/bundle/deploy/lock/deployment_metadata_service.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -54,7 +54,8 @@ func (l *metadataServiceLock) Acquire(ctx context.Context) error { l.versionID = versionID l.stopHeartbeat = startHeartbeat(ctx, svc, deploymentID, versionID) - l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) + initialRegistration := versionID == "1" + l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID, initialRegistration) return nil } @@ -175,7 +176,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe // makeOperationReporter returns an OperationReporter that reports each resource // operation (success or failure) to the metadata service. Reporting failures are // logged as warnings and do not affect the deploy outcome. -func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.OperationReporter { +func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string, initialRegistration bool) direct.OperationReporter { return func( ctx context.Context, resourceKey string, @@ -183,7 +184,7 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers action deployplan.ActionType, operationErr error, ) { - actionType, mapErr := planActionToOperationAction(action) + actionType, mapErr := planActionToOperationAction(action, initialRegistration) if mapErr != nil { log.Warnf(ctx, "Skipping operation report for resource %s: %v", resourceKey, mapErr) return @@ -221,9 +222,12 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers // planActionToOperationAction maps a deploy plan action to a metadata service // operation action type. It returns an error for actions that are not supported // by the backend. No-op actions like Skip return ("", nil) and should be ignored. -func planActionToOperationAction(action deployplan.ActionType) (tmpdms.OperationActionType, error) { +func planActionToOperationAction(action deployplan.ActionType, initialRegistration bool) (tmpdms.OperationActionType, error) { switch action { case deployplan.Skip: + if initialRegistration { + return tmpdms.OperationActionTypeInitRegister, nil + } return "", nil case deployplan.Create: return tmpdms.OperationActionTypeCreate, nil diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 0134f5f553..19c0de5e23 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -106,6 +106,16 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa // We don't keep NewState around for 'skip' nodes + // Report skip actions to the metadata service. On initial registration, + // these are recorded as INITIAL_REGISTER operations. + if action == deployplan.Skip && b.OperationReporter != nil { + var resourceID string + if dbentry, ok := b.StateDB.GetResourceEntry(resourceKey); ok { + resourceID = dbentry.ID + } + b.OperationReporter(ctx, resourceKey, resourceID, action, nil) + } + if action != deployplan.Skip { if !b.resolveReferences(ctx, resourceKey, entry, errorPrefix, false) { return false From cfcec218aecf915945d9c0b129977facabf925ea Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 7 Apr 2026 18:18:02 +0000 Subject: [PATCH 18/25] Add acceptance test for INITIAL_REGISTER operation type Tests the scenario where a user enables the metadata service for the first time: pre-existing resources that have no changes (Skip action) are reported as OPERATION_ACTION_TYPE_INITIAL_REGISTER. Co-authored-by: Isaac --- .../initial-register/databricks.yml | 7 +++ .../initial-register/out.test.toml | 6 ++ .../initial-register/output.txt | 61 +++++++++++++++++++ .../metadata-service/initial-register/script | 11 ++++ .../initial-register/test.toml | 5 ++ 5 files changed, 90 insertions(+) create mode 100644 acceptance/bundle/deploy/metadata-service/initial-register/databricks.yml create mode 100644 acceptance/bundle/deploy/metadata-service/initial-register/out.test.toml create mode 100644 acceptance/bundle/deploy/metadata-service/initial-register/output.txt create mode 100644 acceptance/bundle/deploy/metadata-service/initial-register/script create mode 100644 acceptance/bundle/deploy/metadata-service/initial-register/test.toml diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/databricks.yml b/acceptance/bundle/deploy/metadata-service/initial-register/databricks.yml new file mode 100644 index 0000000000..04d35740b6 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/initial-register/databricks.yml @@ -0,0 +1,7 @@ +bundle: + name: initial-register-test + +resources: + jobs: + test_job: + name: test-job diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/out.test.toml b/acceptance/bundle/deploy/metadata-service/initial-register/out.test.toml new file mode 100644 index 0000000000..991ce54dfa --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/initial-register/out.test.toml @@ -0,0 +1,6 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] + DATABRICKS_BUNDLE_MANAGED_STATE = ["false"] diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/output.txt b/acceptance/bundle/deploy/metadata-service/initial-register/output.txt new file mode 100644 index 0000000000..b48d0b6630 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/initial-register/output.txt @@ -0,0 +1,61 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/initial-register-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/initial-register-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> print_requests.py --get //bundle +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "1" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": "VERSION_TYPE_DEPLOY", + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "q": { + "resource_key": "resources.jobs.test_job" + }, + "body": { + "resource_key": "resources.jobs.test_job", + "action_type": "OPERATION_ACTION_TYPE_INITIAL_REGISTER", + "resource_id": "[NUMID]", + "status": "OPERATION_STATUS_SUCCEEDED" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "body": { + "name": "deployments/[UUID]/versions/1", + "completion_reason": "VERSION_COMPLETE_SUCCESS" + } +} diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/script b/acceptance/bundle/deploy/metadata-service/initial-register/script new file mode 100644 index 0000000000..a971338f43 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/initial-register/script @@ -0,0 +1,11 @@ +# First deploy without metadata service - creates the job using filesystem lock. +trace $CLI bundle deploy + +# Enable metadata service and deploy again. +# Since this is the first DMS deployment (version 1) and the job hasn't changed +# (Skip action), it should be reported as INITIAL_REGISTER. +export DATABRICKS_BUNDLE_MANAGED_STATE=true +trace $CLI bundle deploy + +# Print metadata service requests from the second deploy. +trace print_requests.py --get //bundle diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/test.toml b/acceptance/bundle/deploy/metadata-service/initial-register/test.toml new file mode 100644 index 0000000000..270de6ce44 --- /dev/null +++ b/acceptance/bundle/deploy/metadata-service/initial-register/test.toml @@ -0,0 +1,5 @@ +# Override parent: start without managed state so the first deploy uses filesystem lock. +# The script enables managed state before the second deploy. +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_MANAGED_STATE = ["false"] +RecordRequests = true From ef0335e767ff41b44bbbd67499f003a78994a991 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 8 Apr 2026 10:52:24 +0000 Subject: [PATCH 19/25] Rename filesystemLock to workspaceFilesystemLock Co-authored-by: Isaac --- bundle/deploy/lock/lock.go | 4 ++-- bundle/deploy/lock/workspace_filesystem.go | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bundle/deploy/lock/lock.go b/bundle/deploy/lock/lock.go index 687e2d947f..c51b17c912 100644 --- a/bundle/deploy/lock/lock.go +++ b/bundle/deploy/lock/lock.go @@ -38,7 +38,7 @@ type DeploymentLock interface { // NewDeploymentLock returns a DeploymentLock implementation based on the // current environment. If managed state is enabled and the goal maps to a // supported version type, a metadata service lock is returned. Otherwise, -// a filesystem lock is returned. +// a workspace filesystem lock is returned. func NewDeploymentLock(ctx context.Context, b *bundle.Bundle, goal Goal) DeploymentLock { useManagedState, _ := env.ManagedState(ctx) if useManagedState == "true" { @@ -47,7 +47,7 @@ func NewDeploymentLock(ctx context.Context, b *bundle.Bundle, goal Goal) Deploym return newMetadataServiceLock(b, versionType) } } - return newFilesystemLock(b, goal) + return newWorkspaceFilesystemLock(b, goal) } func goalToVersionType(goal Goal) (tmpdms.VersionType, bool) { diff --git a/bundle/deploy/lock/workspace_filesystem.go b/bundle/deploy/lock/workspace_filesystem.go index 7f81451ea7..e84f327d84 100644 --- a/bundle/deploy/lock/workspace_filesystem.go +++ b/bundle/deploy/lock/workspace_filesystem.go @@ -8,16 +8,16 @@ import ( "github.com/databricks/cli/libs/log" ) -type filesystemLock struct { +type workspaceFilesystemLock struct { b *bundle.Bundle goal Goal } -func newFilesystemLock(b *bundle.Bundle, goal Goal) *filesystemLock { - return &filesystemLock{b: b, goal: goal} +func newWorkspaceFilesystemLock(b *bundle.Bundle, goal Goal) *workspaceFilesystemLock { + return &workspaceFilesystemLock{b: b, goal: goal} } -func (l *filesystemLock) Acquire(ctx context.Context) error { +func (l *workspaceFilesystemLock) Acquire(ctx context.Context) error { b := l.b if !b.Config.Bundle.Deployment.Lock.IsEnabled() { @@ -45,7 +45,7 @@ func (l *filesystemLock) Acquire(ctx context.Context) error { return nil } -func (l *filesystemLock) Release(ctx context.Context, _ DeploymentStatus) error { +func (l *workspaceFilesystemLock) Release(ctx context.Context, _ DeploymentStatus) error { b := l.b if !b.Config.Bundle.Deployment.Lock.IsEnabled() { From 0f7c224f06e475f54e940a25d0936f6a05fa3bfc Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 8 Apr 2026 11:06:47 +0000 Subject: [PATCH 20/25] Make operation reporting non-best-effort and log lock release failures - OperationReporter now returns error; callers fail the deployment on reporting errors instead of silently logging warnings. - Log warnings when Release (CompleteVersion) fails in deploy, destroy, bind, and unbind phases. - Remove duplicate warning log from metadataServiceLock.Release since callers now handle the error. - Restore comments in deployCore explaining state push behavior. - Revert engine -> targetEngine rename in deploy.go and destroy.go. - Delete empty destroy_metadata.go placeholder. - Move DMS acceptance tests from bundle/deploy/metadata-service to bundle/dms. - Add release-lock-error acceptance test to verify warning on CompleteVersion failure. Co-authored-by: Isaac --- .../metadata-service => dms}/databricks.yml | 0 .../deploy-error/databricks.yml | 0 .../deploy-error/out.test.toml | 0 .../deploy-error/output.txt | 0 .../deploy-error/script | 0 .../deploy-error/test.toml | 0 .../initial-register/databricks.yml | 0 .../initial-register/out.test.toml | 0 .../initial-register/output.txt | 0 .../initial-register/script | 0 .../initial-register/test.toml | 0 .../metadata-service => dms}/out.test.toml | 0 .../metadata-service => dms}/output.txt | 0 .../dms/release-lock-error/databricks.yml | 11 ++++ .../dms/release-lock-error/out.test.toml | 6 ++ .../bundle/dms/release-lock-error/output.txt | 56 +++++++++++++++++++ .../bundle/dms/release-lock-error/script | 8 +++ .../bundle/dms/release-lock-error/test.toml | 2 + .../{deploy/metadata-service => dms}/script | 0 .../metadata-service => dms}/test.toml | 0 .../lock/deployment_metadata_service.go | 23 ++++---- bundle/direct/bundle_apply.go | 16 +++++- bundle/direct/pkg.go | 6 +- bundle/phases/bind.go | 12 +++- bundle/phases/deploy.go | 28 ++++++---- bundle/phases/destroy.go | 18 +++--- bundle/phases/destroy_metadata.go | 3 - libs/testserver/deployment_metadata.go | 10 ++++ libs/testserver/server.go | 2 +- 29 files changed, 158 insertions(+), 43 deletions(-) rename acceptance/bundle/{deploy/metadata-service => dms}/databricks.yml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/deploy-error/databricks.yml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/deploy-error/out.test.toml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/deploy-error/output.txt (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/deploy-error/script (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/deploy-error/test.toml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/initial-register/databricks.yml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/initial-register/out.test.toml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/initial-register/output.txt (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/initial-register/script (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/initial-register/test.toml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/out.test.toml (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/output.txt (100%) create mode 100644 acceptance/bundle/dms/release-lock-error/databricks.yml create mode 100644 acceptance/bundle/dms/release-lock-error/out.test.toml create mode 100644 acceptance/bundle/dms/release-lock-error/output.txt create mode 100644 acceptance/bundle/dms/release-lock-error/script create mode 100644 acceptance/bundle/dms/release-lock-error/test.toml rename acceptance/bundle/{deploy/metadata-service => dms}/script (100%) rename acceptance/bundle/{deploy/metadata-service => dms}/test.toml (100%) delete mode 100644 bundle/phases/destroy_metadata.go diff --git a/acceptance/bundle/deploy/metadata-service/databricks.yml b/acceptance/bundle/dms/databricks.yml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/databricks.yml rename to acceptance/bundle/dms/databricks.yml diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/databricks.yml b/acceptance/bundle/dms/deploy-error/databricks.yml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/deploy-error/databricks.yml rename to acceptance/bundle/dms/deploy-error/databricks.yml diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/out.test.toml b/acceptance/bundle/dms/deploy-error/out.test.toml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/deploy-error/out.test.toml rename to acceptance/bundle/dms/deploy-error/out.test.toml diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/output.txt b/acceptance/bundle/dms/deploy-error/output.txt similarity index 100% rename from acceptance/bundle/deploy/metadata-service/deploy-error/output.txt rename to acceptance/bundle/dms/deploy-error/output.txt diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/script b/acceptance/bundle/dms/deploy-error/script similarity index 100% rename from acceptance/bundle/deploy/metadata-service/deploy-error/script rename to acceptance/bundle/dms/deploy-error/script diff --git a/acceptance/bundle/deploy/metadata-service/deploy-error/test.toml b/acceptance/bundle/dms/deploy-error/test.toml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/deploy-error/test.toml rename to acceptance/bundle/dms/deploy-error/test.toml diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/databricks.yml b/acceptance/bundle/dms/initial-register/databricks.yml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/initial-register/databricks.yml rename to acceptance/bundle/dms/initial-register/databricks.yml diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/out.test.toml b/acceptance/bundle/dms/initial-register/out.test.toml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/initial-register/out.test.toml rename to acceptance/bundle/dms/initial-register/out.test.toml diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/output.txt b/acceptance/bundle/dms/initial-register/output.txt similarity index 100% rename from acceptance/bundle/deploy/metadata-service/initial-register/output.txt rename to acceptance/bundle/dms/initial-register/output.txt diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/script b/acceptance/bundle/dms/initial-register/script similarity index 100% rename from acceptance/bundle/deploy/metadata-service/initial-register/script rename to acceptance/bundle/dms/initial-register/script diff --git a/acceptance/bundle/deploy/metadata-service/initial-register/test.toml b/acceptance/bundle/dms/initial-register/test.toml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/initial-register/test.toml rename to acceptance/bundle/dms/initial-register/test.toml diff --git a/acceptance/bundle/deploy/metadata-service/out.test.toml b/acceptance/bundle/dms/out.test.toml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/out.test.toml rename to acceptance/bundle/dms/out.test.toml diff --git a/acceptance/bundle/deploy/metadata-service/output.txt b/acceptance/bundle/dms/output.txt similarity index 100% rename from acceptance/bundle/deploy/metadata-service/output.txt rename to acceptance/bundle/dms/output.txt diff --git a/acceptance/bundle/dms/release-lock-error/databricks.yml b/acceptance/bundle/dms/release-lock-error/databricks.yml new file mode 100644 index 0000000000..94323b84d9 --- /dev/null +++ b/acceptance/bundle/dms/release-lock-error/databricks.yml @@ -0,0 +1,11 @@ +bundle: + name: dms-release-lock-error + +targets: + fail-complete: + default: true + +resources: + jobs: + test_job: + name: test-job diff --git a/acceptance/bundle/dms/release-lock-error/out.test.toml b/acceptance/bundle/dms/release-lock-error/out.test.toml new file mode 100644 index 0000000000..6ce208a048 --- /dev/null +++ b/acceptance/bundle/dms/release-lock-error/out.test.toml @@ -0,0 +1,6 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] + DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] diff --git a/acceptance/bundle/dms/release-lock-error/output.txt b/acceptance/bundle/dms/release-lock-error/output.txt new file mode 100644 index 0000000000..253593e8d9 --- /dev/null +++ b/acceptance/bundle/dms/release-lock-error/output.txt @@ -0,0 +1,56 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/dms-release-lock-error/fail-complete/files... +Deploying resources... +Updating deployment state... +Deployment complete! +Warn: Failed to release deployment lock: complete version: simulated complete version failure + +>>> print_requests.py --get //bundle +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "fail-complete" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "1" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": "VERSION_TYPE_DEPLOY", + "target_name": "fail-complete" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "q": { + "resource_key": "resources.jobs.test_job" + }, + "body": { + "resource_key": "resources.jobs.test_job", + "action_type": "OPERATION_ACTION_TYPE_CREATE", + "resource_id": "[NUMID]", + "status": "OPERATION_STATUS_SUCCEEDED" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "body": { + "name": "deployments/[UUID]/versions/1", + "completion_reason": "VERSION_COMPLETE_SUCCESS" + } +} diff --git a/acceptance/bundle/dms/release-lock-error/script b/acceptance/bundle/dms/release-lock-error/script new file mode 100644 index 0000000000..deff401bef --- /dev/null +++ b/acceptance/bundle/dms/release-lock-error/script @@ -0,0 +1,8 @@ +# Deploy with the metadata service enabled. +# The target name "fail-complete" triggers a simulated error on the +# CompleteVersion endpoint (release lock), so deploy should warn about +# the failed lock release. +trace $CLI bundle deploy + +# Print the metadata service requests to verify the lock release was attempted. +trace print_requests.py --get //bundle diff --git a/acceptance/bundle/dms/release-lock-error/test.toml b/acceptance/bundle/dms/release-lock-error/test.toml new file mode 100644 index 0000000000..1910e96135 --- /dev/null +++ b/acceptance/bundle/dms/release-lock-error/test.toml @@ -0,0 +1,2 @@ +# Override target to "fail-complete" which makes the test server's +# CompleteVersion endpoint return an error, simulating a release failure. diff --git a/acceptance/bundle/deploy/metadata-service/script b/acceptance/bundle/dms/script similarity index 100% rename from acceptance/bundle/deploy/metadata-service/script rename to acceptance/bundle/dms/script diff --git a/acceptance/bundle/deploy/metadata-service/test.toml b/acceptance/bundle/dms/test.toml similarity index 100% rename from acceptance/bundle/deploy/metadata-service/test.toml rename to acceptance/bundle/dms/test.toml diff --git a/bundle/deploy/lock/deployment_metadata_service.go b/bundle/deploy/lock/deployment_metadata_service.go index 922540cea2..940e4943da 100644 --- a/bundle/deploy/lock/deployment_metadata_service.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -80,9 +80,7 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat Name: fmt.Sprintf("deployments/%s/versions/%s", l.deploymentID, l.versionID), CompletionReason: reason, }) - if completeErr != nil { - log.Warnf(ctx, "Failed to release deployment lock: %v", completeErr) - } else { + if completeErr == nil { log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) } @@ -174,8 +172,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe } // makeOperationReporter returns an OperationReporter that reports each resource -// operation (success or failure) to the metadata service. Reporting failures are -// logged as warnings and do not affect the deploy outcome. +// operation (success or failure) to the deployment metadata service. func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string, initialRegistration bool) direct.OperationReporter { return func( ctx context.Context, @@ -183,14 +180,13 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers resourceID string, action deployplan.ActionType, operationErr error, - ) { - actionType, mapErr := planActionToOperationAction(action, initialRegistration) - if mapErr != nil { - log.Warnf(ctx, "Skipping operation report for resource %s: %v", resourceKey, mapErr) - return + ) error { + actionType, err := planActionToOperationAction(action, initialRegistration) + if err != nil { + return fmt.Errorf("mapping action for resource %s: %w", resourceKey, err) } if actionType == "" { - return + return nil } status := tmpdms.OperationStatusSucceeded @@ -200,7 +196,7 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers errorMessage = operationErr.Error() } - _, err := svc.CreateOperation(ctx, tmpdms.CreateOperationRequest{ + _, err = svc.CreateOperation(ctx, tmpdms.CreateOperationRequest{ DeploymentID: deploymentID, VersionID: versionID, Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), @@ -214,8 +210,9 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers }, }) if err != nil { - log.Warnf(ctx, "Failed to report operation for resource %s: %v", resourceKey, err) + return fmt.Errorf("reporting operation for resource %s: %w", resourceKey, err) } + return nil } } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 19c0de5e23..bad6701a6a 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -95,7 +95,11 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa err = d.Destroy(ctx, &b.StateDB) if b.OperationReporter != nil { - b.OperationReporter(ctx, resourceKey, deleteResourceID, action, err) + reportErr := b.OperationReporter(ctx, resourceKey, deleteResourceID, action, err) + if reportErr != nil { + logdiag.LogError(ctx, fmt.Errorf("%s: failed to report operation: %w", errorPrefix, reportErr)) + return false + } } if err != nil { logdiag.LogError(ctx, fmt.Errorf("%s: %w", errorPrefix, err)) @@ -113,7 +117,10 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa if dbentry, ok := b.StateDB.GetResourceEntry(resourceKey); ok { resourceID = dbentry.ID } - b.OperationReporter(ctx, resourceKey, resourceID, action, nil) + if reportErr := b.OperationReporter(ctx, resourceKey, resourceID, action, nil); reportErr != nil { + logdiag.LogError(ctx, fmt.Errorf("%s: failed to report operation: %w", errorPrefix, reportErr)) + return false + } } if action != deployplan.Skip { @@ -152,7 +159,10 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa if dbentry, ok := b.StateDB.GetResourceEntry(resourceKey); ok { resourceID = dbentry.ID } - b.OperationReporter(ctx, resourceKey, resourceID, action, err) + if reportErr := b.OperationReporter(ctx, resourceKey, resourceID, action, err); reportErr != nil { + logdiag.LogError(ctx, fmt.Errorf("%s: failed to report operation: %w", errorPrefix, reportErr)) + return false + } } if err != nil { diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 97ef49eedc..34ada4fd56 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -39,15 +39,15 @@ type DeploymentUnit struct { // OperationReporter is called after each resource operation (success or failure) // to report it to the deployment metadata service. If operationErr is non-nil the -// operation is recorded as failed with the error message. It is best-effort: -// reporting failures are logged as warnings by the caller. +// operation is recorded as failed with the error message. Returns an error if +// reporting fails; callers must treat this as a deployment failure. type OperationReporter func( ctx context.Context, resourceKey string, resourceID string, action deployplan.ActionType, operationErr error, -) +) error // DeploymentBundle holds everything needed to deploy a bundle type DeploymentBundle struct { diff --git a/bundle/phases/bind.go b/bundle/phases/bind.go index aa30415c86..22635e57a7 100644 --- a/bundle/phases/bind.go +++ b/bundle/phases/bind.go @@ -32,7 +32,11 @@ func Bind(ctx context.Context, b *bundle.Bundle, opts *terraform.BindOptions) { logdiag.LogError(ctx, err) return } - defer func() { _ = dl.Release(ctx, lock.DeploymentSuccess) }() + defer func() { + if err := dl.Release(ctx, lock.DeploymentSuccess); err != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", err) + } + }() if engine.IsDirect() { // Direct engine: import into temp state, run plan, check for changes @@ -132,7 +136,11 @@ func Unbind(ctx context.Context, b *bundle.Bundle, bundleType, tfResourceType, r logdiag.LogError(ctx, err) return } - defer func() { _ = dl.Release(ctx, lock.DeploymentSuccess) }() + defer func() { + if err := dl.Release(ctx, lock.DeploymentSuccess); err != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", err) + } + }() if engine.IsDirect() { groupName, ok := terraform.TerraformToGroupName[tfResourceType] diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 76e5a5847c..8b1eca3284 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -98,25 +98,31 @@ func approvalForDeploy(ctx context.Context, b *bundle.Bundle, plan *deployplan.P return approved, nil } -func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType) { +func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, engine engine.EngineType) { + // Core mutators that CRUD resources and modify deployment state. These + // mutators need informed consent if they are potentially destructive. cmdio.LogString(ctx, "Deploying resources...") - if targetEngine.IsDirect() { + if engine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) } else { bundle.ApplyContext(ctx, b, terraform.Apply()) } - statemgmt.PushResourcesState(ctx, b, targetEngine) + // Even if deployment failed, there might be updates in state that we need + // to upload. For the filesystem-based state, this uploads the state file to + // the workspace. For the metadata service, this is a no-op since operation + // results are reported inline during deployment. + statemgmt.PushResourcesState(ctx, b, engine) if logdiag.HasError(ctx) { return } bundle.ApplySeqContext(ctx, b, - statemgmt.Load(targetEngine), + statemgmt.Load(engine), metadata.Compute(), metadata.Upload(), - statemgmt.UploadStateForYamlSync(targetEngine), + statemgmt.UploadStateForYamlSync(engine), ) if !logdiag.HasError(ctx) { @@ -136,12 +142,12 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li // The deploy phase deploys artifacts and resources. // If readPlanPath is provided, the plan is loaded from that file instead of being calculated. -func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, targetEngine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { +func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, engine engine.EngineType, libs map[string][]libraries.LocationToUpdate, plan *deployplan.Plan) { useMetadataService, _ := env.ManagedState(ctx) if useMetadataService == "true" { log.Info(ctx, "Phase: deploy (with metadata service)") - if !targetEngine.IsDirect() { + if !engine.IsDirect() { logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) return } @@ -167,7 +173,9 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand if failed || logdiag.HasError(ctx) { status = lock.DeploymentFailure } - _ = dl.Release(ctx, status) + if err := dl.Release(ctx, status); err != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", err) + } }() uploadLibraries(ctx, b, libs) @@ -199,7 +207,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } } else { - plan = RunPlan(ctx, b, targetEngine) + plan = RunPlan(ctx, b, engine) } if logdiag.HasError(ctx) { failed = true @@ -217,7 +225,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - deployCore(ctx, b, plan, targetEngine) + deployCore(ctx, b, plan, engine) if logdiag.HasError(ctx) { failed = true return diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 3109f6fc27..07d3cc8ddd 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -95,8 +95,8 @@ func approvalForDestroy(ctx context.Context, b *bundle.Bundle, plan *deployplan. return approved, nil } -func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, targetEngine engine.EngineType) { - if targetEngine.IsDirect() { +func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, engine engine.EngineType) { + if engine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(), plan, direct.MigrateMode(false)) } else { bundle.ApplyContext(ctx, b, terraform.Apply()) @@ -114,12 +114,12 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, t } // The destroy phase deletes artifacts and resources. -func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineType) { +func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { useMetadataService, _ := env.ManagedState(ctx) if useMetadataService == "true" { log.Info(ctx, "Phase: destroy (with metadata service)") - if !targetEngine.IsDirect() { + if !engine.IsDirect() { logdiag.LogError(ctx, errors.New("managed state is only supported with the direct deployment engine")) return } @@ -150,10 +150,12 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy if failed || logdiag.HasError(ctx) { status = lock.DeploymentFailure } - _ = dl.Release(ctx, status) + if err := dl.Release(ctx, status); err != nil { + log.Warnf(ctx, "Failed to release deployment lock: %v", err) + } }() - if !targetEngine.IsDirect() { + if !engine.IsDirect() { bundle.ApplySeqContext(ctx, b, // We need to resolve artifact variable (how we do it in build phase) // because some of the to-be-destroyed resource might use this variable. @@ -173,7 +175,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy } var plan *deployplan.Plan - if targetEngine.IsDirect() { + if engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) plan, err = b.DeploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(), nil, localPath) if err != nil { @@ -205,7 +207,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, targetEngine engine.EngineTy } if hasApproval { - destroyCore(ctx, b, plan, targetEngine) + destroyCore(ctx, b, plan, engine) if logdiag.HasError(ctx) { failed = true } diff --git a/bundle/phases/destroy_metadata.go b/bundle/phases/destroy_metadata.go deleted file mode 100644 index 6cfa47ecc0..0000000000 --- a/bundle/phases/destroy_metadata.go +++ /dev/null @@ -1,3 +0,0 @@ -// This file is intentionally left minimal. The destroy flow with metadata service -// support has been unified into destroy.go using the deployMetadataLock helper. -package phases diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go index f3a43863c3..ef66507cd0 100644 --- a/libs/testserver/deployment_metadata.go +++ b/libs/testserver/deployment_metadata.go @@ -279,6 +279,16 @@ func (s *FakeWorkspace) DeploymentMetadataCompleteVersion(req Request, deploymen defer s.LockUnlock()() state := s.deploymentMetadata + + // Allow tests to simulate a complete version failure. If the deployment's + // target_name is "fail-complete", return a 500 error. + if deployment, ok := state.deployments[deploymentID]; ok && deployment.TargetName == "fail-complete" { + return Response{ + StatusCode: http.StatusInternalServerError, + Body: map[string]string{"error_code": "INTERNAL_ERROR", "message": "simulated complete version failure"}, + } + } + versionKey := deploymentID + "/" + versionID version, ok := state.versions[versionKey] if !ok { diff --git a/libs/testserver/server.go b/libs/testserver/server.go index 2d7048dc8d..54d291fb7e 100644 --- a/libs/testserver/server.go +++ b/libs/testserver/server.go @@ -305,7 +305,7 @@ func (s *Server) Handle(method, path string, handler HandlerFunc) { var resp EncodedResponse - if bytes.Contains(request.Body, []byte("INJECT_ERROR")) { + if bytes.Contains(request.Body, []byte("INJECT_ERROR")) || strings.Contains(r.URL.Path, "INJECT_ERROR") { resp = EncodedResponse{ StatusCode: 500, Body: []byte("INJECTED"), From 0e5fdd3abec3c4af53c026c77e39862c48395d6d Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 8 Apr 2026 11:27:39 +0000 Subject: [PATCH 21/25] Read deployment ID from workspace and skip state push for DMS - Resolve deployment ID from workspace _deployment_id file, falling back to resources.json lineage for first-time DMS migration. - Write _deployment_id to workspace state directory so future deploys (from any machine) can discover the deployment ID. - On first successful DMS deploy, back up resources.json to resources.json.backup in workspace. - Skip PushResourcesState when DMS is enabled since operations are reported inline via OperationReporter. - Deploy and destroy now correctly share the same deployment ID, with sequential version numbers. Co-authored-by: Isaac --- acceptance/bundle/dms/deploy-error/output.txt | 1 - .../bundle/dms/initial-register/output.txt | 1 - acceptance/bundle/dms/output.txt | 9 +- .../bundle/dms/release-lock-error/output.txt | 1 - .../lock/deployment_metadata_service.go | 183 +++++++++++++++--- bundle/statemgmt/state_push.go | 8 + 6 files changed, 165 insertions(+), 38 deletions(-) diff --git a/acceptance/bundle/dms/deploy-error/output.txt b/acceptance/bundle/dms/deploy-error/output.txt index db0a3d43e1..3f8400a75e 100644 --- a/acceptance/bundle/dms/deploy-error/output.txt +++ b/acceptance/bundle/dms/deploy-error/output.txt @@ -9,7 +9,6 @@ HTTP Status: 400 Bad Request API error_code: INVALID_PARAMETER_VALUE API message: Invalid job configuration. -Updating deployment state... >>> print_requests.py --get //bundle { diff --git a/acceptance/bundle/dms/initial-register/output.txt b/acceptance/bundle/dms/initial-register/output.txt index b48d0b6630..5a48a1b5cf 100644 --- a/acceptance/bundle/dms/initial-register/output.txt +++ b/acceptance/bundle/dms/initial-register/output.txt @@ -8,7 +8,6 @@ Deployment complete! >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/initial-register-test/default/files... Deploying resources... -Updating deployment state... Deployment complete! >>> print_requests.py --get //bundle diff --git a/acceptance/bundle/dms/output.txt b/acceptance/bundle/dms/output.txt index 1c25636f46..7c63c5a89d 100644 --- a/acceptance/bundle/dms/output.txt +++ b/acceptance/bundle/dms/output.txt @@ -2,7 +2,6 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/metadata-service-test/default/files... Deploying resources... -Updating deployment state... Deployment complete! >>> print_requests.py --get //bundle @@ -82,7 +81,7 @@ Destroy complete! "method": "POST", "path": "/api/2.0/bundle/deployments/[UUID]/versions", "q": { - "version_id": "1" + "version_id": "2" }, "body": { "cli_version": "[DEV_VERSION]", @@ -92,7 +91,7 @@ Destroy complete! } { "method": "POST", - "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/2/operations", "q": { "resource_key": "resources.jobs.test_job" }, @@ -105,9 +104,9 @@ Destroy complete! } { "method": "POST", - "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/2/complete", "body": { - "name": "deployments/[UUID]/versions/1", + "name": "deployments/[UUID]/versions/2", "completion_reason": "VERSION_COMPLETE_SUCCESS" } } diff --git a/acceptance/bundle/dms/release-lock-error/output.txt b/acceptance/bundle/dms/release-lock-error/output.txt index 253593e8d9..cc08c3326e 100644 --- a/acceptance/bundle/dms/release-lock-error/output.txt +++ b/acceptance/bundle/dms/release-lock-error/output.txt @@ -2,7 +2,6 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/dms-release-lock-error/fail-complete/files... Deploying resources... -Updating deployment state... Deployment complete! Warn: Failed to release deployment lock: complete version: simulated complete version failure diff --git a/bundle/deploy/lock/deployment_metadata_service.go b/bundle/deploy/lock/deployment_metadata_service.go index 940e4943da..fdcc4179c5 100644 --- a/bundle/deploy/lock/deployment_metadata_service.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -2,19 +2,23 @@ package lock import ( "context" + "encoding/json" "errors" "fmt" + "io" + "io/fs" "net/http" - "os" - "path/filepath" "strconv" + "strings" "time" "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/deploy" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/filer" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/tmpdms" "github.com/databricks/databricks-sdk-go/apierr" @@ -32,6 +36,11 @@ type metadataServiceLock struct { versionID string stopHeartbeat func() + + // migratedFromResourcesJSON is true when the deployment ID was seeded from + // resources.json lineage (first DMS deploy). On successful release, this + // triggers backing up resources.json to resources.json.backup. + migratedFromResourcesJSON bool } func newMetadataServiceLock(b *bundle.Bundle, versionType tmpdms.VersionType) *metadataServiceLock { @@ -45,13 +54,14 @@ func (l *metadataServiceLock) Acquire(ctx context.Context) error { } l.svc = svc - deploymentID, versionID, err := acquireLock(ctx, l.b, svc, l.versionType) + deploymentID, migrated, versionID, err := acquireLock(ctx, l.b, svc, l.versionType) if err != nil { return err } l.deploymentID = deploymentID l.versionID = versionID + l.migratedFromResourcesJSON = migrated l.stopHeartbeat = startHeartbeat(ctx, svc, deploymentID, versionID) initialRegistration := versionID == "1" @@ -84,6 +94,12 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) } + // On the first DMS deploy (migrated from resources.json), back up + // resources.json so users can revert to non-DMS deployment if needed. + if completeErr == nil && status == DeploymentSuccess && l.migratedFromResourcesJSON && l.versionType != tmpdms.VersionTypeDestroy { + backupResourcesJSON(cleanupCtx, l.b) + } + // For destroy operations, delete the deployment record after // successfully releasing the lock. if status == DeploymentSuccess && l.versionType == tmpdms.VersionTypeDestroy { @@ -99,28 +115,11 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat } // acquireLock implements the lock acquisition protocol using the deployment -// metadata service: read lineage, ensure deployment, create version. -func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID, versionID string, err error) { - // Read the lineage from resources.json (direct engine state) for the deployment ID. - _, localPath := b.StateFilenameDirect(ctx) - var stateDB dstate.DeploymentState - if openErr := stateDB.Open(localPath); openErr != nil { - return "", "", fmt.Errorf("failed to open resources state: %w", openErr) - } - - deploymentID = stateDB.Data.Lineage - if deploymentID == "" { - deploymentID = uuid.New().String() - } - - // Write the deployment ID to _deployment_id for external tooling. - stateDir := filepath.Dir(localPath) - if mkdirErr := os.MkdirAll(stateDir, 0o755); mkdirErr != nil { - return "", "", fmt.Errorf("failed to create state directory: %w", mkdirErr) - } - deploymentIDPath := filepath.Join(stateDir, "_deployment_id") - if writeErr := os.WriteFile(deploymentIDPath, []byte(deploymentID), 0o600); writeErr != nil { - return "", "", fmt.Errorf("failed to write deployment ID: %w", writeErr) +// metadata service: resolve deployment ID, ensure deployment, create version. +func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID string, migrated bool, versionID string, err error) { + deploymentID, migrated, err = resolveDeploymentID(ctx, b) + if err != nil { + return "", false, "", err } // Ensure the deployment exists in the metadata service. @@ -131,7 +130,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe }, }) if createErr != nil && !isAlreadyExists(createErr) { - return "", "", fmt.Errorf("failed to create deployment: %w", createErr) + return "", false, "", fmt.Errorf("failed to create deployment: %w", createErr) } // Get the deployment to determine the next version ID. @@ -139,7 +138,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe DeploymentID: deploymentID, }) if getErr != nil { - return "", "", fmt.Errorf("failed to get deployment: %w", getErr) + return "", false, "", fmt.Errorf("failed to get deployment: %w", getErr) } if dep.LastVersionID == "" { @@ -147,7 +146,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe } else { lastVersion, parseErr := strconv.ParseInt(dep.LastVersionID, 10, 64) if parseErr != nil { - return "", "", fmt.Errorf("failed to parse last_version_id %q: %w", dep.LastVersionID, parseErr) + return "", false, "", fmt.Errorf("failed to parse last_version_id %q: %w", dep.LastVersionID, parseErr) } versionID = strconv.FormatInt(lastVersion+1, 10) } @@ -164,11 +163,135 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe }, }) if versionErr != nil { - return "", "", fmt.Errorf("failed to acquire deployment lock: %w", versionErr) + return "", false, "", fmt.Errorf("failed to acquire deployment lock: %w", versionErr) } log.Infof(ctx, "Acquired deployment lock: deployment=%s version=%s", deploymentID, version.VersionID) - return deploymentID, versionID, nil + return deploymentID, migrated, versionID, nil +} + +const deploymentIDFilename = "_deployment_id" + +// resolveDeploymentID reads the deployment ID from the workspace state directory. +// It first checks for a _deployment_id file. If not found, it falls back to +// reading the lineage from resources.json (for first-time DMS migration). +// The resolved ID is written to _deployment_id in the workspace for future use. +// The returned bool indicates whether the ID was migrated from resources.json. +func resolveDeploymentID(ctx context.Context, b *bundle.Bundle) (string, bool, error) { + f, err := deploy.StateFiler(b) + if err != nil { + return "", false, fmt.Errorf("failed to create state filer: %w", err) + } + + // Try reading _deployment_id from the workspace state directory. + deploymentID, err := readDeploymentIDFile(ctx, f) + if err != nil { + return "", false, err + } + if deploymentID != "" { + return deploymentID, false, nil + } + + // Fall back to reading lineage from resources.json in the workspace. + migrated := false + deploymentID, err = readLineageFromResourcesJSON(ctx, f) + if err != nil { + return "", false, err + } + if deploymentID != "" { + migrated = true + } else { + // Fresh deployment: generate a new ID. + deploymentID = uuid.New().String() + } + + // Persist the deployment ID to the workspace for future deployments. + err = f.Write(ctx, deploymentIDFilename, strings.NewReader(deploymentID), filer.CreateParentDirectories, filer.OverwriteIfExists) + if err != nil { + return "", false, fmt.Errorf("failed to write %s to workspace: %w", deploymentIDFilename, err) + } + + return deploymentID, migrated, nil +} + +// readDeploymentIDFile reads the _deployment_id file from the workspace. +// Returns ("", nil) if the file does not exist. +func readDeploymentIDFile(ctx context.Context, f filer.Filer) (string, error) { + reader, err := f.Read(ctx, deploymentIDFilename) + if errors.Is(err, fs.ErrNotExist) { + return "", nil + } + if err != nil { + return "", fmt.Errorf("failed to read %s from workspace: %w", deploymentIDFilename, err) + } + defer reader.Close() + + data, err := io.ReadAll(reader) + if err != nil { + return "", fmt.Errorf("failed to read %s content: %w", deploymentIDFilename, err) + } + + id := strings.TrimSpace(string(data)) + if id == "" { + log.Warnf(ctx, "Found empty %s in workspace, falling back to resources.json", deploymentIDFilename) + return "", nil + } + return id, nil +} + +// readLineageFromResourcesJSON reads the lineage field from resources.json in the workspace. +// Returns ("", nil) if the file does not exist or has no lineage. +func readLineageFromResourcesJSON(ctx context.Context, f filer.Filer) (string, error) { + reader, err := f.Read(ctx, "resources.json") + if errors.Is(err, fs.ErrNotExist) { + return "", nil + } + if err != nil { + return "", fmt.Errorf("failed to read resources.json from workspace: %w", err) + } + defer reader.Close() + + data, err := io.ReadAll(reader) + if err != nil { + return "", fmt.Errorf("failed to read resources.json content: %w", err) + } + + var db dstate.Database + if err := json.Unmarshal(data, &db); err != nil { + return "", fmt.Errorf("failed to parse resources.json: %w", err) + } + return db.Lineage, nil +} + +// backupResourcesJSON renames resources.json to resources.json.backup in the +// workspace state directory. This is called after the first successful DMS +// deploy to indicate that _deployment_id is now the source of truth. +func backupResourcesJSON(ctx context.Context, b *bundle.Bundle) { + f, err := deploy.StateFiler(b) + if err != nil { + log.Warnf(ctx, "Failed to back up resources.json: %v", err) + return + } + + reader, err := f.Read(ctx, "resources.json") + if errors.Is(err, fs.ErrNotExist) { + return + } + if err != nil { + log.Warnf(ctx, "Failed to read resources.json for backup: %v", err) + return + } + + err = f.Write(ctx, "resources.json.backup", reader, filer.OverwriteIfExists) + if err != nil { + log.Warnf(ctx, "Failed to write resources.json.backup: %v", err) + return + } + + err = f.Delete(ctx, "resources.json") + if err != nil { + log.Warnf(ctx, "Failed to delete resources.json after backup: %v", err) + } } // makeOperationReporter returns an OperationReporter that reports each resource diff --git a/bundle/statemgmt/state_push.go b/bundle/statemgmt/state_push.go index 5774a0b5fe..a67cb6de05 100644 --- a/bundle/statemgmt/state_push.go +++ b/bundle/statemgmt/state_push.go @@ -8,12 +8,20 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/deploy" + "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" ) // PushResourcesState uploads the local state file to the remote location. +// When the deployment metadata service is enabled, this is a no-op because +// operation results are reported inline during deployment. func PushResourcesState(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { + if useDMS, _ := env.ManagedState(ctx); useDMS == "true" { + log.Debugf(ctx, "Skipping state push: using deployment metadata service") + return + } + mgr := NewStateManager(b, engine) if err := mgr.Push(ctx); err != nil { logdiag.LogError(ctx, err) From d4c032aea7d5c6ce372406d7106d5d9b1c927104 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 8 Apr 2026 22:53:45 +0000 Subject: [PATCH 22/25] Add OCC, initial registration, backup state push, revert Manager interface - Inline deployment version in plan for optimistic concurrency control - Separate initial registration phase before CRUD operations - Push state to resources.json.backup when DMS is enabled - Revert Manager interface, keep simple inline branching - Rename deploymentMetadataState to deploymentMetadata - Add sequential-deploys acceptance test and unit tests Co-authored-by: Isaac --- acceptance/bundle/dms/deploy-error/output.txt | 1 + .../bundle/dms/initial-register/output.txt | 1 + acceptance/bundle/dms/output.txt | 1 + .../bundle/dms/release-lock-error/output.txt | 1 + .../dms/sequential-deploys/databricks.yml | 7 + .../dms/sequential-deploys/out.test.toml | 6 + .../bundle/dms/sequential-deploys/output.txt | 135 ++++++++++++++++++ .../bundle/dms/sequential-deploys/script | 7 + .../lock/deployment_metadata_service.go | 131 ++++++++--------- .../lock/deployment_metadata_service_test.go | 71 +++++++++ bundle/deployplan/plan.go | 6 + bundle/direct/bundle_apply.go | 28 ++-- bundle/direct/bundle_plan.go | 5 + bundle/direct/pkg.go | 17 +++ bundle/statemgmt/check_running_resources.go | 19 ++- bundle/statemgmt/resourcestate/manager.go | 12 -- bundle/statemgmt/state_load.go | 19 ++- bundle/statemgmt/state_manager.go | 74 ---------- bundle/statemgmt/state_push.go | 45 +++++- libs/testserver/deployment_metadata.go | 8 +- libs/testserver/fake_workspace.go | 4 +- 21 files changed, 407 insertions(+), 191 deletions(-) create mode 100644 acceptance/bundle/dms/sequential-deploys/databricks.yml create mode 100644 acceptance/bundle/dms/sequential-deploys/out.test.toml create mode 100644 acceptance/bundle/dms/sequential-deploys/output.txt create mode 100644 acceptance/bundle/dms/sequential-deploys/script create mode 100644 bundle/deploy/lock/deployment_metadata_service_test.go delete mode 100644 bundle/statemgmt/resourcestate/manager.go delete mode 100644 bundle/statemgmt/state_manager.go diff --git a/acceptance/bundle/dms/deploy-error/output.txt b/acceptance/bundle/dms/deploy-error/output.txt index 3f8400a75e..db0a3d43e1 100644 --- a/acceptance/bundle/dms/deploy-error/output.txt +++ b/acceptance/bundle/dms/deploy-error/output.txt @@ -9,6 +9,7 @@ HTTP Status: 400 Bad Request API error_code: INVALID_PARAMETER_VALUE API message: Invalid job configuration. +Updating deployment state... >>> print_requests.py --get //bundle { diff --git a/acceptance/bundle/dms/initial-register/output.txt b/acceptance/bundle/dms/initial-register/output.txt index 5a48a1b5cf..b48d0b6630 100644 --- a/acceptance/bundle/dms/initial-register/output.txt +++ b/acceptance/bundle/dms/initial-register/output.txt @@ -8,6 +8,7 @@ Deployment complete! >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/initial-register-test/default/files... Deploying resources... +Updating deployment state... Deployment complete! >>> print_requests.py --get //bundle diff --git a/acceptance/bundle/dms/output.txt b/acceptance/bundle/dms/output.txt index 7c63c5a89d..536622a866 100644 --- a/acceptance/bundle/dms/output.txt +++ b/acceptance/bundle/dms/output.txt @@ -2,6 +2,7 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/metadata-service-test/default/files... Deploying resources... +Updating deployment state... Deployment complete! >>> print_requests.py --get //bundle diff --git a/acceptance/bundle/dms/release-lock-error/output.txt b/acceptance/bundle/dms/release-lock-error/output.txt index cc08c3326e..253593e8d9 100644 --- a/acceptance/bundle/dms/release-lock-error/output.txt +++ b/acceptance/bundle/dms/release-lock-error/output.txt @@ -2,6 +2,7 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/dms-release-lock-error/fail-complete/files... Deploying resources... +Updating deployment state... Deployment complete! Warn: Failed to release deployment lock: complete version: simulated complete version failure diff --git a/acceptance/bundle/dms/sequential-deploys/databricks.yml b/acceptance/bundle/dms/sequential-deploys/databricks.yml new file mode 100644 index 0000000000..0d7c1fb63b --- /dev/null +++ b/acceptance/bundle/dms/sequential-deploys/databricks.yml @@ -0,0 +1,7 @@ +bundle: + name: sequential-deploys-test + +resources: + jobs: + test_job: + name: test-job diff --git a/acceptance/bundle/dms/sequential-deploys/out.test.toml b/acceptance/bundle/dms/sequential-deploys/out.test.toml new file mode 100644 index 0000000000..6ce208a048 --- /dev/null +++ b/acceptance/bundle/dms/sequential-deploys/out.test.toml @@ -0,0 +1,6 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] + DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] diff --git a/acceptance/bundle/dms/sequential-deploys/output.txt b/acceptance/bundle/dms/sequential-deploys/output.txt new file mode 100644 index 0000000000..35dbc6626f --- /dev/null +++ b/acceptance/bundle/dms/sequential-deploys/output.txt @@ -0,0 +1,135 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/sequential-deploys-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/sequential-deploys-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/sequential-deploys-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> print_requests.py --get //bundle +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "1" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": "VERSION_TYPE_DEPLOY", + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/operations", + "q": { + "resource_key": "resources.jobs.test_job" + }, + "body": { + "resource_key": "resources.jobs.test_job", + "action_type": "OPERATION_ACTION_TYPE_CREATE", + "resource_id": "[NUMID]", + "status": "OPERATION_STATUS_SUCCEEDED" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/1/complete", + "body": { + "name": "deployments/[UUID]/versions/1", + "completion_reason": "VERSION_COMPLETE_SUCCESS" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "2" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": "VERSION_TYPE_DEPLOY", + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/2/complete", + "body": { + "name": "deployments/[UUID]/versions/2", + "completion_reason": "VERSION_COMPLETE_SUCCESS" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments", + "q": { + "deployment_id": "[UUID]" + }, + "body": { + "target_name": "default" + } +} +{ + "method": "GET", + "path": "/api/2.0/bundle/deployments/[UUID]" +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions", + "q": { + "version_id": "3" + }, + "body": { + "cli_version": "[DEV_VERSION]", + "version_type": "VERSION_TYPE_DEPLOY", + "target_name": "default" + } +} +{ + "method": "POST", + "path": "/api/2.0/bundle/deployments/[UUID]/versions/3/complete", + "body": { + "name": "deployments/[UUID]/versions/3", + "completion_reason": "VERSION_COMPLETE_SUCCESS" + } +} diff --git a/acceptance/bundle/dms/sequential-deploys/script b/acceptance/bundle/dms/sequential-deploys/script new file mode 100644 index 0000000000..42850217b8 --- /dev/null +++ b/acceptance/bundle/dms/sequential-deploys/script @@ -0,0 +1,7 @@ +# Deploy three times in sequence to verify version numbers increment. +trace $CLI bundle deploy +trace $CLI bundle deploy +trace $CLI bundle deploy + +# Print metadata service requests. Version IDs should be 1, 2, 3. +trace print_requests.py --get //bundle diff --git a/bundle/deploy/lock/deployment_metadata_service.go b/bundle/deploy/lock/deployment_metadata_service.go index fdcc4179c5..23d3e540d5 100644 --- a/bundle/deploy/lock/deployment_metadata_service.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -37,10 +37,6 @@ type metadataServiceLock struct { stopHeartbeat func() - // migratedFromResourcesJSON is true when the deployment ID was seeded from - // resources.json lineage (first DMS deploy). On successful release, this - // triggers backing up resources.json to resources.json.backup. - migratedFromResourcesJSON bool } func newMetadataServiceLock(b *bundle.Bundle, versionType tmpdms.VersionType) *metadataServiceLock { @@ -54,18 +50,27 @@ func (l *metadataServiceLock) Acquire(ctx context.Context) error { } l.svc = svc - deploymentID, migrated, versionID, err := acquireLock(ctx, l.b, svc, l.versionType) + deploymentID, versionID, lastVersionID, err := acquireLock(ctx, l.b, svc, l.versionType) if err != nil { return err } l.deploymentID = deploymentID l.versionID = versionID - l.migratedFromResourcesJSON = migrated l.stopHeartbeat = startHeartbeat(ctx, svc, deploymentID, versionID) - initialRegistration := versionID == "1" - l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID, initialRegistration) + // Store the last version ID on the bundle so that the plan can record + // which deployment version it was computed against (OCC). + l.b.DeploymentBundle.DeploymentVersion = lastVersionID + + // Set the initial registration reporter if this is the first DMS deployment. + // This will be called after the state DB is loaded (during plan computation) + // to register all existing resources before any CRUD operations. + if versionID == "1" { + l.b.DeploymentBundle.InitialRegistrationReporter = makeInitialRegistrationReporter(svc, deploymentID, versionID) + } + + l.b.DeploymentBundle.OperationReporter = makeOperationReporter(svc, deploymentID, versionID) return nil } @@ -94,12 +99,6 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) } - // On the first DMS deploy (migrated from resources.json), back up - // resources.json so users can revert to non-DMS deployment if needed. - if completeErr == nil && status == DeploymentSuccess && l.migratedFromResourcesJSON && l.versionType != tmpdms.VersionTypeDestroy { - backupResourcesJSON(cleanupCtx, l.b) - } - // For destroy operations, delete the deployment record after // successfully releasing the lock. if status == DeploymentSuccess && l.versionType == tmpdms.VersionTypeDestroy { @@ -116,10 +115,12 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat // acquireLock implements the lock acquisition protocol using the deployment // metadata service: resolve deployment ID, ensure deployment, create version. -func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID string, migrated bool, versionID string, err error) { - deploymentID, migrated, err = resolveDeploymentID(ctx, b) +// The returned lastVersionID is the deployment's last_version_id before this +// version was created (empty string for brand-new deployments). +func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMetadataAPI, versionType tmpdms.VersionType) (deploymentID string, versionID string, lastVersionID string, err error) { + deploymentID, err = resolveDeploymentID(ctx, b) if err != nil { - return "", false, "", err + return "", "", "", err } // Ensure the deployment exists in the metadata service. @@ -130,7 +131,7 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe }, }) if createErr != nil && !isAlreadyExists(createErr) { - return "", false, "", fmt.Errorf("failed to create deployment: %w", createErr) + return "", "", "", fmt.Errorf("failed to create deployment: %w", createErr) } // Get the deployment to determine the next version ID. @@ -138,15 +139,16 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe DeploymentID: deploymentID, }) if getErr != nil { - return "", false, "", fmt.Errorf("failed to get deployment: %w", getErr) + return "", "", "", fmt.Errorf("failed to get deployment: %w", getErr) } - if dep.LastVersionID == "" { + lastVersionID = dep.LastVersionID + if lastVersionID == "" { versionID = "1" } else { - lastVersion, parseErr := strconv.ParseInt(dep.LastVersionID, 10, 64) + lastVersion, parseErr := strconv.ParseInt(lastVersionID, 10, 64) if parseErr != nil { - return "", false, "", fmt.Errorf("failed to parse last_version_id %q: %w", dep.LastVersionID, parseErr) + return "", "", "", fmt.Errorf("failed to parse last_version_id %q: %w", lastVersionID, parseErr) } versionID = strconv.FormatInt(lastVersion+1, 10) } @@ -163,11 +165,11 @@ func acquireLock(ctx context.Context, b *bundle.Bundle, svc *tmpdms.DeploymentMe }, }) if versionErr != nil { - return "", false, "", fmt.Errorf("failed to acquire deployment lock: %w", versionErr) + return "", "", "", fmt.Errorf("failed to acquire deployment lock: %w", versionErr) } log.Infof(ctx, "Acquired deployment lock: deployment=%s version=%s", deploymentID, version.VersionID) - return deploymentID, migrated, versionID, nil + return deploymentID, versionID, lastVersionID, nil } const deploymentIDFilename = "_deployment_id" @@ -176,31 +178,27 @@ const deploymentIDFilename = "_deployment_id" // It first checks for a _deployment_id file. If not found, it falls back to // reading the lineage from resources.json (for first-time DMS migration). // The resolved ID is written to _deployment_id in the workspace for future use. -// The returned bool indicates whether the ID was migrated from resources.json. -func resolveDeploymentID(ctx context.Context, b *bundle.Bundle) (string, bool, error) { +func resolveDeploymentID(ctx context.Context, b *bundle.Bundle) (string, error) { f, err := deploy.StateFiler(b) if err != nil { - return "", false, fmt.Errorf("failed to create state filer: %w", err) + return "", fmt.Errorf("failed to create state filer: %w", err) } // Try reading _deployment_id from the workspace state directory. deploymentID, err := readDeploymentIDFile(ctx, f) if err != nil { - return "", false, err + return "", err } if deploymentID != "" { - return deploymentID, false, nil + return deploymentID, nil } // Fall back to reading lineage from resources.json in the workspace. - migrated := false deploymentID, err = readLineageFromResourcesJSON(ctx, f) if err != nil { - return "", false, err + return "", err } - if deploymentID != "" { - migrated = true - } else { + if deploymentID == "" { // Fresh deployment: generate a new ID. deploymentID = uuid.New().String() } @@ -208,10 +206,10 @@ func resolveDeploymentID(ctx context.Context, b *bundle.Bundle) (string, bool, e // Persist the deployment ID to the workspace for future deployments. err = f.Write(ctx, deploymentIDFilename, strings.NewReader(deploymentID), filer.CreateParentDirectories, filer.OverwriteIfExists) if err != nil { - return "", false, fmt.Errorf("failed to write %s to workspace: %w", deploymentIDFilename, err) + return "", fmt.Errorf("failed to write %s to workspace: %w", deploymentIDFilename, err) } - return deploymentID, migrated, nil + return deploymentID, nil } // readDeploymentIDFile reads the _deployment_id file from the workspace. @@ -263,40 +261,32 @@ func readLineageFromResourcesJSON(ctx context.Context, f filer.Filer) (string, e return db.Lineage, nil } -// backupResourcesJSON renames resources.json to resources.json.backup in the -// workspace state directory. This is called after the first successful DMS -// deploy to indicate that _deployment_id is now the source of truth. -func backupResourcesJSON(ctx context.Context, b *bundle.Bundle) { - f, err := deploy.StateFiler(b) - if err != nil { - log.Warnf(ctx, "Failed to back up resources.json: %v", err) - return - } - - reader, err := f.Read(ctx, "resources.json") - if errors.Is(err, fs.ErrNotExist) { - return - } - if err != nil { - log.Warnf(ctx, "Failed to read resources.json for backup: %v", err) - return - } - - err = f.Write(ctx, "resources.json.backup", reader, filer.OverwriteIfExists) - if err != nil { - log.Warnf(ctx, "Failed to write resources.json.backup: %v", err) - return - } - - err = f.Delete(ctx, "resources.json") - if err != nil { - log.Warnf(ctx, "Failed to delete resources.json after backup: %v", err) +// makeInitialRegistrationReporter returns a reporter that registers a single +// existing resource with the deployment metadata service. +func makeInitialRegistrationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.InitialRegistrationReporter { + return func(ctx context.Context, resourceKey, resourceID string) error { + _, err := svc.CreateOperation(ctx, tmpdms.CreateOperationRequest{ + DeploymentID: deploymentID, + VersionID: versionID, + Parent: fmt.Sprintf("deployments/%s/versions/%s", deploymentID, versionID), + ResourceKey: resourceKey, + Operation: &tmpdms.Operation{ + ResourceKey: resourceKey, + ResourceID: resourceID, + Status: tmpdms.OperationStatusSucceeded, + ActionType: tmpdms.OperationActionTypeInitRegister, + }, + }) + if err != nil { + return fmt.Errorf("registering existing resource %s: %w", resourceKey, err) + } + return nil } } // makeOperationReporter returns an OperationReporter that reports each resource // operation (success or failure) to the deployment metadata service. -func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string, initialRegistration bool) direct.OperationReporter { +func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, versionID string) direct.OperationReporter { return func( ctx context.Context, resourceKey string, @@ -304,7 +294,7 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers action deployplan.ActionType, operationErr error, ) error { - actionType, err := planActionToOperationAction(action, initialRegistration) + actionType, err := planActionToOperationAction(action) if err != nil { return fmt.Errorf("mapping action for resource %s: %w", resourceKey, err) } @@ -340,14 +330,11 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers } // planActionToOperationAction maps a deploy plan action to a metadata service -// operation action type. It returns an error for actions that are not supported -// by the backend. No-op actions like Skip return ("", nil) and should be ignored. -func planActionToOperationAction(action deployplan.ActionType, initialRegistration bool) (tmpdms.OperationActionType, error) { +// operation action type. No-op actions like Skip return ("", nil) and should +// be ignored. Initial registration is handled separately by registerExistingResources. +func planActionToOperationAction(action deployplan.ActionType) (tmpdms.OperationActionType, error) { switch action { case deployplan.Skip: - if initialRegistration { - return tmpdms.OperationActionTypeInitRegister, nil - } return "", nil case deployplan.Create: return tmpdms.OperationActionTypeCreate, nil diff --git a/bundle/deploy/lock/deployment_metadata_service_test.go b/bundle/deploy/lock/deployment_metadata_service_test.go new file mode 100644 index 0000000000..433d403a64 --- /dev/null +++ b/bundle/deploy/lock/deployment_metadata_service_test.go @@ -0,0 +1,71 @@ +package lock + +import ( + "net/http" + "testing" + + "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/libs/tmpdms" + "github.com/databricks/databricks-sdk-go/apierr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPlanActionToOperationAction(t *testing.T) { + tests := []struct { + action deployplan.ActionType + expected tmpdms.OperationActionType + isNoop bool + }{ + {deployplan.Skip, "", true}, + {deployplan.Create, tmpdms.OperationActionTypeCreate, false}, + {deployplan.Update, tmpdms.OperationActionTypeUpdate, false}, + {deployplan.UpdateWithID, tmpdms.OperationActionTypeUpdateWithID, false}, + {deployplan.Delete, tmpdms.OperationActionTypeDelete, false}, + {deployplan.Recreate, tmpdms.OperationActionTypeRecreate, false}, + {deployplan.Resize, tmpdms.OperationActionTypeResize, false}, + } + + for _, tt := range tests { + t.Run(string(tt.action), func(t *testing.T) { + result, err := planActionToOperationAction(tt.action) + require.NoError(t, err) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestPlanActionToOperationActionSkipIsNoop(t *testing.T) { + result, err := planActionToOperationAction(deployplan.Skip) + require.NoError(t, err) + assert.Equal(t, tmpdms.OperationActionType(""), result) +} + +func TestPlanActionToOperationActionUnsupported(t *testing.T) { + _, err := planActionToOperationAction("unknown_action") + assert.ErrorContains(t, err, "unsupported operation action type") +} + +func TestIsAlreadyExists(t *testing.T) { + assert.True(t, isAlreadyExists(&apierr.APIError{StatusCode: http.StatusConflict})) + assert.False(t, isAlreadyExists(&apierr.APIError{StatusCode: http.StatusNotFound})) + assert.False(t, isAlreadyExists(&apierr.APIError{StatusCode: http.StatusInternalServerError})) + assert.False(t, isAlreadyExists(assert.AnError)) + assert.False(t, isAlreadyExists(nil)) +} + +func TestGoalToVersionType(t *testing.T) { + vt, ok := goalToVersionType(GoalDeploy) + assert.True(t, ok) + assert.Equal(t, tmpdms.VersionTypeDeploy, vt) + + vt, ok = goalToVersionType(GoalDestroy) + assert.True(t, ok) + assert.Equal(t, tmpdms.VersionTypeDestroy, vt) + + _, ok = goalToVersionType(GoalBind) + assert.False(t, ok) + + _, ok = goalToVersionType(GoalUnbind) + assert.False(t, ok) +} diff --git a/bundle/deployplan/plan.go b/bundle/deployplan/plan.go index e0dcd9b288..e7eae034b1 100644 --- a/bundle/deployplan/plan.go +++ b/bundle/deployplan/plan.go @@ -22,6 +22,12 @@ type Plan struct { Serial int `json:"serial,omitempty"` Plan map[string]*PlanEntry `json:"plan,omitzero"` + // DeploymentVersion is the DMS deployment version that this plan was + // computed against. Used for optimistic concurrency control: the plan + // can only be applied when the deployment's last_version_id still equals + // this value, ensuring no other deployment happened in between. + DeploymentVersion string `json:"deployment_version,omitempty"` + mutex sync.Mutex `json:"-"` lockmap lockmap `json:"-"` } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index bad6701a6a..79211da2fa 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -26,6 +26,20 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } b.StateDB.AssertOpened() + + // On the first DMS deployment, register all existing resources from state + // before any CRUD operations. This is a separate phase so that registration + // errors are treated as deployment failures. + if b.InitialRegistrationReporter != nil { + for resourceKey, entry := range b.StateDB.Data.State { + if err := b.InitialRegistrationReporter(ctx, resourceKey, entry.ID); err != nil { + logdiag.LogError(ctx, err) + return + } + } + b.InitialRegistrationReporter = nil + } + b.RemoteStateCache.Clear() g, err := makeGraph(plan) @@ -109,20 +123,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } // We don't keep NewState around for 'skip' nodes - - // Report skip actions to the metadata service. On initial registration, - // these are recorded as INITIAL_REGISTER operations. - if action == deployplan.Skip && b.OperationReporter != nil { - var resourceID string - if dbentry, ok := b.StateDB.GetResourceEntry(resourceKey); ok { - resourceID = dbentry.ID - } - if reportErr := b.OperationReporter(ctx, resourceKey, resourceID, action, nil); reportErr != nil { - logdiag.LogError(ctx, fmt.Errorf("%s: failed to report operation: %w", errorPrefix, reportErr)) - return false - } - } - if action != deployplan.Skip { if !b.resolveReferences(ctx, resourceKey, entry, errorPrefix, false) { return false diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 8bd2b4bfd1..96f5bf013f 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -126,6 +126,11 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks return nil, fmt.Errorf("reading config: %w", err) } + // Record the DMS deployment version for optimistic concurrency control. + // When applying a pre-computed plan, this version is validated against + // the current deployment state to detect stale plans. + plan.DeploymentVersion = b.DeploymentVersion + b.Plan = plan g, err := makeGraph(plan) diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 34ada4fd56..a005d61a93 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -49,6 +49,15 @@ type OperationReporter func( operationErr error, ) error +// InitialRegistrationReporter registers a single existing resource with the +// metadata service during the first DMS deployment. Called for each resource +// in the state DB before any CRUD operations. +type InitialRegistrationReporter func( + ctx context.Context, + resourceKey string, + resourceID string, +) error + // DeploymentBundle holds everything needed to deploy a bundle type DeploymentBundle struct { StateDB dstate.DeploymentState @@ -57,9 +66,17 @@ type DeploymentBundle struct { RemoteStateCache sync.Map StateCache structvar.Cache + // DeploymentVersion is the DMS last_version_id before the current version + // was created. Set during lock acquisition and copied to the plan for OCC. + DeploymentVersion string + // OperationReporter, when set, is called inline after each successful // resource Create/Update/Delete to report the operation to the metadata service. OperationReporter OperationReporter + + // InitialRegistrationReporter, when set, is called during the first DMS + // deployment to register all existing resources from state before CRUD. + InitialRegistrationReporter InitialRegistrationReporter } // SetRemoteState updates the remote state with type validation and marks as fresh. diff --git a/bundle/statemgmt/check_running_resources.go b/bundle/statemgmt/check_running_resources.go index af80bf42cc..7108c9b542 100644 --- a/bundle/statemgmt/check_running_resources.go +++ b/bundle/statemgmt/check_running_resources.go @@ -8,6 +8,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/libs/diag" "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/service/jobs" @@ -37,10 +38,20 @@ func (l *checkRunningResources) Apply(ctx context.Context, b *bundle.Bundle) dia return nil } - mgr := NewStateManager(b, l.engine) - state, err := mgr.Read(ctx) - if err != nil { - return diag.FromErr(err) + var err error + var state ExportedResourcesMap + + if l.engine.IsDirect() { + _, fullPathDirect := b.StateFilenameDirect(ctx) + state, err = b.DeploymentBundle.ExportState(ctx, fullPathDirect) + if err != nil { + return diag.FromErr(err) + } + } else { + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + return diag.FromErr(err) + } } w := b.WorkspaceClient() diff --git a/bundle/statemgmt/resourcestate/manager.go b/bundle/statemgmt/resourcestate/manager.go deleted file mode 100644 index d12e2937ca..0000000000 --- a/bundle/statemgmt/resourcestate/manager.go +++ /dev/null @@ -1,12 +0,0 @@ -package resourcestate - -import "context" - -// Manager provides read and write access to deployment resource state. -type Manager interface { - // Read returns the current resource state as a map of resource keys to their state. - Read(ctx context.Context) (ExportedResourcesMap, error) - - // Push uploads local state to the remote workspace location. - Push(ctx context.Context) error -} diff --git a/bundle/statemgmt/state_load.go b/bundle/statemgmt/state_load.go index 12360b14e4..c0dfe45c97 100644 --- a/bundle/statemgmt/state_load.go +++ b/bundle/statemgmt/state_load.go @@ -11,6 +11,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" @@ -34,10 +35,20 @@ func (l *load) Name() string { } func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - mgr := NewStateManager(b, l.engine) - state, err := mgr.Read(ctx) - if err != nil { - return diag.FromErr(err) + var err error + var state ExportedResourcesMap + + if l.engine.IsDirect() { + _, fullPathDirect := b.StateFilenameDirect(ctx) + state, err = b.DeploymentBundle.ExportState(ctx, fullPathDirect) + if err != nil { + return diag.FromErr(err) + } + } else { + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + return diag.FromErr(err) + } } err = l.validateState(state) diff --git a/bundle/statemgmt/state_manager.go b/bundle/statemgmt/state_manager.go deleted file mode 100644 index b1a23fb262..0000000000 --- a/bundle/statemgmt/state_manager.go +++ /dev/null @@ -1,74 +0,0 @@ -package statemgmt - -import ( - "context" - "errors" - "io/fs" - "os" - - "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/config/engine" - "github.com/databricks/cli/bundle/deploy" - "github.com/databricks/cli/bundle/deploy/terraform" - "github.com/databricks/cli/bundle/statemgmt/resourcestate" - "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/filer" - "github.com/databricks/cli/libs/log" -) - -// NewStateManager returns a Manager implementation for the given engine type. -func NewStateManager(b *bundle.Bundle, e engine.EngineType) resourcestate.Manager { - if e.IsDirect() { - return &directStateManager{b: b} - } - return &terraformStateManager{b: b} -} - -type directStateManager struct { - b *bundle.Bundle -} - -func (m *directStateManager) Read(ctx context.Context) (ExportedResourcesMap, error) { - _, fullPath := m.b.StateFilenameDirect(ctx) - return m.b.DeploymentBundle.ExportState(ctx, fullPath) -} - -func (m *directStateManager) Push(ctx context.Context) error { - remotePath, localPath := m.b.StateFilenameDirect(ctx) - return pushLocalState(ctx, m.b, remotePath, localPath) -} - -type terraformStateManager struct { - b *bundle.Bundle -} - -func (m *terraformStateManager) Read(ctx context.Context) (ExportedResourcesMap, error) { - return terraform.ParseResourcesState(ctx, m.b) -} - -func (m *terraformStateManager) Push(ctx context.Context) error { - remotePath, localPath := m.b.StateFilenameTerraform(ctx) - return pushLocalState(ctx, m.b, remotePath, localPath) -} - -func pushLocalState(ctx context.Context, b *bundle.Bundle, remotePath, localPath string) error { - f, err := deploy.StateFiler(b) - if err != nil { - return err - } - - local, err := os.Open(localPath) - if errors.Is(err, fs.ErrNotExist) { - // The state file can be absent if terraform apply is skipped because - // there are no changes to apply in the plan. - log.Debugf(ctx, "Local state file does not exist: %s", localPath) - return nil - } - if err != nil { - return err - } - defer local.Close() - - cmdio.LogString(ctx, "Updating deployment state...") - return f.Write(ctx, remotePath, local, filer.CreateParentDirectories, filer.OverwriteIfExists) -} diff --git a/bundle/statemgmt/state_push.go b/bundle/statemgmt/state_push.go index a67cb6de05..13f2926369 100644 --- a/bundle/statemgmt/state_push.go +++ b/bundle/statemgmt/state_push.go @@ -4,26 +4,61 @@ import ( "context" "errors" "io/fs" + "os" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/deploy" "github.com/databricks/cli/bundle/env" + "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/filer" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" ) // PushResourcesState uploads the local state file to the remote location. -// When the deployment metadata service is enabled, this is a no-op because -// operation results are reported inline during deployment. +// When the deployment metadata service is enabled, the state is written to a +// backup path (resources.json.backup) instead. This gives users a safety net +// to revert from DMS to file-based state if needed. func PushResourcesState(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { + f, err := deploy.StateFiler(b) + if err != nil { + logdiag.LogError(ctx, err) + return + } + + var remotePath, localPath string + + if engine.IsDirect() { + remotePath, localPath = b.StateFilenameDirect(ctx) + } else { + remotePath, localPath = b.StateFilenameTerraform(ctx) + } + + // When DMS is enabled, write state to a backup path instead of the + // primary path. The DMS is the source of truth for resource state, + // but we keep a backup for rollback purposes. if useDMS, _ := env.ManagedState(ctx); useDMS == "true" { - log.Debugf(ctx, "Skipping state push: using deployment metadata service") + remotePath += ".backup" + } + + local, err := os.Open(localPath) + if errors.Is(err, fs.ErrNotExist) { + // The state file can be absent if terraform apply is skipped because + // there are no changes to apply in the plan. + log.Debugf(ctx, "Local state file does not exist: %s", localPath) + return + } + if err != nil { + logdiag.LogError(ctx, err) return } + defer local.Close() - mgr := NewStateManager(b, engine) - if err := mgr.Push(ctx); err != nil { + // Upload state file from local cache directory to filer. + cmdio.LogString(ctx, "Updating deployment state...") + err = f.Write(ctx, remotePath, local, filer.CreateParentDirectories, filer.OverwriteIfExists) + if err != nil { logdiag.LogError(ctx, err) } } diff --git a/libs/testserver/deployment_metadata.go b/libs/testserver/deployment_metadata.go index ef66507cd0..1554712c4e 100644 --- a/libs/testserver/deployment_metadata.go +++ b/libs/testserver/deployment_metadata.go @@ -11,9 +11,9 @@ import ( "github.com/databricks/cli/libs/tmpdms" ) -// deploymentMetadataState holds in-memory state for the deployment metadata service. +// deploymentMetadata holds in-memory state for the deployment metadata service. // Stored per-workspace inside FakeWorkspace. -type deploymentMetadataState struct { +type deploymentMetadata struct { // deployments keyed by deployment_id deployments map[string]tmpdms.Deployment @@ -31,8 +31,8 @@ type deploymentMetadataState struct { lockExpiry map[string]time.Time // deploymentId -> expiry time } -func newDeploymentMetadataState() *deploymentMetadataState { - return &deploymentMetadataState{ +func newDeploymentMetadata() *deploymentMetadata { + return &deploymentMetadata{ deployments: map[string]tmpdms.Deployment{}, versions: map[string]tmpdms.Version{}, operations: map[string]tmpdms.Operation{}, diff --git a/libs/testserver/fake_workspace.go b/libs/testserver/fake_workspace.go index a2462c4c6d..64faaf78e2 100644 --- a/libs/testserver/fake_workspace.go +++ b/libs/testserver/fake_workspace.go @@ -174,7 +174,7 @@ type FakeWorkspace struct { // matching cloud behavior where libraries are cached on running clusters. clusterVenvs map[string]*clusterEnv - deploymentMetadata *deploymentMetadataState + deploymentMetadata *deploymentMetadata } func (s *FakeWorkspace) LockUnlock() func() { @@ -299,7 +299,7 @@ func NewFakeWorkspace(url, token string) *FakeWorkspace { PostgresEndpoints: map[string]postgres.Endpoint{}, PostgresOperations: map[string]postgres.Operation{}, clusterVenvs: map[string]*clusterEnv{}, - deploymentMetadata: newDeploymentMetadataState(), + deploymentMetadata: newDeploymentMetadata(), Alerts: map[string]sql.AlertV2{}, Experiments: map[string]ml.GetExperimentResponse{}, ModelRegistryModels: map[string]ml.Model{}, From defda067bc40a497cecb0b7dbb2e0a08b19afd36 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Wed, 8 Apr 2026 23:57:13 +0000 Subject: [PATCH 23/25] Return error for force lock with DMS, fix stale comment and error message Co-authored-by: Isaac --- bundle/deploy/lock/deployment_metadata_service.go | 6 +++++- libs/tmpdms/api.go | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bundle/deploy/lock/deployment_metadata_service.go b/bundle/deploy/lock/deployment_metadata_service.go index 23d3e540d5..9df6d59f99 100644 --- a/bundle/deploy/lock/deployment_metadata_service.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -44,6 +44,10 @@ func newMetadataServiceLock(b *bundle.Bundle, versionType tmpdms.VersionType) *m } func (l *metadataServiceLock) Acquire(ctx context.Context) error { + if l.b.Config.Bundle.Deployment.Lock.Force { + return fmt.Errorf("force lock is not supported with the deployment metadata service") + } + svc, err := tmpdms.NewDeploymentMetadataAPI(l.b.WorkspaceClient()) if err != nil { return fmt.Errorf("failed to create metadata service client: %w", err) @@ -331,7 +335,7 @@ func makeOperationReporter(svc *tmpdms.DeploymentMetadataAPI, deploymentID, vers // planActionToOperationAction maps a deploy plan action to a metadata service // operation action type. No-op actions like Skip return ("", nil) and should -// be ignored. Initial registration is handled separately by registerExistingResources. +// be ignored. Initial registration is handled separately by makeInitialRegistrationReporter. func planActionToOperationAction(action deployplan.ActionType) (tmpdms.OperationActionType, error) { switch action { case deployplan.Skip: diff --git a/libs/tmpdms/api.go b/libs/tmpdms/api.go index fb233f15ae..b39590ad3f 100644 --- a/libs/tmpdms/api.go +++ b/libs/tmpdms/api.go @@ -154,8 +154,7 @@ func mapError(operation string, err error) error { switch apiErr.StatusCode { case http.StatusConflict: return fmt.Errorf("%s: deployment is locked by another active deployment. "+ - "If the prior deployment failed, the lock will expire automatically after 5 minutes. "+ - "You can also force-acquire the lock by running deploy with the --force-lock flag: %w", operation, err) + "If the prior deployment failed, the lock will expire automatically after 5 minutes: %w", operation, err) case http.StatusNotFound: return fmt.Errorf("%s: resource not found: %w", operation, err) case http.StatusBadRequest: From 83d4f17fb2da5e31d3f765052d5a1fb6db3e5c71 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 10 Apr 2026 12:07:21 +0000 Subject: [PATCH 24/25] Remove mapError, move liteswap transport to bundle, add test badness - Remove mapError from tmpdms/api.go; let SDK errors pass through directly - Move liteswap transport setup from Workspace.Client() to Bundle.WorkspaceClientE() where context is available - Fix RoundTrip to clone request before mutating headers - Revert Workspace.Client() signature to not require context - Add Badness line to DMS acceptance tests for future cloud enablement Co-authored-by: Isaac --- acceptance/bundle/dms/test.toml | 1 + bundle/bundle.go | 33 +++++++++++++++++++++++++-- bundle/config/workspace.go | 30 +------------------------ bundle/config/workspace_test.go | 18 +++++++-------- libs/tmpdms/api.go | 40 ++++++++------------------------- 5 files changed, 51 insertions(+), 71 deletions(-) diff --git a/acceptance/bundle/dms/test.toml b/acceptance/bundle/dms/test.toml index 4cebdfc83a..5d95b8d05d 100644 --- a/acceptance/bundle/dms/test.toml +++ b/acceptance/bundle/dms/test.toml @@ -1,3 +1,4 @@ +Badness = "Uses local test server; enable on cloud once the deployment metadata service is in production" EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] EnvMatrix.DATABRICKS_BUNDLE_MANAGED_STATE = ["true"] RecordRequests = true diff --git a/bundle/bundle.go b/bundle/bundle.go index 2d6e691886..0e703d9303 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -10,6 +10,7 @@ import ( "context" "errors" "fmt" + "net/http" "os" "path/filepath" "sync" @@ -18,6 +19,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/direct" "github.com/databricks/cli/bundle/env" + libsenv "github.com/databricks/cli/libs/env" "github.com/databricks/cli/bundle/metadata" "github.com/databricks/cli/libs/auth" "github.com/databricks/cli/libs/cache" @@ -228,9 +230,23 @@ func TryLoad(ctx context.Context) *Bundle { func (b *Bundle) WorkspaceClientE(ctx context.Context) (*databricks.WorkspaceClient, error) { b.clientOnce.Do(func() { var err error - b.client, err = b.Config.Workspace.Client(ctx) + b.client, err = b.Config.Workspace.Client() if err != nil { b.clientErr = fmt.Errorf("cannot resolve bundle auth configuration: %w", err) + return + } + + // If DATABRICKS_LITESWAP_ID is set, wrap the transport to inject the + // x-databricks-traffic-id header for routing to the liteswap instance. + if liteswapID := libsenv.Get(ctx, "DATABRICKS_LITESWAP_ID"); liteswapID != "" { + inner := b.client.Config.HTTPTransport + if inner == nil { + inner = http.DefaultTransport + } + b.client.Config.HTTPTransport = &liteswapTransport{ + inner: inner, + trafficID: "testenv://liteswap/" + liteswapID, + } } }) @@ -238,7 +254,7 @@ func (b *Bundle) WorkspaceClientE(ctx context.Context) (*databricks.WorkspaceCli } func (b *Bundle) WorkspaceClient() *databricks.WorkspaceClient { - client, err := b.WorkspaceClientE(context.TODO()) + client, err := b.WorkspaceClientE(context.Background()) if err != nil { panic(err) } @@ -246,6 +262,19 @@ func (b *Bundle) WorkspaceClient() *databricks.WorkspaceClient { return client } +// liteswapTransport injects the x-databricks-traffic-id header to route +// requests to a liteswap service instance. +type liteswapTransport struct { + inner http.RoundTripper + trafficID string +} + +func (t *liteswapTransport) RoundTrip(req *http.Request) (*http.Response, error) { + clone := req.Clone(req.Context()) + clone.Header.Set("x-databricks-traffic-id", t.trafficID) + return t.inner.RoundTrip(clone) +} + // SetWorkpaceClient sets the workspace client for this bundle. // This is used to inject a mock client for testing. func (b *Bundle) SetWorkpaceClient(w *databricks.WorkspaceClient) { diff --git a/bundle/config/workspace.go b/bundle/config/workspace.go index 608c8aab63..c699dc070b 100644 --- a/bundle/config/workspace.go +++ b/bundle/config/workspace.go @@ -1,14 +1,11 @@ package config import ( - "context" - "net/http" "os" "path/filepath" "github.com/databricks/cli/libs/auth" "github.com/databricks/cli/libs/databrickscfg" - "github.com/databricks/cli/libs/env" "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/config" "github.com/databricks/databricks-sdk-go/marshal" @@ -159,7 +156,7 @@ func (w *Workspace) NormalizeHostURL() { } } -func (w *Workspace) Client(ctx context.Context) (*databricks.WorkspaceClient, error) { +func (w *Workspace) Client() (*databricks.WorkspaceClient, error) { // Extract query parameters (?o=, ?a=) from the host URL before building // the SDK config. This ensures workspace_id and account_id are available // for profile resolution during EnsureResolved(). @@ -196,34 +193,9 @@ func (w *Workspace) Client(ctx context.Context) (*databricks.WorkspaceClient, er } } - // If DATABRICKS_LITESWAP_ID is set, wrap the transport to inject the - // x-databricks-traffic-id header for routing to the liteswap instance. - if liteswapID := env.Get(ctx, "DATABRICKS_LITESWAP_ID"); liteswapID != "" { - inner := cfg.HTTPTransport - if inner == nil { - inner = http.DefaultTransport - } - cfg.HTTPTransport = &liteswapTransport{ - inner: inner, - trafficID: "testenv://liteswap/" + liteswapID, - } - } - return databricks.NewWorkspaceClient((*databricks.Config)(cfg)) } -// liteswapTransport injects the x-databricks-traffic-id header to route -// requests to a liteswap service instance. -type liteswapTransport struct { - inner http.RoundTripper - trafficID string -} - -func (t *liteswapTransport) RoundTrip(req *http.Request) (*http.Response, error) { - req.Header.Set("x-databricks-traffic-id", t.trafficID) - return t.inner.RoundTrip(req) -} - func init() { arg0 := os.Args[0] diff --git a/bundle/config/workspace_test.go b/bundle/config/workspace_test.go index 4d87503a03..4181d17170 100644 --- a/bundle/config/workspace_test.go +++ b/bundle/config/workspace_test.go @@ -34,7 +34,7 @@ func TestWorkspaceResolveProfileFromHost(t *testing.T) { t.Run("no config file", func(t *testing.T) { setupWorkspaceTest(t) - _, err := w.Client(t.Context()) + _, err := w.Client() assert.NoError(t, err) }) @@ -49,7 +49,7 @@ func TestWorkspaceResolveProfileFromHost(t *testing.T) { }) require.NoError(t, err) - client, err := w.Client(t.Context()) + client, err := w.Client() assert.NoError(t, err) assert.Equal(t, "default", client.Config.Profile) }) @@ -67,7 +67,7 @@ func TestWorkspaceResolveProfileFromHost(t *testing.T) { require.NoError(t, err) t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(home, "customcfg")) - client, err := w.Client(t.Context()) + client, err := w.Client() assert.NoError(t, err) assert.Equal(t, "custom", client.Config.Profile) }) @@ -149,7 +149,7 @@ func TestWorkspaceClientNormalizesHostBeforeProfileResolution(t *testing.T) { w := Workspace{ Host: "https://spog.databricks.com/?o=222", } - client, err := w.Client(t.Context()) + client, err := w.Client() require.NoError(t, err) assert.Equal(t, "ws2", client.Config.Profile) } @@ -165,7 +165,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { t.Run("no config file", func(t *testing.T) { setupWorkspaceTest(t) - _, err := w.Client(t.Context()) + _, err := w.Client() assert.ErrorIs(t, err, fs.ErrNotExist) }) @@ -179,7 +179,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { }) require.NoError(t, err) - _, err = w.Client(t.Context()) + _, err = w.Client() assert.NoError(t, err) }) @@ -193,7 +193,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { }) require.NoError(t, err) - _, err = w.Client(t.Context()) + _, err = w.Client() assert.ErrorContains(t, err, "doesn’t match the host configured in the bundle") }) @@ -209,7 +209,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { require.NoError(t, err) t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(home, "customcfg")) - _, err = w.Client(t.Context()) + _, err = w.Client() assert.NoError(t, err) }) @@ -225,7 +225,7 @@ func TestWorkspaceVerifyProfileForHost(t *testing.T) { require.NoError(t, err) t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(home, "customcfg")) - _, err = w.Client(t.Context()) + _, err = w.Client() assert.ErrorContains(t, err, "doesn’t match the host configured in the bundle") }) } diff --git a/libs/tmpdms/api.go b/libs/tmpdms/api.go index b39590ad3f..a729553b03 100644 --- a/libs/tmpdms/api.go +++ b/libs/tmpdms/api.go @@ -2,12 +2,10 @@ package tmpdms import ( "context" - "errors" "fmt" "net/http" "github.com/databricks/databricks-sdk-go" - "github.com/databricks/databricks-sdk-go/apierr" "github.com/databricks/databricks-sdk-go/client" ) @@ -37,7 +35,7 @@ func (a *DeploymentMetadataAPI) CreateDeployment(ctx context.Context, request Cr query := map[string]any{"deployment_id": request.DeploymentID} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Deployment, &resp) if err != nil { - return nil, mapError("create deployment", err) + return nil, err } return &resp, nil } @@ -47,7 +45,7 @@ func (a *DeploymentMetadataAPI) GetDeployment(ctx context.Context, request GetDe path := fmt.Sprintf("%s/deployments/%s", basePath, request.DeploymentID) err := a.api.Do(ctx, http.MethodGet, path, nil, nil, nil, &resp) if err != nil { - return nil, mapError("get deployment", err) + return nil, err } return &resp, nil } @@ -57,7 +55,7 @@ func (a *DeploymentMetadataAPI) DeleteDeployment(ctx context.Context, request De path := fmt.Sprintf("%s/deployments/%s", basePath, request.DeploymentID) err := a.api.Do(ctx, http.MethodDelete, path, nil, nil, nil, &resp) if err != nil { - return nil, mapError("delete deployment", err) + return nil, err } return &resp, nil } @@ -68,7 +66,7 @@ func (a *DeploymentMetadataAPI) CreateVersion(ctx context.Context, request Creat query := map[string]any{"version_id": request.VersionID} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Version, &resp) if err != nil { - return nil, mapError("create version", err) + return nil, err } return &resp, nil } @@ -78,7 +76,7 @@ func (a *DeploymentMetadataAPI) GetVersion(ctx context.Context, request GetVersi path := fmt.Sprintf("%s/deployments/%s/versions/%s", basePath, request.DeploymentID, request.VersionID) err := a.api.Do(ctx, http.MethodGet, path, nil, nil, nil, &resp) if err != nil { - return nil, mapError("get version", err) + return nil, err } return &resp, nil } @@ -88,7 +86,7 @@ func (a *DeploymentMetadataAPI) Heartbeat(ctx context.Context, request Heartbeat path := fmt.Sprintf("%s/deployments/%s/versions/%s/heartbeat", basePath, request.DeploymentID, request.VersionID) err := a.api.Do(ctx, http.MethodPost, path, nil, nil, struct{}{}, &resp) if err != nil { - return nil, mapError("heartbeat", err) + return nil, err } return &resp, nil } @@ -98,7 +96,7 @@ func (a *DeploymentMetadataAPI) CompleteVersion(ctx context.Context, request Com path := fmt.Sprintf("%s/deployments/%s/versions/%s/complete", basePath, request.DeploymentID, request.VersionID) err := a.api.Do(ctx, http.MethodPost, path, nil, nil, request, &resp) if err != nil { - return nil, mapError("complete version", err) + return nil, err } return &resp, nil } @@ -109,7 +107,7 @@ func (a *DeploymentMetadataAPI) CreateOperation(ctx context.Context, request Cre query := map[string]any{"resource_key": request.ResourceKey} err := a.api.Do(ctx, http.MethodPost, path, nil, query, request.Operation, &resp) if err != nil { - return nil, mapError("create operation", err) + return nil, err } return &resp, nil } @@ -131,7 +129,7 @@ func (a *DeploymentMetadataAPI) ListResources(ctx context.Context, request ListR err := a.api.Do(ctx, http.MethodGet, path, nil, q, nil, &resp) if err != nil { - return nil, mapError("list resources", err) + return nil, err } allResources = append(allResources, resp.Resources...) @@ -143,23 +141,3 @@ func (a *DeploymentMetadataAPI) ListResources(ctx context.Context, request ListR return allResources, nil } - -// mapError translates API errors into user-friendly messages. -func mapError(operation string, err error) error { - var apiErr *apierr.APIError - if !errors.As(err, &apiErr) { - return fmt.Errorf("%s: %w", operation, err) - } - - switch apiErr.StatusCode { - case http.StatusConflict: - return fmt.Errorf("%s: deployment is locked by another active deployment. "+ - "If the prior deployment failed, the lock will expire automatically after 5 minutes: %w", operation, err) - case http.StatusNotFound: - return fmt.Errorf("%s: resource not found: %w", operation, err) - case http.StatusBadRequest: - return fmt.Errorf("%s: bad request: %w", operation, err) - default: - return fmt.Errorf("%s: %w", operation, err) - } -} From ad6a2b1e90258a624f231363585afbc3c806a278 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 10 Apr 2026 12:19:36 +0000 Subject: [PATCH 25/25] Return error for force lock with DMS, fix stale comment and error message Co-authored-by: Isaac --- bundle/bundle.go | 4 ++++ .../lock/deployment_metadata_service.go | 24 +++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/bundle/bundle.go b/bundle/bundle.go index 0e703d9303..fec79e7ea9 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -253,6 +253,10 @@ func (b *Bundle) WorkspaceClientE(ctx context.Context) (*databricks.WorkspaceCli return b.client, b.clientErr } +// WorkspaceClient returns the workspace client, panicking on error. +// It uses context.Background() because the client is always already initialized +// by WorkspaceClientE(ctx) during command setup (cmd/root/auth.go or cmd/root/bundle.go), +// so the sync.Once never fires here. func (b *Bundle) WorkspaceClient() *databricks.WorkspaceClient { client, err := b.WorkspaceClientE(context.Background()) if err != nil { diff --git a/bundle/deploy/lock/deployment_metadata_service.go b/bundle/deploy/lock/deployment_metadata_service.go index 9df6d59f99..6ac497ba01 100644 --- a/bundle/deploy/lock/deployment_metadata_service.go +++ b/bundle/deploy/lock/deployment_metadata_service.go @@ -99,9 +99,10 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat Name: fmt.Sprintf("deployments/%s/versions/%s", l.deploymentID, l.versionID), CompletionReason: reason, }) - if completeErr == nil { - log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) + if completeErr != nil { + return completeErr } + log.Infof(ctx, "Released deployment lock: deployment=%s version=%s reason=%s", l.deploymentID, l.versionID, reason) // For destroy operations, delete the deployment record after // successfully releasing the lock. @@ -110,11 +111,11 @@ func (l *metadataServiceLock) Release(ctx context.Context, status DeploymentStat DeploymentID: l.deploymentID, }) if deleteErr != nil { - log.Warnf(ctx, "Failed to delete deployment: %v", deleteErr) + return fmt.Errorf("failed to delete deployment: %w", deleteErr) } } - return completeErr + return nil } // acquireLock implements the lock acquisition protocol using the deployment @@ -376,6 +377,12 @@ func startHeartbeat(ctx context.Context, svc *tmpdms.DeploymentMetadataAPI, depl VersionID: versionID, }) if err != nil { + // A 409 ABORTED is expected if the version was completed + // between the ticker firing and the heartbeat request. + if isAborted(err) { + log.Debugf(ctx, "Heartbeat stopped: version already completed") + return + } log.Warnf(ctx, "Failed to send deployment heartbeat: %v", err) } else { log.Debugf(ctx, "Deployment heartbeat sent for deployment=%s version=%s", deploymentID, versionID) @@ -395,3 +402,12 @@ func isAlreadyExists(err error) bool { } return false } + +// isAborted checks if an error indicates the operation was aborted (HTTP 409 with ABORTED error code). +func isAborted(err error) bool { + var apiErr *apierr.APIError + if errors.As(err, &apiErr) && apiErr.StatusCode == http.StatusConflict && apiErr.ErrorCode == "ABORTED" { + return true + } + return false +}