diff --git a/.autover/autover.json b/.autover/autover.json index 8985c52bb..02f2ad0db 100644 --- a/.autover/autover.json +++ b/.autover/autover.json @@ -47,6 +47,11 @@ "Name": "Amazon.Lambda.Core", "Path": "Libraries/src/Amazon.Lambda.Core/Amazon.Lambda.Core.csproj" }, + { + "Name": "Amazon.Lambda.DurableExecution", + "Path": "Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj", + "PrereleaseLabel": "preview" + }, { "Name": "Amazon.Lambda.DynamoDBEvents", "Path": "Libraries/src/Amazon.Lambda.DynamoDBEvents/Amazon.Lambda.DynamoDBEvents.csproj" diff --git a/.autover/changes/91693d62-b0c7-49b0-a74f-531aa1509864.json b/.autover/changes/91693d62-b0c7-49b0-a74f-531aa1509864.json new file mode 100644 index 000000000..41fab0859 --- /dev/null +++ b/.autover/changes/91693d62-b0c7-49b0-a74f-531aa1509864.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Initial preview release of the Durable Execution SDK for .NET. Build long-running Lambda workflows with automatic checkpointing via `StepAsync`, `WaitAsync`, `RunInChildContextAsync`, `CreateCallbackAsync`, and `WaitForCallbackAsync` on `IDurableContext`." 
+ ] + } + ] +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1caae6fe4..f86678d7a 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,6 @@ global.json **/cdk.out/** **/.DS_Store + +# JetBrains Rider per-project cache +**/*.lscache diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31e288af2..63777c644 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,6 +85,7 @@ The available projects are: * Amazon.Lambda.ConfigEvents * Amazon.Lambda.ConnectEvents * Amazon.Lambda.Core +* Amazon.Lambda.DurableExecution * Amazon.Lambda.DynamoDBEvents * Amazon.Lambda.DynamoDBEvents.SDK.Convertor * Amazon.Lambda.KafkaEvents diff --git a/Docs/durable-execution-design.md b/Docs/durable-execution-design.md new file mode 100644 index 000000000..59ced6a15 --- /dev/null +++ b/Docs/durable-execution-design.md @@ -0,0 +1,2267 @@ +# .NET Lambda Durable Execution SDK Design + +## Table of Contents + +- [Overview](#overview) +- [Motivation](#motivation) +- [How Durable Execution Works](#how-durable-execution-works) +- [User Experience](#user-experience) + - [Quick Start](#quick-start) + - [Steps](#steps) + - [Wait Operations](#wait-operations) + - [Callbacks](#callbacks) + - [Invoke (Chained Functions)](#invoke-chained-functions) + - [Parallel Execution](#parallel-execution) + - [Map Operations](#map-operations) + - [Child Contexts](#child-contexts) + - [Error Handling & Retry](#error-handling--retry) + - [Logging](#logging) +- [Internals](#internals) +- [API Reference](#api-reference) + - [IDurableContext](#idurablecontext) + - [Configuration Types](#configuration-types) + - [Result Types](#result-types) + - [Exception Types](#exception-types) +- [Serialization](#serialization) +- [Integration with Existing Libraries](#integration-with-existing-libraries) +- [Testing](#testing) +- [Local development (Test Tool v2 and Aspire)](#local-development-test-tool-v2-and-aspire) +- [Requirements & Constraints](#requirements--constraints) +- [Package 
Structure](#package-structure) +- [Implementation plan](#implementation-plan) +- [Cross-SDK API comparison](#cross-sdk-api-comparison) +- [Common Patterns](#common-patterns) + +--- + +## Overview + +Lambda Durable Functions let you write multi-step workflows that persist state automatically. They can run for days or months, survive failures, and you only pay for actual compute time. + +This doc covers the **.NET Durable Execution SDK** (`Amazon.Lambda.DurableExecution`). SDKs already exist for [Python](https://github.com/aws/aws-durable-execution-sdk-python) and [JavaScript/TypeScript](https://github.com/aws/aws-durable-execution-sdk-js). + +Related: [GitHub Issue #2216](https://github.com/aws/aws-lambda-dotnet/issues/2216) + +--- + +## Motivation + +### The problem + +Today, building multi-step Lambda workflows in .NET requires one of: + +1. **Step Functions** -- a separate service with its own state machine language (ASL), adding latency between steps and forcing you to learn a second programming model. +2. **Manual state management** -- rolling your own checkpointing with DynamoDB or S3, plus retry logic, idempotency keys, and resumption code. +3. **Event-driven choreography** -- chaining functions through SQS/SNS/EventBridge, scattering a single workflow's logic across half a dozen Lambda functions. + +All three push infrastructure concerns into your business logic. The code gets harder to read and test, and nobody wants to inherit it. + +### What durable functions do instead + +With this SDK, you write sequential code and the runtime handles persistence: +- Checkpoints each step's result +- Suspends when waiting (no compute charges while idle) +- Resumes from the last checkpoint on the next invocation +- Retries failed steps with configurable backoff +- Waits for callbacks from external systems + +Your function reads like a normal async method. The SDK deals with state, replay, and recovery. 
+ +### Why build a .NET SDK + +.NET has a large Lambda user base, especially in enterprise shops running order processing, document pipelines, and (increasingly) AI agent workflows. Today those teams either use Step Functions or build custom state machines. A native .NET SDK removes that tradeoff. + +--- + +## How Durable Execution Works + +### The replay model + +Durable functions use a replay-based execution model. Every invocation runs your code from the top, but previously completed steps return their cached result instead of re-executing. + +1. Lambda invokes your function with a `DurableExecutionInvocationInput` containing: + - `DurableExecutionArn` -- unique execution identifier + - `CheckpointToken` -- for optimistic concurrency + - `InitialExecutionState` -- previously checkpointed operations + +2. Your function code runs **from the beginning** on every invocation. + +3. When a **step** is encountered: + - Previously completed → return cached result (no re-execution) + - New → execute it, checkpoint the result, continue + +4. When a **wait** is encountered: + - Already elapsed → continue + - Still pending → return `PENDING`, Lambda terminates, service re-invokes later + +5. 
The function returns one of: + - `SUCCEEDED` -- workflow completed + - `FAILED` -- workflow failed + - `PENDING` -- workflow suspended (waiting for time or callback) + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ First Invocation (t=0s) │ +│ │ +│ handler(event, context) │ +│ │ │ +│ ├─► context.StepAsync(FetchData) → executes, checkpoints │ +│ │ │ +│ ├─► context.WaitAsync(30 seconds) → returns PENDING │ +│ │ │ +│ └── (Lambda terminates, environment recyclable) │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ Second Invocation (t=30s) │ +│ │ +│ handler(event, context) │ +│ │ │ +│ ├─► context.StepAsync(FetchData) → returns cached result │ +│ │ │ +│ ├─► context.WaitAsync(30 seconds) → already elapsed, skip │ +│ │ │ +│ ├─► context.StepAsync(ProcessData) → executes, checkpoints │ +│ │ │ +│ └── return result → SUCCEEDED │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## User Experience + +### Quick Start + +#### Installation + +```shell +dotnet add package Amazon.Lambda.DurableExecution +``` + +#### Minimal Example + +```csharp +using Amazon.Lambda.Annotations; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace MyDurableFunction; + +public class Function +{ + [LambdaFunction] + [DurableExecution] + public async Task Handler(OrderEvent input, IDurableContext context) + { + // Step 1: Validate the order (checkpointed automatically) + var validation = await context.StepAsync( + async (step) => await ValidateOrder(input.OrderId), + name: "validate_order"); + + if (!validation.IsValid) + return new OrderResult { Status = "rejected" }; + + // Step 2: Wait for processing (Lambda is NOT running during this time) + await context.WaitAsync(TimeSpan.FromSeconds(30), name: 
"processing_delay"); + + // Step 3: Process the order + var result = await context.StepAsync( + async (step) => await ProcessOrder(input.OrderId), + name: "process_order"); + + return new OrderResult { Status = "approved", OrderId = result.OrderId }; + } + + private async Task ValidateOrder(string orderId) { /* ... */ } + private async Task ProcessOrder(string orderId) { /* ... */ } +} +``` + +Things to notice: +- `[LambdaFunction]` + `[DurableExecution]` triggers source generation, so you don't wire up the handler yourself +- Each step function receives an `IStepContext` with a step-scoped logger, attempt number, and operation ID +- Each `StepAsync` call checkpoints its result automatically +- `WaitAsync` suspends the function -- Lambda is not running (or billing you) during the wait +- On replay, completed steps return their cached result without re-executing +- The generated wrapper handles checkpoint batching and cleanup + +#### Manual Handler (Without Annotations) + +If you don't use `Amazon.Lambda.Annotations`, use `DurableFunction.WrapAsync` — a static helper (inspired by [OpenTelemetry's `AWSLambdaWrapper.TraceAsync`](https://github.com/open-telemetry/opentelemetry-dotnet-contrib/tree/main/src/OpenTelemetry.Instrumentation.AWSLambda#lambda-function)) that handles the entire durable execution envelope for you: + +```csharp +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace MyDurableFunction; + +public class Function +{ + public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + + private async Task MyWorkflow(OrderEvent input, IDurableContext context) + { + var validation = await context.StepAsync( + async (step) => await ValidateOrder(input.OrderId), + name: "validate_order"); + + if 
(!validation.IsValid) + return new OrderResult { Status = "rejected" }; + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "processing_delay"); + + var result = await context.StepAsync( + async (step) => await ProcessOrder(input.OrderId), + name: "process_order"); + + return new OrderResult { Status = "approved", OrderId = result.OrderId }; + } + + private async Task ValidateOrder(string orderId) { /* ... */ } + private async Task ProcessOrder(string orderId) { /* ... */ } +} +``` + +`DurableFunction.WrapAsync` handles all the plumbing: +- Hydrates `ExecutionState` from `invocationInput.InitialExecutionState` +- Extracts the user payload from the service envelope +- Runs the workflow through `DurableExecutionHandler.RunAsync` +- Constructs and returns the `DurableExecutionInvocationOutput` envelope (status mapping, JSON serialization) +- Sets execution environment tracking + +For workflows that return no value, use the single-type-parameter overload: + +```csharp +public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + +private async Task MyWorkflow(OrderEvent input, IDurableContext context) +{ + await context.StepAsync(async (step) => await SendNotification(input.UserId), name: "notify"); + await context.WaitAsync(TimeSpan.FromHours(1), name: "cooldown"); + await context.StepAsync(async (step) => await Cleanup(input.UserId), name: "cleanup"); +} +``` + +For **NativeAOT** deployments, register an AOT-aware `ILambdaSerializer` with the Lambda runtime. `WrapAsync` reads the registered serializer from `ILambdaContext.Serializer` and uses it for both envelope and step-checkpoint (de)serialization — there is no per-call `JsonSerializerContext` argument, and AOT and reflection callers share the same `WrapAsync` overloads. 
+ +In the class library programming model, register via the assembly attribute: + +```csharp +[assembly: LambdaSerializer(typeof(SourceGeneratorLambdaJsonSerializer))] + +// The user's context must include the wire-envelope types (the typed handler +// signature is DurableExecutionInvocationInput → DurableExecutionInvocationOutput, +// so Lambda's runtime needs to deserialize them with this serializer) plus every +// TInput/TOutput/step-result POCO the workflow uses. +[JsonSerializable(typeof(DurableExecutionInvocationInput))] +[JsonSerializable(typeof(DurableExecutionInvocationOutput))] +[JsonSerializable(typeof(OrderEvent))] +[JsonSerializable(typeof(OrderResult))] +public partial class MyJsonContext : JsonSerializerContext { } + +public class Function +{ + public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + + private async Task MyWorkflow(OrderEvent input, IDurableContext context) + { + // ... + } +} +``` + +In an executable / custom-runtime deployment, pass the serializer to `LambdaBootstrapBuilder.Create(handler, serializer)` instead of using the assembly attribute — `RuntimeSupport` will propagate it onto `ILambdaContext.Serializer` for the SDK to pick up. 
+ +To inject a custom `IAmazonLambda` client (e.g., for VPC endpoints or unit testing), use the overload that accepts one: + +```csharp +public class Function +{ + private readonly IAmazonLambda _lambdaClient; + + public Function(IAmazonLambda lambdaClient) => _lambdaClient = lambdaClient; + + public Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync( + MyWorkflow, invocationInput, context, _lambdaClient); +} +``` + +You'd also need to manually configure the CloudFormation template with `DurableConfig` and managed policies: + +```json +{ + "Resources": { + "MyFunction": { + "Type": "AWS::Serverless::Function", + "Properties": { + "Handler": "MyDurableFunction::MyDurableFunction.Function::FunctionHandler", + "Policies": [ + "AWSLambdaBasicExecutionRole", + "AWSLambdaBasicDurableExecutionRolePolicy" + ], + "DurableConfig": { + "Enabled": true + } + } + } + } +} +``` + +##### What WrapAsync does internally + +For reference, here's the expanded version of what `DurableFunction.WrapAsync` eliminates — this is effectively what the source generator produces for the Annotations path: + +```csharp +public async Task FunctionHandler( + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext) +{ + // 1. Hydrate execution state from previously checkpointed operations + var state = new ExecutionState(); + state.LoadFromCheckpoint(invocationInput.InitialExecutionState); + + // 2. Extract user payload from the service envelope (internal) + var userPayload = ExtractUserPayload(invocationInput); + + // 3. Run the user's workflow via DurableExecutionHandler.RunAsync + var result = await DurableExecutionHandler.RunAsync( + state, + async (durableContext) => await MyWorkflow(userPayload, durableContext), + invocationInput.DurableExecutionArn); + + // 4. 
Construct and return the service output envelope + return new DurableExecutionInvocationOutput + { + Status = result.Status, + Result = result.Status == InvocationStatus.Succeeded + ? JsonSerializer.Serialize(result.Result) + : null, + ErrorMessage = result.Message + }; +} +``` + +Key differences between `WrapAsync` and the Annotations approach: +- `WrapAsync` still requires you to define the Lambda entry point signature (`DurableExecutionInvocationInput` → `DurableExecutionInvocationOutput`) +- You configure `DurableConfig` + managed policies in your CloudFormation template manually (not generated) +- No `[LambdaFunction]` or `[DurableExecution]` attributes needed + +With `[LambdaFunction] + [DurableExecution]`, even the entry point and CloudFormation config are generated at compile time — you just write the workflow method. + +--- + +### Steps + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/step.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/step-handler/step-handler.ts) + +A step runs your code and checkpoints the result. On replay, the cached result comes back without re-executing. Each step function receives an `IStepContext` with a step-scoped logger and attempt metadata. 
+ +```csharp +// Basic step +var result = await context.StepAsync(async (step) => await CallExternalApi()); + +// Named step (recommended for debugging/testing) +var user = await context.StepAsync( + async (step) => await FetchUser(userId), + name: "fetch_user"); + +// Using the step-scoped logger (includes step name, attempt number, operation ID) +var order = await context.StepAsync( + async (step) => + { + step.Logger.LogInformation("Fetching order {OrderId}", orderId); + return await orderService.GetOrder(orderId); + }, + name: "get_order"); + +// Step with configuration +var payment = await context.StepAsync( + async (step) => await chargeCard(amount), + name: "charge_card", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3, initialDelay: TimeSpan.FromSeconds(1)) + }); +``` + +#### Step Semantics + +| Semantics | Behavior | Use Case | +|-----------|----------|----------| +| `AtLeastOncePerRetry` (default) | Step re-executes on each retry | Idempotent operations (calculations, reads) | +| `AtMostOncePerRetry` | Step executes at most once per retry | Side effects (payments, emails, writes) | + +--- + +### Wait Operations + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/wait.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/wait-handler/wait-handler.ts) + +Waits suspend the function without consuming compute time. Lambda can recycle the execution environment. 
+ +```csharp +// Wait for a specific duration +await context.WaitAsync(TimeSpan.FromSeconds(30)); +await context.WaitAsync(TimeSpan.FromMinutes(5), name: "cooldown"); +await context.WaitAsync(TimeSpan.FromHours(24), name: "daily_check"); +await context.WaitAsync(TimeSpan.FromDays(7), name: "weekly_reminder"); +``` + +> **Validation:** The duration must be at least 1 second. Values less than 1 second throw `ArgumentOutOfRangeException`. Sub-second precision is truncated to whole seconds (the underlying service operates at second granularity). + +--- + +### Callbacks + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/callback.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/callback-handler/callback.ts) + +Callbacks let your workflow pause until an external system responds (human approval, a webhook, a third-party API). 
+ +#### Create a Callback (Advanced) + +```csharp +// Create a callback and get the callback ID +var callback = await context.CreateCallbackAsync( + name: "approval_callback", + config: new CallbackConfig + { + Timeout = TimeSpan.FromHours(24), + HeartbeatTimeout = TimeSpan.FromHours(2) + }); + +// Send the callback ID to an external system +await context.StepAsync( + async () => await SendApprovalEmail(callback.CallbackId, recipientEmail), + name: "send_approval_email"); + +// Wait for the external system to respond +var result = await callback.GetResultAsync(); +``` + +#### Wait For Callback (Simple) + +```csharp +// Combined pattern: create callback, submit to external system, wait for result +var approval = await context.WaitForCallbackAsync( + async (callbackId, ctx) => + { + await SendApprovalEmail(callbackId, managerEmail); + }, + name: "wait_for_approval", + config: new WaitForCallbackConfig + { + Timeout = TimeSpan.FromHours(24), + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3) + }); + +if (approval.Approved) +{ + await context.StepAsync(async (step) => await ExecutePlan(), name: "execute"); +} +``` + +**Example `SendApprovalEmail` stub:** +```csharp +private async Task SendApprovalEmail(string callbackId, string recipientEmail) +{ + // Include the callbackId in the approval link so the external system + // can complete the callback via the AWS API + var approvalLink = $"https://my-app.example.com/approve?callbackId={callbackId}"; + await emailService.SendAsync(recipientEmail, "Approval Required", $"Please approve: {approvalLink}"); +} +``` + +**External system completes the callback via AWS API:** +```bash +aws lambda send-durable-execution-callback-success \ + --callback-id "cb-12345" \ + --result '{"approved": true, "approver": "jane@example.com"}' +``` + +--- + +### Invoke (Chained Functions) + +> **Implementations:** 
[Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/invoke.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/invoke-handler/invoke-handler.ts) + +Call another durable function. The invocation is checkpointed, so it survives failures and won't double-fire. + +```csharp +// Invoke another durable function +var paymentResult = await context.InvokeAsync( + functionName: "arn:aws:lambda:us-east-1:123456789012:function:payment-processor:prod", + payload: new PaymentRequest { Amount = 100, Currency = "USD" }, + name: "process_payment", + config: new InvokeConfig + { + TenantId = "tenant-42" + }); +``` + +> **Note:** Durable function invocations require **qualified identifiers** — include a version number, alias, or `$LATEST`: +> - ✅ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor:prod` (alias) +> - ✅ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor:42` (version) +> - ✅ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor:$LATEST` +> - ❌ `arn:aws:lambda:us-east-1:123456789012:function:payment-processor` (unqualified — not supported) + +--- + +### Parallel Execution + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/parallel.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/parallel-handler/parallel-handler.ts) + +Run independent operations concurrently. The JS SDK uses a `DurablePromise` pattern where operations are deferred until awaited; in .NET that isn't necessary because `ParallelAsync` and `MapAsync` cover the same use case idiomatically. `Task`-returning methods start immediately and `await` retrieves the result, so there's no gap to fill with a lazy wrapper. 
+ +> **Prefer `ParallelAsync` over `Task.WhenAll`:** While `Task.WhenAll` works correctly with durable operations (operation IDs are allocated deterministically), it bypasses completion policies, concurrency limits, branch naming, and `IBatchResult` structured output. Always use `ParallelAsync` or `MapAsync` for concurrent durable operations. A future Roslyn analyzer (DE004) will flag `Task.WhenAll` usage with durable tasks and suggest `ParallelAsync` as a replacement. + +```csharp +// Run multiple operations in parallel +var results = await context.ParallelAsync( + new Func<IDurableContext, Task<object>>[] + { + async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId), name: "fetch_user"), + async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId), name: "fetch_orders"), + async (ctx) => await ctx.StepAsync(async (step) => await FetchPreferences(userId), name: "fetch_prefs"), + }, + name: "parallel_fetch", + config: new ParallelConfig + { + MaxConcurrency = 3, + CompletionConfig = CompletionConfig.AllSuccessful() + }); + +// Access individual results +var userData = results.GetResults()[0]; +var orderHistory = results.GetResults()[1]; +var preferences = results.GetResults()[2]; +``` + +#### Named Parallel Branches + +For better observability, you can name individual branches (matching the JS SDK pattern): + +```csharp +// Named branches for easier debugging and testing +var results = await context.ParallelAsync( + new NamedBranch[] + { + new("fetch_user", async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId))), + new("fetch_orders", async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId))), + new("fetch_prefs", async (ctx) => await ctx.StepAsync(async (step) => await FetchPreferences(userId))), + }, + name: "parallel_fetch"); + +// In tests, you can find specific branches by name +var fetchUserBranch = result.GetOperation("fetch_user"); +``` + +#### Completion Configurations + +`ParallelAsync` 
and `MapAsync` accept a `CompletionConfig` to control when the overall operation is considered complete: + +```csharp +// All must succeed (default) +CompletionConfig.AllSuccessful() + +// Complete when any one succeeds +CompletionConfig.FirstSuccessful() + +// Complete when all finish (regardless of success/failure) +CompletionConfig.AllCompleted() + +// Custom: succeed if at least 3 succeed, tolerate up to 2 failures +new CompletionConfig +{ + MinSuccessful = 3, + ToleratedFailureCount = 2 +} +``` + +--- + +### Map Operations + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/map.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/map-handler/map-handler.ts) + +Process a collection in parallel with configurable concurrency. The `items` parameter accepts any `IReadOnlyList<T>` (arrays, lists, etc.). + +```csharp +var orders = new[] { "order-1", "order-2", "order-3", "order-4", "order-5" }; + +var results = await context.MapAsync( + items: orders, // IReadOnlyList<string> + func: async (ctx, orderId, index, allItems) => + { + return await ctx.StepAsync( + async () => await ProcessOrder(orderId), + name: $"process_order_{index}"); + }, + name: "process_all_orders", + config: new MapConfig + { + MaxConcurrency = 3, + CompletionConfig = CompletionConfig.AllSuccessful(), + ItemNamer = (orderId, index) => $"Order-{orderId}" // Readable names for observability + }); + +// Check results +results.ThrowIfError(); // Throws if any item failed +var processedOrders = results.GetResults(); +``` + +--- + +### Child Contexts + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/operation/child.py) | 
[JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/handlers/run-in-child-context-handler/run-in-child-context-handler.ts) + +Child contexts group related durable operations into a sub-workflow. Use them when you need waits or multiple steps inside a logical unit (you cannot nest durable calls inside a step directly). + +```csharp +// Group operations into a child context +var enrichedData = await context.RunInChildContextAsync( + async (childCtx) => + { + var validated = await childCtx.StepAsync( + async () => await Validate(data), + name: "validate"); + + await childCtx.WaitAsync(TimeSpan.FromSeconds(1), name: "rate_limit"); + + var enriched = await childCtx.StepAsync( + async () => await Enrich(validated), + name: "enrich"); + + return enriched; + }, + name: "validation_phase"); + +// Use the enriched data in a subsequent step +var finalResult = await context.StepAsync( + async () => await SubmitEnrichedData(enrichedData), + name: "submit"); +``` + +> **Why child contexts?** You cannot nest durable operations inside a step. Steps are leaf operations. If you need multiple durable operations grouped together, use a child context. 
+ +--- + +### Error Handling & Retry + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/retries.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/utils/retry/retry-config/index.ts) + +#### Retry Strategies + +```csharp +// Exponential backoff with jitter +var result = await context.StepAsync( + async () => await CallUnreliableApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 5, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(60), + backoffRate: 2.0, + jitter: JitterStrategy.Full) + }); + +// Using presets +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Default // 6 attempts, 2x backoff, 5s initial, Full jitter + }); + +// Available presets: +// RetryStrategy.None — maxAttempts: 1 (no retry) +// RetryStrategy.Default — 6 attempts, 2x backoff, 5s initial delay, Full jitter +// RetryStrategy.Transient — 3 attempts, 2x backoff, 1s initial delay, Full jitter + +// Custom retry strategy +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = new CustomRetryStrategy((exception, attemptCount) => + { + // Only retry transient errors + if (exception is HttpRequestException httpEx && (int?)httpEx.StatusCode >= 500) + return RetryDecision.RetryAfter(TimeSpan.FromSeconds(Math.Pow(2, attemptCount))); + + return RetryDecision.DoNotRetry(); + }) + }); + +// Retry with specific exception types +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + retryableExceptions: new[] { typeof(TimeoutException), typeof(HttpRequestException) }) 
+ }); + +// Retry with message pattern matching (regex) +var result = await context.StepAsync( + async () => await CallApi(), + name: "api_call", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + retryableExceptions: new[] { typeof(HttpRequestException) }, + retryableMessagePatterns: new[] { "timeout", "throttl", "5\\d{2}" }) + }); +``` + +#### Jitter Strategies + +Jitter prevents thundering-herd scenarios where multiple retrying clients converge on the same backoff schedule. The SDK supports three jitter strategies: + +```csharp +public enum JitterStrategy +{ + /// No randomization — delay is exactly the calculated backoff value. + None, + + /// Random delay between 0 and the calculated backoff value (recommended). + Full, + + /// Random delay between 50% and 100% of the calculated backoff value. + Half +} +``` + +The default jitter for `RetryStrategy.Exponential()` is `JitterStrategy.Full`. All built-in presets (`RetryStrategy.Default`, `RetryStrategy.Transient`) also use `JitterStrategy.Full`. Use `JitterStrategy.None` only when you need deterministic retry timing (e.g., for testing). + +#### Retry Strategy Interface + +```csharp +public interface IRetryStrategy +{ + RetryDecision ShouldRetry(Exception exception, int attemptNumber); +} + +public record RetryDecision +{ + public bool ShouldRetry { get; init; } + public TimeSpan Delay { get; init; } + + public static RetryDecision DoNotRetry() => new() { ShouldRetry = false }; + public static RetryDecision RetryAfter(TimeSpan delay) => new() { ShouldRetry = true, Delay = delay }; +} +``` + +`IRetryStrategy` supports implicit conversion from `Func<Exception, int, RetryDecision>`, enabling inline lambdas: + +```csharp +config: new StepConfig +{ + RetryStrategy = (ex, attempt) => + attempt < 3 && ex is HttpRequestException + ? 
RetryDecision.RetryAfter(TimeSpan.FromSeconds(Math.Pow(2, attempt))) + : RetryDecision.DoNotRetry() +} +``` + +#### Saga Pattern (Compensating Transactions) + +```csharp +[DurableExecution] +public async Task Handler(BookingRequest input, IDurableContext context) +{ + var compensations = new List<(string Name, Func Action)>(); + + try + { + var flight = await context.StepAsync( + async () => await BookFlight(input), + name: "book_flight"); + compensations.Add(("cancel_flight", async () => await CancelFlight(flight.Id))); + + var hotel = await context.StepAsync( + async () => await BookHotel(input), + name: "book_hotel"); + compensations.Add(("cancel_hotel", async () => await CancelHotel(hotel.Id))); + + var car = await context.StepAsync( + async () => await BookCar(input), + name: "book_car"); + compensations.Add(("cancel_car", async () => await CancelCar(car.Id))); + + return new BookingResult { Status = "confirmed" }; + } + catch (Exception ex) + { + // Execute compensations in reverse order + foreach (var (name, action) in compensations.AsEnumerable().Reverse()) + { + await context.StepAsync(action, name: name); + } + return new BookingResult { Status = "cancelled", Error = ex.Message }; + } +} +``` + +--- + +### Logging + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/logger.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/utils/logger/logger.ts) + +`context.Logger` is replay-aware: it suppresses duplicate messages that would otherwise repeat on every invocation. Use it instead of `Console.WriteLine`. + +> **Implementation note:** The replay-aware logger is implemented entirely in the durable execution SDK. During replay, the SDK tracks which operations are being restored from checkpoint state vs. executing for the first time, and suppresses log output for replayed operations. 
No changes to `Amazon.Lambda.RuntimeSupport` or the Lambda Runtime API are required. + +```csharp +[DurableExecution] +public async Task Handler(MyEvent input, IDurableContext context) +{ + // ✅ Replay-safe: only logs once even during replay + context.Logger.LogInformation("Starting workflow for {OrderId}", input.OrderId); + + var result = await context.StepAsync( + async () => await ProcessData(input.Data), + name: "process_data"); + + // ✅ Replay-safe + context.Logger.LogInformation("Processing complete: {Result}", result); + + // ❌ NOT replay-safe: will log on every replay + Console.WriteLine("This will repeat!"); + + return result; +} +``` + +The logger integrates with `Microsoft.Extensions.Logging`: + +```csharp +// context.Logger implements ILogger +context.Logger.LogDebug("Debug info"); +context.Logger.LogInformation("Info message"); +context.Logger.LogWarning("Warning: {Detail}", detail); +context.Logger.LogError(exception, "Error occurred"); +``` + +#### Custom Logger Configuration + +You can swap the logger or disable replay-aware filtering (e.g., to see logs during replay for debugging): + +```csharp +// Use a custom logger (e.g., Serilog, AWS Lambda Powertools) +context.ConfigureLogger(new LoggerConfig +{ + CustomLogger = myCustomLogger, + ModeAware = true // true = suppress during replay (default), false = always log +}); + +// Disable replay-aware filtering to see ALL logs (useful for debugging) +context.ConfigureLogger(new LoggerConfig { ModeAware = false }); +``` + +--- + +## Internals + +### AWS APIs used + +| API | Purpose | +|-----|---------| +| `CheckpointDurableExecution` | Persist operation state (step results, waits, etc.) 
| +| `GetDurableExecutionState` | Retrieve previously checkpointed state on replay | +| `SendDurableExecutionCallbackSuccess` | External systems signal callback completion | +| `SendDurableExecutionCallbackFailure` | External systems signal callback failure | +| `SendDurableExecutionCallbackHeartbeat` | External systems send heartbeat signals | + +### How suspension works internally + +This follows the same pattern as the JavaScript SDK's `Promise.race`. The .NET equivalent is `Task.WhenAny`. + +When `RunAsync` starts, it kicks off two tasks in parallel: user code and a termination signal (a `TaskCompletionSource` that starts unresolved). Whoever finishes first wins: + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ DurableExecutionHandler.RunAsync │ +│ │ +│ var userTask = userHandler(context); │ +│ var terminationTask = terminationManager.TerminationTask; │ +│ │ +│ var winner = await Task.WhenAny(userTask, terminationTask); │ +│ │ +│ ┌─── userTask ───────────────────┐ ┌─── terminationTask ────────┐ │ +│ │ StepAsync("fetch") → execute │ │ (unresolved TCS - waiting) │ │ +│ │ WaitAsync("delay") → ... │ │ │ │ +│ │ calls Terminate() ──────────────► SetResult() → resolves! │ │ +│ │ awaits forever (blocked) │ │ │ │ +│ └────────────────────────────────┘ └────────────────────────────┘ │ +│ │ +│ winner == terminationTask → return PENDING │ +│ (userTask is abandoned, GC collects it) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +The `TerminationManager` is a thin wrapper around `TaskCompletionSource`: +- `TerminationTask` -- a Task that hangs forever until `Terminate()` is called +- `Terminate(reason)` -- resolves the TCS, causing the race to pick termination + +When user code hits a pending wait or callback: +1. It checkpoints the operation state +2. Calls `terminationManager.Terminate(WaitScheduled)` +3. Awaits a new never-completing `TaskCompletionSource` (blocks itself permanently) +4. 
`Task.WhenAny` sees the termination task resolved and picks it as the winner +5. `RunAsync` returns PENDING; the abandoned user task is left to be GC'd; Lambda terminates + +### Lifecycle and cleanup + +`RunAsync` manages the full lifecycle internally. When the handler completes (SUCCEEDED/FAILED) or suspends (PENDING), `RunAsync` stops the background checkpoint batcher, flushes any pending checkpoint operations, and disposes internal state. Users never call `Dispose` or wrap anything in `await using`. + +--- + +## API Reference + +### DurableFunction + +Static helper for the non-Annotations handler path. Wraps a workflow function, handling all envelope translation between `DurableExecutionInvocationInput`/`DurableExecutionInvocationOutput` and user types. + +```csharp +/// +/// Static helper that wraps a durable workflow function, handling all envelope +/// translation between DurableExecutionInvocationInput/Output and user types. +/// +/// All four overloads dispatch through the ILambdaSerializer registered on +/// ILambdaContext.Serializer, so AOT-safe and reflection-based callers share a +/// single code path. Callers wire AOT support by registering an AOT-aware +/// serializer with the runtime (e.g., SourceGeneratorLambdaJsonSerializer<TContext>) +/// — there is no per-call JsonSerializerContext argument. +/// +public static class DurableFunction +{ + /// + /// Wrap a workflow (typed input + output). + /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext); + + /// + /// Wrap a workflow (typed input + output) with explicit Lambda client. + /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient); + + /// + /// Wrap a void workflow (typed input, no output). 
+ /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext); + + /// + /// Wrap a void workflow with explicit Lambda client. + /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient); +} +``` + +`WrapAsync` requires an `ILambdaSerializer` on `ILambdaContext.Serializer`. If none is registered the helper throws `InvalidOperationException` with a message that points at the three places to register one (assembly attribute, `LambdaBootstrapBuilder.Create`, or `TestLambdaContext.Serializer` for tests). + +### IDurableContext + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/context.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/types/durable-context.ts) + +The primary interface developers interact with: + +```csharp +public interface IDurableContext +{ + /// + /// Replay-safe logger. Messages are de-duplicated during replay. + /// + ILogger Logger { get; } + + /// + /// Metadata about the current durable execution. + /// + IExecutionContext ExecutionContext { get; } + + /// + /// The underlying Lambda context. + /// + ILambdaContext LambdaContext { get; } + + // ── StepAsync overloads ──────────────────────────────────────────── + // The user's function always receives IStepContext, matching the + // Python and JS SDKs (Java has no-context overloads but deprecated + // them — see https://github.com/aws/aws-durable-execution-sdk-java). + // Step results are serialized via the ILambdaSerializer registered on + // ILambdaContext.Serializer. AOT and reflection callers share one + // overload — the AOT story is determined by the registered serializer. + + /// + /// Execute a step with automatic checkpointing. 
The IStepContext provides + /// a step-scoped logger with operation metadata (step name, attempt number, + /// operation ID) and the current attempt number. + /// + Task<T> StepAsync<T>( + Func<IStepContext, Task<T>> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute a step that returns no value. + /// + Task StepAsync( + Func<IStepContext, Task> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Suspend execution for the specified duration. + /// Throws ArgumentOutOfRangeException if duration is less than 1 second. + /// + Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default); + + /// + /// Create a callback for an external system to complete. Returns an + /// ICallback<T> handle exposing the service-allocated + /// CallbackId (pass to the external system) and + /// GetResultAsync + /// (await to suspend until a result arrives). + /// + /// + /// The callback result is deserialized using the ILambdaSerializer + /// registered on ILambdaContext.Serializer. AOT and reflection-based + /// scenarios share this single overload — the AOT story is determined by the + /// registered serializer (e.g., + /// SourceGeneratorLambdaJsonSerializer<TContext>). + /// + /// Errors are deferred to ICallback<T>.GetResultAsync; + /// CreateCallbackAsync always returns successfully so user code + /// between CreateCallbackAsync and the result-await runs deterministically + /// across replays. + /// + /// + Task<ICallback<T>> CreateCallbackAsync<T>( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Composite operation that creates a callback, runs the supplied submitter + /// (which hands the callbackId to an external system), and suspends + /// until the external system delivers a result. Equivalent to manually + /// composing + /// CreateCallbackAsync + + /// StepAsync + + /// ICallback<T>.GetResultAsync inside a child context. + /// + /// + /// Submitter failures (after retries are exhausted) surface as + /// CallbackSubmitterException. 
Callback failures and timeouts + /// surface as CallbackFailedException / + /// CallbackTimeoutException. + /// + Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Invoke another durable function. + /// + Task InvokeAsync( + string functionName, + TPayload payload, + string? name = null, + InvokeConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple operations in parallel (unnamed branches). + /// + Task> ParallelAsync( + IReadOnlyList>> functions, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple named operations in parallel. Named branches appear in + /// execution traces and can be inspected by name in tests. + /// + Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Process a collection of items in parallel. + /// + Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Run operations in an isolated child context. + /// + Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Poll until a condition is met. + /// + Task WaitForConditionAsync( + Func> check, + WaitForConditionConfig config, + string? name = null, + CancellationToken cancellationToken = default); +} +``` + +#### Supporting Types + +```csharp +/// +/// Context passed to step functions. Provides step-scoped logging and metadata. +/// +public interface IStepContext +{ + /// + /// Logger scoped to this step. Includes step name, operation ID, and attempt + /// number in structured log metadata automatically. 
+ /// + ILogger Logger { get; } + + /// + /// The current retry attempt number (1-based). + /// + int AttemptNumber { get; } + + /// + /// The deterministic operation ID for this step. + /// + string OperationId { get; } +} + +/// +/// Context passed to the submitter delegate of WaitForCallbackAsync. +/// Distinct from IStepContext so the submitter API can evolve +/// independently. Mirrors WaitForCallbackContext in the Python and +/// JavaScript SDKs (logger-only surface). +/// +public interface IWaitForCallbackContext +{ + /// + /// Logger scoped to the submitter step (replay-safe). + /// + ILogger Logger { get; } +} + +/// +/// A named branch for parallel execution. Named branches appear in execution +/// traces and can be inspected by name in the test runner. +/// +public record DurableBranch(string Name, Func> Func); +``` + +#### CancellationToken behavior + +All methods accept a per-call `CancellationToken` that follows standard .NET semantics: cancellation throws `OperationCanceledException` and the execution fails. Cancellation does **not** trigger suspension — those are separate concepts. + +The durable execution service handles timeout scenarios automatically: if Lambda terminates mid-execution, the next invocation simply replays from the last checkpoint. For advanced users who want to suspend gracefully before timeout, check `context.LambdaContext.RemainingTime` and return early. + +### Configuration Types + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/config.py) | JavaScript: [step](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/types/step.ts) | [batch](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/types/batch.ts) + +```csharp +/// +/// Configuration for step execution. +/// +public class StepConfig +{ + /// + /// Retry strategy for failed steps. 
Default is no retry. + /// Accepts IRetryStrategy implementations (RetryStrategy.Exponential, etc.) + /// or an inline function via implicit conversion from + /// Func<Exception, int, RetryDecision>. + /// + public IRetryStrategy? RetryStrategy { get; set; } + + /// + /// Execution semantics. Default is AtLeastOncePerRetry. + /// + public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry; + + // Note: there is no Serializer property here. Step result serialization + // is delegated to the ILambdaSerializer registered on + // ILambdaContext.Serializer (assembly attribute or + // LambdaBootstrapBuilder.Create). +} + +public enum StepSemantics +{ + /// + /// Step re-executes on each retry attempt. Safe for idempotent operations. + /// + AtLeastOncePerRetry, + + /// + /// Step executes at most once per retry attempt. Use for side effects. + /// + AtMostOncePerRetry +} + +/// +/// Configuration for callback operations. +/// +public class CallbackConfig +{ + /// + /// Maximum time to wait for callback response. Default (TimeSpan.Zero) means no timeout. + /// + public TimeSpan Timeout { get; set; } = TimeSpan.Zero; + + /// + /// Maximum time between heartbeat signals before timeout. Default (TimeSpan.Zero) means no heartbeat timeout. + /// + public TimeSpan HeartbeatTimeout { get; set; } = TimeSpan.Zero; + + // Note: there is no Serializer property here. Callback result + // serialization flows through the ILambdaSerializer registered on + // ILambdaContext.Serializer, the same as StepAsync. +} + +/// +/// Configuration for wait-for-callback operations. +/// +public class WaitForCallbackConfig : CallbackConfig +{ + /// + /// Retry strategy for the submitter function. + /// + public IRetryStrategy? RetryStrategy { get; set; } +} + +/// +/// Configuration for invoke operations. +/// +public class InvokeConfig +{ + /// + /// Optional tenant identifier propagated to the chained invocation. 
+ /// Matches the tenantId field on Python/JS/Java InvokeConfig. + /// + public string? TenantId { get; set; } + + // Note: there are no payload/result serializer properties here. Both + // flow through the ILambdaSerializer registered on + // ILambdaContext.Serializer, the same as StepAsync. +} + +/// +/// Controls how branches are represented in the checkpoint graph. +/// +public enum NestingType +{ + /// + /// Each branch creates a full isolated CONTEXT operation. Higher observability + /// in execution traces but more checkpoint operations (default). + /// + Nested, + + /// + /// Branches use virtual contexts sharing the parent. Reduces checkpoint cost + /// by ~30% at the expense of less granular execution traces. + /// + Flat +} + +/// +/// Configuration for parallel execution. +/// +public class ParallelConfig +{ + /// + /// Maximum concurrent branches. Null = unlimited. + /// + public int? MaxConcurrency { get; set; } + + /// + /// When to consider the operation complete. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + + /// + /// How branches are represented in the checkpoint graph. + /// Nested = full isolated context per branch (default). + /// Flat = virtual contexts sharing parent (~30% fewer checkpoint operations). + /// + public NestingType NestingType { get; set; } = NestingType.Nested; +} + +/// +/// Configuration for map operations. +/// +public class MapConfig +{ + /// + /// Maximum concurrent items. Null = unlimited. + /// + public int? MaxConcurrency { get; set; } + + /// + /// When to consider the operation complete. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + + /// + /// How item branches are represented in the checkpoint graph. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; + + /// + /// Optional batching configuration for grouping items before processing. 
+ /// When set, items are grouped into batches and each batch is processed as a unit. + /// Reduces checkpoint overhead for large collections. + /// + public ItemBatcher? Batcher { get; set; } + + /// + /// Optional function to generate a custom name for each item's branch. + /// Improves observability in execution traces. Receives the item and its index. + /// If null, branches are named by index (e.g., "0", "1", "2"). + /// + public Func? ItemNamer { get; set; } +} + +/// +/// Groups items into batches for map operations to reduce checkpoint overhead. +/// At least one of MaxItemsPerBatch or MaxBytesPerBatch must be set. +/// +public class ItemBatcher +{ + /// + /// Maximum number of items per batch. Null = no count limit. + /// + public int? MaxItemsPerBatch { get; set; } + + /// + /// Maximum serialized size (bytes) per batch. Null = no size limit. + /// + public int? MaxBytesPerBatch { get; set; } +} + +/// +/// Defines completion criteria for parallel/map operations. +/// +public class CompletionConfig +{ + public int? MinSuccessful { get; set; } + public int? ToleratedFailureCount { get; set; } + public double? ToleratedFailurePercentage { get; set; } + + public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; + public static CompletionConfig FirstSuccessful() => new() { MinSuccessful = 1 }; + public static CompletionConfig AllCompleted() => new(); +} + +/// +/// Configuration for child context operations. +/// +public class ChildContextConfig +{ + // Note: there is no Serializer property here. The child context's + // return value is serialized via the ILambdaSerializer registered on + // ILambdaContext.Serializer, the same as StepAsync. + + /// + /// Operation sub-type label for observability (e.g., in test runner output). + /// + public string? SubType { get; set; } + + /// + /// Optional function to transform exceptions from the child context before + /// surfacing them to the parent. 
Useful for wrapping low-level errors into + /// domain-specific exceptions. + /// + public Func? ErrorMapping { get; set; } +} + +/// +/// Configuration for wait-for-condition (polling). +/// +public class WaitForConditionConfig<TState> +{ + /// + /// Initial state passed to the first check invocation. + /// + public required TState InitialState { get; set; } + + /// + /// Strategy controlling how long to wait between checks. + /// + public required IWaitStrategy WaitStrategy { get; set; } +} +``` + +### Result Types + +```csharp +/// +/// Result of a parallel or map operation. +/// +public interface IBatchResult<T> +{ + /// + /// All items, in original index order. + /// + IReadOnlyList<IBatchItem<T>> All { get; } + + /// + /// Items whose Status is Succeeded. + /// + IReadOnlyList<IBatchItem<T>> Succeeded { get; } + + /// + /// Items whose Status is Failed. + /// + IReadOnlyList<IBatchItem<T>> Failed { get; } + + /// + /// Items still in flight when the batch resolved (CompletionConfig short-circuit). + /// + IReadOnlyList<IBatchItem<T>> Started { get; } + + /// + /// Get all successful results in original index order. Throws if any failed. + /// + IReadOnlyList<T> GetResults(); + + /// + /// Get all errors from failed items. + /// + IReadOnlyList<DurableExecutionException> GetErrors(); + + /// + /// Throw a single aggregated exception if any item failed. + /// + void ThrowIfError(); + + /// + /// True if any item is in the Failed state. + /// + bool HasFailure { get; } + + /// + /// Why the batch resolved. + /// + CompletionReason CompletionReason { get; } + + int SuccessCount { get; } + int FailureCount { get; } + int StartedCount { get; } + int TotalCount { get; } +} + +public interface IBatchItem<T> +{ + int Index { get; } + BatchItemStatus Status { get; } + T? Result { get; } + DurableExecutionException? Error { get; } +} + +/// +/// Status of an individual item in a batch result. +/// Mirrors the wire-state observed at the time the batch resolved — items still +/// running when a CompletionConfig short-circuits remain in Started. 
+/// +public enum BatchItemStatus +{ + /// + /// The branch ran to completion and produced a result. + /// + Succeeded, + + /// + /// The branch ran to completion and threw. + /// + Failed, + + /// + /// The branch was still in flight when the batch's CompletionConfig + /// resolved (e.g., FirstSuccessful returned before this branch finished). + /// + Started +} +public enum CompletionReason { AllCompleted, MinSuccessfulReached, FailureToleranceExceeded } + +/// +/// Represents a pending callback. +/// +public interface ICallback +{ + /// + /// The callback ID to send to external systems. + /// + string CallbackId { get; } + + /// + /// Wait for and return the callback result. + /// Suspends execution until the result is available. + /// + /// External system reported failure. + /// Service marked the callback TIMED_OUT. + Task GetResultAsync(CancellationToken cancellationToken = default); +} + +/// +/// Metadata about the current execution. +/// +public interface IExecutionContext +{ + /// + /// The ARN of the current durable execution. + /// + string DurableExecutionArn { get; } +} +``` + +### Exception Types + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/exceptions.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/errors/durable-error/durable-error.ts) + +```csharp +/// +/// Base exception for all durable execution errors. +/// +public class DurableExecutionException : Exception { } + +/// +/// Thrown when user code inside a step fails (after retries exhausted). +/// Contains the original error details from the checkpoint. +/// +public class StepException : DurableExecutionException +{ + public string? ErrorType { get; } + public string? ErrorData { get; } + public IReadOnlyList? StackTrace { get; } +} + +/// +/// Base exception for callback failures. 
Concrete subclasses distinguish +/// failure modes — pattern-match the subclass type rather than inspecting +/// a flag. +/// +public class CallbackException : DurableExecutionException +{ + public string? CallbackId { get; init; } + public string? ErrorType { get; init; } + public string? ErrorData { get; init; } + public IReadOnlyList? OriginalStackTrace { get; init; } +} + +/// External system reported a failure result for the callback. +public class CallbackFailedException : CallbackException { } + +/// Service marked the callback TIMED_OUT (overall or heartbeat). +public class CallbackTimeoutException : CallbackException { } + +/// +/// Submitter step (the inner step inside WaitForCallbackAsync) failed +/// after retries are exhausted. Wraps the underlying StepException. +/// Only thrown from WaitForCallbackAsync. +/// +public class CallbackSubmitterException : CallbackException { } + +/// +/// Base exception for chained-invoke failures. Catch InvokeException +/// to handle every non-success terminal state uniformly, or pattern-match the +/// concrete subclasses (InvokeFailedException, InvokeTimedOutException, +/// InvokeStoppedException) to react differently to specific outcomes. +/// Mirrors the Java SDK's invoke exception tree. +/// +public class InvokeException : DurableExecutionException +{ + public string? FunctionName { get; init; } + public string? ErrorType { get; init; } + public string? ErrorData { get; init; } + public IReadOnlyList? OriginalStackTrace { get; init; } +} + +/// The chained function ran and threw. +public class InvokeFailedException : InvokeException { } + +/// The chained invocation reached the service-side TIMED_OUT terminal state. +public class InvokeTimedOutException : InvokeException { } + +/// The chained execution was stopped by the service before reaching a normal terminal state. +public class InvokeStoppedException : InvokeException { } + +/// +/// Thrown when a child context operation fails. 
+/// +public class ChildContextException : DurableExecutionException +{ + public string? SubType { get; } +} + +/// +/// Thrown when a wait-for-condition operation exhausts all attempts +/// without the condition being met. +/// +public class WaitForConditionException : DurableExecutionException +{ + public int AttemptsExhausted { get; } +} + +/// +/// Thrown when the operation sequence during replay does not match +/// the previously checkpointed history. Indicates non-deterministic code. +/// +public class NonDeterministicException : DurableExecutionException +{ + public string? ExpectedOperationId { get; } + public string? ActualOperationId { get; } +} + +/// +/// Thrown when a step is interrupted mid-execution (e.g., Lambda timeout or +/// runtime termination). The step did not complete and its result was not +/// checkpointed. On the next invocation, the step will re-execute from scratch. +/// +public class StepInterruptedException : DurableExecutionException +{ + public string? StepName { get; } + public int AttemptNumber { get; } +} + +/// +/// Thrown when checkpoint serialization or deserialization fails. +/// +public class SerializationException : DurableExecutionException { } + +/// +/// Thrown when input validation fails. +/// +public class DurableValidationException : DurableExecutionException { } + +/// +/// Thrown when the checkpoint API call fails. +/// +public class CheckpointException : DurableExecutionException +{ + public bool IsRetriable { get; } +} +``` + +--- + +## Serialization + +> **Implementations:** [Python](https://github.com/aws/aws-durable-execution-sdk-python/blob/main/src/aws_durable_execution_sdk_python/serdes.py) | [JavaScript](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js/src/utils/serdes/serdes.ts) + +### Default behavior + +Step results are serialized to JSON (via `System.Text.Json`) before checkpointing. Your return types need to be JSON-serializable. 
+ +```csharp +// ✅ GOOD: JSON-serializable types +public record OrderResult(string OrderId, decimal Total, bool IsCompleted); + +// ❌ BAD: Non-serializable types +public class BadResult +{ + public Stream DataStream { get; set; } // Not serializable + public HttpClient Client { get; set; } // Not serializable +} +``` + +### Custom Serialization + +There is no per-call serializer override on any durable-execution API. Every checkpoint — step results, callback results, invoke payloads/results, child-context results — is serialized via the `ILambdaSerializer` registered on `ILambdaContext.Serializer`. To customize, register a different `ILambdaSerializer` for the function: + +```csharp +// Class library mode — register via the assembly attribute. +[assembly: LambdaSerializer(typeof(MyCustomSerializer))] + +// Executable / custom runtime — pass to LambdaBootstrapBuilder.Create. +using var bootstrap = LambdaBootstrapBuilder.Create(handler, new MyCustomSerializer()).Build(); +``` + +The customization applies uniformly to the whole function — there is no way today to swap the format for a single step or a single result type. See [NativeAOT compatibility](#nativeaot-compatibility) for how the registration flows in JIT vs. AOT. + +### Class library vs. executable output + +All samples in this doc use the class library pattern (no `Main` method). This is the default for Lambda functions. 
To turn a durable function project into an executable (required for NativeAOT or custom runtimes): + +**With Annotations** — add the global attribute to auto-generate a `Main` method: +```csharp +[assembly: LambdaGlobalProperties(GenerateMain = true)] +``` + +**Without Annotations** — provide your own `Main` method: +```csharp +public static async Task Main(string[] args) +{ + using var bootstrap = new LambdaBootstrap( + new Function().FunctionHandler, + new DefaultLambdaJsonSerializer()); + await bootstrap.RunAsync(); +} +``` + +Both approaches produce a self-contained executable that the Lambda custom runtime can invoke. + +### NativeAOT compatibility + +The SDK is AOT-friendly but does not require AOT. The default JSON serialization uses reflection (standard `System.Text.Json` behavior), which works in JIT mode. **AOT safety is determined entirely by which `ILambdaSerializer` the user registers with the Lambda runtime** — there is no separate AOT-only API surface in the SDK, and no per-call `JsonSerializerContext` argument anywhere on `WrapAsync` or `IDurableContext`. The same overloads work in JIT and AOT; the difference is whether `ILambdaContext.Serializer` resolves to `DefaultLambdaJsonSerializer` (reflection) or `SourceGeneratorLambdaJsonSerializer<TContext>` (AOT). + +The SDK itself avoids `Activator.CreateInstance`, `Type.GetType()`, and other reflection patterns, and uses `[DynamicallyAccessedMembers]` trimming annotations where needed. + +#### What the user registers in their `JsonSerializerContext` + +For AOT, the user's source-generated context must include: + +1. **Wire-envelope types** — `DurableExecutionInvocationInput` and `DurableExecutionInvocationOutput`. The handler signature is typed against these, so Lambda's runtime calls `serializer.Deserialize(...)` on each invoke and the source generator needs `JsonTypeInfo` for both. +2. **Workflow input / output POCOs** — every `TInput` / `TOutput` that appears in a `WrapAsync` call. +3. 
**Step result types** — every `T` that appears in `context.StepAsync<T>(...)`. The SDK serializes step results via the same `ILambdaSerializer`, so each result type needs source-gen registration too. + +```csharp +// Class library mode — register via the assembly attribute. +[assembly: LambdaSerializer(typeof(SourceGeneratorLambdaJsonSerializer<MyJsonContext>))] + +[JsonSerializable(typeof(DurableExecutionInvocationInput))] +[JsonSerializable(typeof(DurableExecutionInvocationOutput))] +[JsonSerializable(typeof(OrderEvent))] +[JsonSerializable(typeof(OrderResult))] +[JsonSerializable(typeof(Order))] // step result +public partial class MyJsonContext : JsonSerializerContext { } + +public class Function +{ + public Task<DurableExecutionInvocationOutput> FunctionHandler( + DurableExecutionInvocationInput invocationInput, ILambdaContext context) + => DurableFunction.WrapAsync(MyWorkflow, invocationInput, context); + + private async Task<OrderResult> MyWorkflow(OrderEvent input, IDurableContext context) + { + // Same StepAsync overload in JIT and AOT — the registered serializer decides. + var order = await context.StepAsync(async (step) => await GetOrder(), name: "get_order"); + // ... + } +} +``` + +For executable / custom-runtime deployments (no class library attribute), the same context is registered by passing the serializer to `LambdaBootstrapBuilder.Create(handler, serializer)` — see the [Manual Handler](#manual-handler-without-annotations) section. + +### Large payload and checkpoint overflow + +The durable execution service imposes size limits: + +- **256 KB** per individual operation checkpoint +- **6 MB** maximum Lambda response payload + +The SDK handles overflow transparently: + +**Step results exceeding 256 KB:** When a step's serialized result exceeds the checkpoint size limit, the SDK splits the checkpoint into a START operation (before execution) and a separate result checkpoint (after execution).
On replay, the SDK fetches the result via the paginated `GetDurableExecutionState` API rather than reading it inline from the operation record. + +**Batch results (map/parallel) exceeding limits:** For large map/parallel operations, the SDK generates a compact summary for the parent operation's checkpoint. The summary includes item count, success/failure counts, and completion reason — but not individual item results. During replay, the SDK sets `ReplayChildren = true` on the state request, which causes the service to return child operation records so full results can be reconstructed. + +**Lambda response exceeding 6 MB:** If the final orchestration result exceeds the response payload limit, the SDK checkpoints the result before returning the `DurableExecutionInvocationOutput`. The service reads the result from the checkpoint rather than from the response body. + +**Guidance for very large results:** For results that are inherently large (multi-MB payloads), do the offload yourself inside the step — write the payload to external storage (S3, DynamoDB) and return a reference (e.g. an S3 key) from the step. The reference is what the SDK serializes and checkpoints, so the checkpoint stays small and pagination is avoided. Subsequent steps fetch the payload from external storage on demand. 
+ +--- + +## Integration with Existing Libraries + +### Amazon.Lambda.Core + +The SDK uses existing Lambda core interfaces: +- `ILambdaContext` -- available via `context.LambdaContext` +- `ILambdaSerializer` -- used for event deserialization + +### Amazon.Lambda.RuntimeSupport + +The durable execution handler integrates with the existing runtime support bootstrap: + +```csharp +// The [DurableExecution] attribute signals that the handler +// receives DurableExecutionInvocationInput and returns DurableExecutionInvocationOutput +// The SDK handles the translation to/from the user's handler signature +``` + +### Amazon.Lambda.Annotations (optional) + +`Amazon.Lambda.Annotations` is an **optional** dependency. Users can write durable functions without it (see [Manual Handler](#manual-handler-without-annotations) above), but adding Annotations to the project reduces boilerplate significantly. + +When both packages are referenced, the Annotations source generator detects `[DurableExecution]` by fully-qualified name and at compile time: + +1. Generates a handler wrapper that translates `DurableExecutionInvocationInput` to/from your types +2. Manages context lifecycle (creation, checkpoint batching, cleanup) +3. Adds `DurableConfig` to the CloudFormation template +4. Adds the `AWSLambdaBasicDurableExecutionRolePolicy` managed policy + +```csharp +public class Functions +{ + [LambdaFunction] + [DurableExecution(ExecutionTimeout = 3600, RetentionPeriodInDays = 7)] + public async Task ProcessOrder( + [FromBody] OrderRequest request, + IDurableContext context) + { + var validated = await context.StepAsync( + async (step) => await Validate(request), + name: "validate"); + // ... 
+ } +} +``` + +#### Custom Lambda Client + +For VPC endpoints, custom retry policies, or testing with mocked clients, inject a custom `IAmazonLambda` client via the `[DurableExecution]` attribute: + +```csharp +public class Functions +{ + private readonly IAmazonLambda _lambdaClient; + + public Functions(IAmazonLambda lambdaClient) + { + _lambdaClient = lambdaClient; + } + + [LambdaFunction] + [DurableExecution(LambdaClientFactory = nameof(_lambdaClient))] + public async Task ProcessOrder( + [FromBody] OrderRequest request, + IDurableContext context) + { + // ... + } +} +``` + +When no `LambdaClientFactory` is specified, the generated code creates a default `AmazonLambdaClient`. For the manual handler path (`DurableFunction.WrapAsync`), pass the client directly via the `IAmazonLambda lambdaClient` overload. + +> **Dependency boundaries:** `Amazon.Lambda.Annotations` has **no dependency** on the AWS SDK or on `Amazon.Lambda.DurableExecution`. The Annotations source generator references durable execution types by fully-qualified name strings only — it never takes a compile-time dependency on the durable package. The `[DurableExecution]` attribute is defined in `Amazon.Lambda.DurableExecution`, and the generated code resolves against the user's project references. There is only one source generator (Annotations) — no coordination between multiple generators is needed. + +### AWSSDK.Lambda + +The `Amazon.Lambda.DurableExecution` package depends on the AWS SDK for .NET Lambda client to make checkpoint API calls. This dependency is confined to the durable execution package — `Amazon.Lambda.Annotations` does not depend on the AWS SDK. 
+ +The SDK calls the following `IAmazonLambda` client methods: +- `CheckpointDurableExecutionAsync` +- `GetDurableExecutionStateAsync` + +--- + +## Testing + +> **Implementations:** [JavaScript (local runner)](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-testing/src/test-runner/local/local-durable-test-runner.ts) | [JavaScript (cloud runner)](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-testing/src/test-runner/cloud/cloud-durable-test-runner.ts) + +We ship a separate, customer-facing NuGet package (`Amazon.Lambda.DurableExecution.Testing`) that lets developers test their durable functions locally without deploying to AWS. + +**Why this needs to exist:** A durable function requires multiple Lambda invocations to complete (invoke → PENDING → wait → re-invoke → SUCCEEDED). You can't test that with a normal unit test because there's no Lambda service orchestrating the re-invocations. The test runner simulates this loop in-process: it calls your handler, gets PENDING, marks waits as elapsed, calls your handler again with the prior checkpoint state, and repeats until the workflow completes. + +```csharp +var runner = new DurableTestRunner( + handler: new Function().Handler, + options: new TestRunnerOptions + { + SkipTime = true, // Waits complete instantly (no real delays) + MaxInvocations = 10 // Safety limit to prevent infinite loops + }); + +var result = await runner.RunAsync( + input: new OrderEvent { OrderId = "order-123" }, + timeout: TimeSpan.FromSeconds(30)); + +Assert.Equal(InvocationStatus.Succeeded, result.Status); +Assert.Equal("approved", result.Result.Status); + +// Inspect individual steps +var validateStep = result.GetStep("validate_order"); +Assert.True(validateStep.GetResult().IsValid); +``` + +The Python and JS SDKs both ship equivalent test runner packages.
+ +### Cloud Test Runner + +For integration testing against deployed functions, the testing package also ships a `CloudDurableTestRunner` with the same API as the local runner. This lets developers run the exact same assertions against a real Lambda function: + +```csharp +var runner = new CloudDurableTestRunner( + functionArn: "arn:aws:lambda:us-east-1:123456789012:function:process-order:$LATEST"); + +var result = await runner.RunAsync( + input: new OrderEvent { OrderId = "order-123" }, + timeout: TimeSpan.FromSeconds(60)); + +Assert.Equal(InvocationStatus.Succeeded, result.Status); +var validateStep = result.GetStep("validate_order"); +Assert.True(validateStep.GetResult().IsValid); +``` + +The cloud runner invokes the deployed function and polls `GetDurableExecutionState` until the execution reaches a terminal state, then reconstructs the same `TestResult` structure as the local runner. + +### Function Registration for Invoke Testing + +To test workflows that use `InvokeAsync` without deploying, register sibling functions with the local test runner: + +```csharp +var paymentHandler = new PaymentFunction().Handler; + +var runner = new DurableTestRunner( + handler: new OrderFunction().Handler, + options: new TestRunnerOptions { SkipTime = true }); + +runner.RegisterFunction("process-payment", paymentHandler); +runner.RegisterFunction( + "arn:aws:lambda:us-east-1:123:function:process-payment:$LATEST", + paymentHandler); + +var result = await runner.RunAsync(input: new OrderEvent { OrderId = "123" }); +``` + +When the workflow calls `context.InvokeAsync("process-payment", payload)`, the test runner routes to the registered handler instead of making an AWS API call. + +--- + +## Local development (Test Tool v2 and Aspire) + +The Lambda Test Tool v2 and the Aspire Lambda integration currently emulate single-invocation Lambda functions. Durable functions require a multi-invocation loop that neither tool supports today. 
To add support, the local emulator needs three things: + +### Checkpoint API endpoints + +The SDK calls these during execution. The emulator would serve them locally with in-memory storage: + +- `POST /checkpoint-durable-execution` -- store step results, wait records +- `GET /durable-execution-state` -- return accumulated state for replay + +### An orchestration loop + +When the function returns `PENDING`, the emulator needs to: +- Parse the checkpoint to determine what's pending (timer, callback, retry) +- Wait for that condition (or skip it in fast mode) +- Re-invoke the function with the accumulated `DurableExecutionInvocationInput` +- Repeat until `SUCCEEDED` or `FAILED` + +### Callback delivery + +An endpoint that external tools (or the developer via the UI) can call to deliver callback results: + +- `POST /send-durable-execution-callback-success` +- This triggers a re-invocation of the waiting execution + +### How this relates to the testing SDK + +The `DurableTestRunner` in the testing package implements the same orchestration loop programmatically. The test tool / Aspire enhancement would reuse this engine and wrap it in a web UI or Aspire dashboard, giving developers a visual way to see execution state, deliver callbacks manually, skip timers, and inspect checkpoint history. + +### Priority + +This is post-v1 work. For the initial release, developers test durable functions using the programmatic `DurableTestRunner` or by deploying to AWS. Test tool and Aspire support are a fast-follow once the core SDK is stable. + +--- + +## Requirements & Constraints + +- **Target framework:** `net8.0` only. .NET 6 is EOL and not supported. Durable functions are a new feature — adopters will be on the latest managed runtime. Targeting .NET 8 gives access to `required` properties, improved `System.Text.Json` source generation, and better NativeAOT support. 
+- **Lambda runtime:** Requires the managed .NET 8 runtime or a custom runtime (`provided.al2023`) for NativeAOT deployments. +- **Durable execution service:** The function must be configured with `DurableConfig` (handled automatically by the `[DurableExecution]` source generator). +- **Qualified function identifiers:** `InvokeAsync` requires a version number, alias, or `$LATEST` — unqualified ARNs are not supported for durable invocations. +- **Serializable results:** All step return types must be serializable by the `ILambdaSerializer` registered on `ILambdaContext.Serializer` (default: `System.Text.Json`). + +--- + +## Package Structure + +### Amazon.Lambda.DurableExecution (Runtime) + +The core SDK that runs in Lambda. Minimal dependencies. + +**Dependencies:** +- `Amazon.Lambda.Core` (existing) +- `AWSSDK.Lambda` (for checkpoint/state APIs) +- `Microsoft.Extensions.Logging.Abstractions` (for ILogger) + +### Amazon.Lambda.DurableExecution.Testing (Dev-only) + +Test runner and helpers for local/cloud testing. + +**Dependencies:** +- `Amazon.Lambda.DurableExecution` +- `Amazon.Lambda.TestUtilities` (existing) + +### Blueprints (`dotnet new` Templates) + +New `dotnet new` templates ship as part of the existing `Amazon.Lambda.Templates` NuGet package (same as all other Lambda blueprints in this repo under `Blueprints/BlueprintDefinitions/`). + +**Templates to ship:** + +| Template short name | Description | +|---------------------|-------------| +| `lambda.DurableFunction` | Minimal durable function with a single step and wait. Includes test project with `DurableTestRunner`. | +| `lambda.DurableFunction.Agentic` | GenAI agentic loop pattern (invoke model → check tool call → execute tool → repeat). | +| `lambda.DurableFunction.HumanInTheLoop` | Callback-based human approval workflow. 
| + +Each template includes: +- `.csproj` with correct NuGet references (`Amazon.Lambda.DurableExecution`, `Amazon.Lambda.Annotations`) +- Handler class with `[LambdaFunction]` + `[DurableExecution]` attributes +- `serverless.template` (auto-generated by source generator on build) +- Test project with `DurableTestRunner` and a passing test +- `aws-lambda-tools-defaults.json` for deployment via `dotnet lambda deploy-function` + +Running `dotnet new lambda.DurableFunction` should produce a buildable, testable, deployable project in under 30 seconds. + +--- + +## Implementation plan + +| Workstream | Scope | Estimate | +|------------|-------|----------| +| **Durable execution runtime** | Core SDK: replay engine, all context operations (step, wait, callback, invoke, parallel, map), checkpoint batching, retry, logging | ~5-6 weeks | +| **Annotations / source generator** | `[DurableExecution]` attribute, handler wrapper codegen, CloudFormation DurableConfig + IAM policy generation | ~2 weeks | +| **Testing SDK** | Local test runner (in-memory, time-skipping), cloud test runner, step inspection API | ~1.5 weeks | +| **Blueprints, docs, examples** | `dotnet new` project templates, developer guide, API reference, sample projects | ~2 weeks | +| **Roslyn analyzers** (P1 follow-up) | Static analysis detecting non-determinism, nesting violations, closure mutations | ~2 weeks | + +**Total: ~10-11 weeks (1 engineer familiar with the Python/JS SDKs)** + Roslyn analyzers as follow-up + +### Roslyn Analyzers (P1 Follow-up) + +> **Reference implementation:** JavaScript ESLint plugin — [no-non-deterministic-outside-step](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-eslint-plugin/src/rules/no-non-deterministic-outside-step/no-non-deterministic-outside-step.ts) | 
[no-nested-durable-operations](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-eslint-plugin/src/rules/no-nested-durable-operations/no-nested-durable-operations.ts) | [no-closure-in-durable-operations](https://github.com/aws/aws-durable-execution-sdk-js/blob/main/packages/aws-durable-execution-sdk-js-eslint-plugin/src/rules/no-closure-in-durable-operations/no-closure-in-durable-operations.ts) + +Ship as a separate NuGet package: `Amazon.Lambda.DurableExecution.Analyzers` + +The JavaScript SDK ships an ESLint plugin (`@aws/durable-execution-sdk-js-eslint-plugin`) with three rules that catch the most common durable execution mistakes at author time. The .NET equivalent uses Roslyn diagnostic analyzers: + +| Diagnostic ID | Severity | Rule | Rationale | +|---------------|----------|------|-----------| +| DE001 | Warning | `DateTime.Now`, `DateTime.UtcNow`, `Guid.NewGuid()`, `Random.Next()`, `Random.Shared`, `Environment.TickCount` used outside a `StepAsync` body | Non-deterministic values produce different results on replay, breaking checkpoint consistency | +| DE002 | Error | Calling `context.StepAsync`, `WaitAsync`, `ParallelAsync`, `MapAsync`, `InvokeAsync`, `RunInChildContextAsync`, `CreateCallbackAsync`, or `WaitForCallbackAsync` inside a `StepAsync` lambda | Steps are leaf operations — nesting durable operations inside a step produces unpredictable behavior | +| DE003 | Warning | Mutable variable captured by a `StepAsync` lambda and written to inside the lambda body | On replay the step returns cached result without executing, so the write never happens — the outer variable has stale state | +| DE004 | Info | `Task.WhenAll` or `Task.WhenAny` called with tasks returned by durable context methods | Suggest using `ParallelAsync` for completion policies, nesting control, and observability | + +These analyzers run at compile time in the IDE (IntelliSense squiggles) and during `dotnet build`, preventing the most 
confusing class of runtime failures. + +--- + +## Cross-SDK API comparison + +All four SDKs expose the same core operations. The differences are naming conventions, parameter ordering, and concurrency model. + +| Operation | .NET | Python | JavaScript | Java | +|-----------|------|--------|------------|------| +| Step | `context.StepAsync(func, name?, config?)` | `context.step(func, name?, config?)` | `context.step(name?, fn, config?)` → `DurablePromise` | `context.step(name, type, func, config?)` (blocking) / `context.stepAsync(...)` → `DurableFuture` | +| Wait | `context.WaitAsync(duration, name?)` | `context.wait(duration, name?)` | `context.wait(name?, duration)` → `DurablePromise` | +| Create callback | `context.CreateCallbackAsync(name?, config?)` | `context.create_callback(name?, config?)` | `context.createCallback(name?, config?)` | +| Wait for callback | `context.WaitForCallbackAsync(submitter, name?, config?)` | `context.wait_for_callback(submitter, name?, config?)` | `context.waitForCallback(name?, submitter, config?)` | +| Invoke | `context.InvokeAsync(funcName, payload, name?, config?)` | `context.invoke(func_name, payload, name?, config?)` | `context.invoke(name?, funcId, input, config?)` → `DurablePromise` | +| Parallel | `context.ParallelAsync(functions, name?, config?)` | `context.parallel(functions, name?, config?)` | `context.parallel(name?, branches, config?)` | +| Map | `context.MapAsync(items, func, name?, config?)` | `context.map(inputs, func, name?, config?)` | `context.map(name?, items, mapFunc, config?)` | +| Child context | `context.RunInChildContextAsync(func, name?, config?)` | `context.run_in_child_context(func, name?, config?)` | `context.runInChildContext(name?, fn, config?)` | +| Wait for condition | `context.WaitForConditionAsync(check, config, name?)` | `context.wait_for_condition(check, config, name?)` | `context.waitForCondition(name?, checkFunc, config?)` | +| Logger | `context.Logger` (ILogger) | `context.logger` (Logger) | 
`context.logger` (DurableContextLogger) | +| Lambda context | `context.LambdaContext` | `context.lambda_context` | `context.lambdaContext` | +| Execution context | `context.ExecutionContext` | `context.execution_context` | *(via logger metadata)* | +| Promise combinators | `CompletionConfig` on `ParallelAsync` | `CompletionConfig` on `parallel`/`map` | `context.promise.all/allSettled/any/race` | +| Configure logger | `context.ConfigureLogger(config)` | `context.set_logger(logger)` | `context.configureLogger(config)` | +| Cancellation | `CancellationToken` on all methods | *(N/A)* | *(N/A)* | +| Jitter strategy | `JitterStrategy` enum on `Exponential()` | `jitter_strategy` on `RetryStrategyConfig` | `jitter` on `createRetryStrategy()` | +| Retry presets | `RetryStrategy.None/Default/Transient` | `RetryPresets.none()/default()/transient()` | `retryPresets.default/linear/noRetry` | +| Nesting type | `NestingType` on `ParallelConfig`/`MapConfig` | `NestingType` on parallel/map config | `NestingType` on parallel/map config | +| Item batching | `ItemBatcher` on `MapConfig` | `ItemBatcher` on `MapConfig` | *(checkpoint manager handles batching)* | +| Item namer | `ItemNamer` on `MapConfig` | Item naming function on `MapConfig` | `itemNamer` on `MapConfig` | +| Error mapping | `ErrorMapping` on `ChildContextConfig` | *(typed exception wrapping)* | `errorMapping` on child context config | +| Message-based retry filter | `retryableMessagePatterns` (regex) | `retryable_errors` (regex) | `retryableErrors` (RegExp[]) | +| Step context / scoped logger | `IStepContext` with `Logger`, `AttemptNumber` | `StepContext` with `logger` | `ctx` with `logger` in step callback | +| Named parallel branches | `DurableBranch(name, func)` | Function `__name__` | `{ name, func }` objects | +| Inline retry lambda | `Func<Exception, int, RetryDecision>` | `Callable[[Exception, int], RetryDecision]` | `(error, attempt) => RetryDecision` | +| Static analysis | Roslyn analyzers (P1 follow-up) | *(N/A)* | ESLint plugin (3 rules) |
+| Cloud test runner | `CloudDurableTestRunner` | `pytest --runner-mode=cloud` | `CloudDurableTestRunner` | + +**Key differences:** + +- **Concurrency model:** JS returns `DurablePromise` (lazy, deferred until awaited). Python is synchronous (blocks the thread). Java exposes both `step` (blocking) and `stepAsync` (returns `DurableFuture`). .NET returns `Task` (standard async/await). Note: `Task.WhenAll` works with durable operations but `ParallelAsync`/`MapAsync` are preferred for completion policies and observability. +- **Why .NET ships only the async form:** Java's two-API split exists because Java has no language-level `await` — `step` is the simple blocking ergonomic, `stepAsync` is the composable form. In .NET, `Task` is *already* both: `await context.StepAsync(...)` reads as sequential code, and `Task.WhenAll(...)` composes concurrently. A `Step` (blocking, returns `T`) overload would do nothing except call `.GetAwaiter().GetResult()` on the async version, which is also a Lambda-thread anti-pattern (deadlock-prone, blocks a thread the runtime needs). So .NET intentionally has one shape — `*Async` — matching the rest of `IAmazonLambda` and the broader .NET async convention. Python is single-shape for the same reason in reverse: no async runtime in scope, so blocking is the only ergonomic shape. +- **Step function signature:** Python and JS only expose `Func` — the user always receives a step context. Java has both `Function` and `Supplier` overloads, but the `Supplier` ones are deprecated (*"use the variants accepting StepContext instead"*). .NET follows Python/JS: `IStepContext` is always passed. +- **Name parameter position:** JS puts `name` first; Python, Java, and .NET put it after the function/duration. +- **Parallel semantics in JS:** JS uses `context.promise.all/any/race/allSettled` to combine DurablePromises. .NET, Python, and Java use `CompletionConfig` on the `Parallel`/`Map` operations instead. 
+- **.NET-only:** `CancellationToken` on every method (standard .NET pattern). +- **Jitter default:** All four SDKs default to full jitter on retry strategies. + +--- + +## Common Patterns + +### GenAI Agentic Loop + +```csharp +[DurableExecution] +public async Task<string> AgentHandler(AgentRequest input, IDurableContext context) +{ + var messages = new List<Message> + { + new Message { Role = "user", Content = input.Prompt } + }; + + while (true) + { + var response = await context.StepAsync( + async (step) => await InvokeModel(messages), + name: "invoke_model"); + + if (response.ToolCall == null) + return response.Content; + + var toolResult = await context.StepAsync( + async (step) => await ExecuteTool(response.ToolCall), + name: $"tool_{response.ToolCall.Name}"); + + messages.Add(new Message { Role = "assistant", Content = toolResult }); + } +} +``` + +### Human-in-the-Loop + +```csharp +[DurableExecution] +public async Task<ReviewResult> ReviewHandler(ReviewRequest input, IDurableContext context) +{ + var analysis = await context.StepAsync( + async (step) => await AnalyzeDocument(input.DocumentUrl), + name: "analyze_document"); + + context.Logger.LogInformation("Analysis complete, requesting human review"); + + var review = await context.WaitForCallbackAsync( + async (callbackId, ctx) => + { + await NotifyReviewer(input.ReviewerEmail, callbackId, analysis); + }, + name: "human_review", + config: new WaitForCallbackConfig + { + Timeout = TimeSpan.FromDays(7), + HeartbeatTimeout = TimeSpan.FromHours(24) + }); + + if (review.Approved) + { + await context.StepAsync( + async (step) => await PublishDocument(input.DocumentUrl), + name: "publish"); + } + + return new ReviewResult { Status = review.Approved ?
"published" : "rejected" }; +} +``` + +### Scheduled Pipeline with Retries + +```csharp +[DurableExecution] +public async Task DataPipeline(PipelineInput input, IDurableContext context) +{ + // Extract + var rawData = await context.StepAsync( + async (step) => await ExtractFromSource(input.SourceId), + name: "extract", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 5, initialDelay: TimeSpan.FromSeconds(2)) + }); + + // Transform (fan-out) + var transformed = await context.MapAsync( + items: rawData.Chunks, + func: async (ctx, chunk, index, _) => + { + return await ctx.StepAsync( + async (step) => await TransformChunk(chunk), + name: $"transform_{index}"); + }, + name: "transform_all", + config: new MapConfig { MaxConcurrency = 10 }); + + transformed.ThrowIfError(); + + // Load + var loadResult = await context.StepAsync( + async (step) => await LoadToDestination(transformed.GetResults()), + name: "load", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry + }); + + // Wait before next run + await context.WaitAsync(TimeSpan.FromHours(1), name: "schedule_delay"); + + return new PipelineResult { RecordsProcessed = loadResult.Count }; +} +``` + +--- + +## References + +- [AWS Blog: Build multi-step applications and AI workflows with AWS Lambda durable functions](https://aws.amazon.com/blogs/aws/build-multi-step-applications-and-ai-workflows-with-aws-lambda-durable-functions/) +- [AWS Documentation: Lambda Durable Functions](https://docs.aws.amazon.com/lambda/latest/dg/durable-functions.html) +- [Python SDK Repository](https://github.com/aws/aws-durable-execution-sdk-python) +- [JavaScript/TypeScript SDK Repository](https://github.com/aws/aws-durable-execution-sdk-js) +- [GitHub Issue #2216: .NET Durable Functions Support](https://github.com/aws/aws-lambda-dotnet/issues/2216) +- [Existing .NET Annotations Design Doc](lambda-annotations-design.md) diff --git a/Libraries/Libraries.sln b/Libraries/Libraries.sln 
index e42c40045..65b4cd9e0 100644 --- a/Libraries/Libraries.sln +++ b/Libraries/Libraries.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 18 -VisualStudioVersion = 18.5.11709.299 stable +VisualStudioVersion = 18.5.11709.299 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{AAB54E74-20B1-42ED-BC3D-CE9F7BC7FD12}" EndProject @@ -155,6 +155,14 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ResponseStreamingFunctionHa EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AspNetCoreStreamingApiGatewayTest", "test\Amazon.Lambda.RuntimeSupport.Tests\AspNetCoreStreamingApiGatewayTest\AspNetCoreStreamingApiGatewayTest.csproj", "{0768FA72-CF49-2B59-BC4C-E4CE579E5D93}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution", "src\Amazon.Lambda.DurableExecution\Amazon.Lambda.DurableExecution.csproj", "{9097B5A4-E100-47FD-A676-0B666A36FAFF}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution.Tests", "test\Amazon.Lambda.DurableExecution.Tests\Amazon.Lambda.DurableExecution.Tests.csproj", "{57150BA6-3826-431F-8F58-B1D11FAFC5D4}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution.IntegrationTests", "test\Amazon.Lambda.DurableExecution.IntegrationTests\Amazon.Lambda.DurableExecution.IntegrationTests.csproj", "{CA132CAB-FF4F-4312-B3A3-66DE9D360F27}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Amazon.Lambda.DurableExecution.AotPublishTest", "test\Amazon.Lambda.DurableExecution.AotPublishTest\Amazon.Lambda.DurableExecution.AotPublishTest.csproj", "{16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -969,6 +977,54 @@ Global {0768FA72-CF49-2B59-BC4C-E4CE579E5D93}.Release|x64.Build.0 = Release|Any CPU 
{0768FA72-CF49-2B59-BC4C-E4CE579E5D93}.Release|x86.ActiveCfg = Release|Any CPU {0768FA72-CF49-2B59-BC4C-E4CE579E5D93}.Release|x86.Build.0 = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x64.ActiveCfg = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x64.Build.0 = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x86.ActiveCfg = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Debug|x86.Build.0 = Debug|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|Any CPU.Build.0 = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x64.ActiveCfg = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x64.Build.0 = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x86.ActiveCfg = Release|Any CPU + {9097B5A4-E100-47FD-A676-0B666A36FAFF}.Release|x86.Build.0 = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x64.ActiveCfg = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x64.Build.0 = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x86.ActiveCfg = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Debug|x86.Build.0 = Debug|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|Any CPU.Build.0 = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x64.ActiveCfg = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x64.Build.0 = Release|Any CPU + {57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x86.ActiveCfg = Release|Any CPU + 
{57150BA6-3826-431F-8F58-B1D11FAFC5D4}.Release|x86.Build.0 = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x64.ActiveCfg = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x64.Build.0 = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x86.ActiveCfg = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Debug|x86.Build.0 = Debug|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|Any CPU.Build.0 = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x64.ActiveCfg = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x64.Build.0 = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x86.ActiveCfg = Release|Any CPU + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27}.Release|x86.Build.0 = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|Any CPU.Build.0 = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x64.ActiveCfg = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x64.Build.0 = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x86.ActiveCfg = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Debug|x86.Build.0 = Debug|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|Any CPU.ActiveCfg = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|Any CPU.Build.0 = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x64.ActiveCfg = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x64.Build.0 = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x86.ActiveCfg = Release|Any CPU + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection 
GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1045,6 +1101,10 @@ Global {80594C21-C6EB-469E-83CC-68F9F661CA5E} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} {E404A7AC-812B-BC03-CA76-02C0BC2BA7F9} = {B5BD0336-7D08-492C-8489-42C987E29B39} {0768FA72-CF49-2B59-BC4C-E4CE579E5D93} = {B5BD0336-7D08-492C-8489-42C987E29B39} + {9097B5A4-E100-47FD-A676-0B666A36FAFF} = {AAB54E74-20B1-42ED-BC3D-CE9F7BC7FD12} + {57150BA6-3826-431F-8F58-B1D11FAFC5D4} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} + {CA132CAB-FF4F-4312-B3A3-66DE9D360F27} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} + {16B1B1CC-3AFC-4DC7-8DB6-D14AE12924A2} = {1DE4EE60-45BA-4EF7-BE00-B9EB861E4C69} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {503678A4-B8D1-4486-8915-405A3E9CF0EB} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj b/Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj new file mode 100644 index 000000000..ae173e365 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Amazon.Lambda.DurableExecution.csproj @@ -0,0 +1,40 @@ + + + + + + $(DefaultPackageTargets) + Amazon Lambda .NET SDK for Durable Execution - write multi-step workflows that persist state automatically. 
/// <summary>
/// Options for a callback operation created through
/// <c>IDurableContext.CreateCallbackAsync</c>.
/// </summary>
/// <remarks>
/// Both timeouts default to <see cref="TimeSpan.Zero"/>, which means "not set".
/// Each setter validates the incoming value before storing it, so an instance
/// never holds a negative or a sub-second positive timeout.
/// </remarks>
public class CallbackConfig
{
    // Smallest positive timeout the setters accept.
    private static readonly TimeSpan OneSecond = TimeSpan.FromSeconds(1);

    private TimeSpan _timeout = TimeSpan.Zero;
    private TimeSpan _heartbeatTimeout = TimeSpan.Zero;

    /// <summary>
    /// Maximum total time the service will wait for the external system to
    /// complete the callback. <see cref="TimeSpan.Zero"/> (default) means no
    /// overall timeout — only <see cref="HeartbeatTimeout"/> applies (if set).
    /// </summary>
    /// <remarks>
    /// The service's timer granularity is 1 second, so values strictly between
    /// <see cref="TimeSpan.Zero"/> and 1 second are rejected to avoid silent
    /// rounding. Use <see cref="TimeSpan.Zero"/> to disable the timeout, or a
    /// value of at least 1 second.
    /// </remarks>
    /// <exception cref="ArgumentOutOfRangeException">
    /// Thrown when set to a negative value, or to a positive value less than
    /// 1 second. The previously stored value is left untouched.
    /// </exception>
    public TimeSpan Timeout
    {
        get => _timeout;
        set => _timeout = ValidatedTimeout(value, nameof(Timeout));
    }

    /// <summary>
    /// Maximum gap between heartbeat signals from the external system before
    /// the service marks the callback as timed-out.
    /// <see cref="TimeSpan.Zero"/> (default) means no heartbeat timeout.
    /// </summary>
    /// <remarks>
    /// The service's timer granularity is 1 second, so values strictly between
    /// <see cref="TimeSpan.Zero"/> and 1 second are rejected to avoid silent
    /// rounding. Use <see cref="TimeSpan.Zero"/> to disable the heartbeat
    /// timeout, or a value of at least 1 second.
    /// </remarks>
    /// <exception cref="ArgumentOutOfRangeException">
    /// Thrown when set to a negative value, or to a positive value less than
    /// 1 second. The previously stored value is left untouched.
    /// </exception>
    public TimeSpan HeartbeatTimeout
    {
        get => _heartbeatTimeout;
        set => _heartbeatTimeout = ValidatedTimeout(value, nameof(HeartbeatTimeout));
    }

    /// <summary>
    /// Returns <paramref name="candidate"/> unchanged when it is an acceptable
    /// timeout, otherwise throws.
    /// </summary>
    /// <param name="candidate">The value being assigned.</param>
    /// <param name="propertyName">
    /// Name of the property being set; used as the exception's parameter name
    /// and as the prefix of its message.
    /// </param>
    private static TimeSpan ValidatedTimeout(TimeSpan candidate, string propertyName)
    {
        // Zero is the "not set" sentinel and is always accepted.
        if (candidate == TimeSpan.Zero)
        {
            return candidate;
        }

        if (candidate < TimeSpan.Zero)
        {
            throw new ArgumentOutOfRangeException(
                propertyName, candidate, $"{propertyName} must be non-negative.");
        }

        // Positive from here on: enforce the same 1-second floor that
        // DurableContext.WaitAsync applies to wait durations.
        if (candidate < OneSecond)
        {
            throw new ArgumentOutOfRangeException(
                propertyName, candidate,
                $"{propertyName} must be at least 1 second (or TimeSpan.Zero to disable).");
        }

        return candidate;
    }
}
/// <summary>
/// Common base for callback failures surfaced from a callback handle's
/// <c>GetResultAsync</c> or from <c>IDurableContext.WaitForCallbackAsync</c>.
/// Concrete subclasses distinguish failure modes — pattern-match
/// <see cref="CallbackFailedException"/>, <see cref="CallbackTimeoutException"/>,
/// or <see cref="CallbackSubmitterException"/> in catch clauses.
/// </summary>
public class CallbackException : DurableExecutionException
{
    /// <summary>Creates an empty <see cref="CallbackException"/>.</summary>
    public CallbackException() { }

    /// <summary>Creates a <see cref="CallbackException"/> with the given message.</summary>
    public CallbackException(string message) : base(message) { }

    /// <summary>Creates a <see cref="CallbackException"/> wrapping an inner exception.</summary>
    public CallbackException(string message, Exception innerException) : base(message, innerException) { }

    /// <summary>The callback ID associated with the failure (if known).</summary>
    public string? CallbackId { get; init; }

    /// <summary>The fully-qualified type name of the original error, if known.</summary>
    public string? ErrorType { get; init; }

    /// <summary>Optional structured error data attached by the external system.</summary>
    public string? ErrorData { get; init; }

    /// <summary>Stack trace of the original error, captured before serialization.</summary>
    // NOTE(review): the element type was lost in extraction; <string> is assumed
    // (stack frames as text) — confirm against the checked-in file.
    public IReadOnlyList<string>? OriginalStackTrace { get; init; }
}

/// <summary>
/// Thrown when the external system reports a failure result for a callback
/// (via <c>SendDurableExecutionCallbackFailure</c>).
/// </summary>
public class CallbackFailedException : CallbackException
{
    /// <summary>Creates an empty <see cref="CallbackFailedException"/>.</summary>
    public CallbackFailedException() { }

    /// <summary>Creates a <see cref="CallbackFailedException"/> with the given message.</summary>
    public CallbackFailedException(string message) : base(message) { }

    /// <summary>Creates a <see cref="CallbackFailedException"/> wrapping an inner exception.</summary>
    public CallbackFailedException(string message, Exception innerException) : base(message, innerException) { }
}

/// <summary>
/// Thrown when the durable execution service marks a callback as timed-out —
/// either the overall <see cref="CallbackConfig.Timeout"/> or the
/// <see cref="CallbackConfig.HeartbeatTimeout"/> elapsed.
/// </summary>
public class CallbackTimeoutException : CallbackException
{
    /// <summary>Creates an empty <see cref="CallbackTimeoutException"/>.</summary>
    public CallbackTimeoutException() { }

    /// <summary>Creates a <see cref="CallbackTimeoutException"/> with the given message.</summary>
    public CallbackTimeoutException(string message) : base(message) { }

    /// <summary>Creates a <see cref="CallbackTimeoutException"/> wrapping an inner exception.</summary>
    public CallbackTimeoutException(string message, Exception innerException) : base(message, innerException) { }
}

/// <summary>
/// Thrown only from <c>IDurableContext.WaitForCallbackAsync</c> when the
/// user-supplied submitter delegate (the step that hands the callback ID to
/// the external system) fails after retries are exhausted. The underlying
/// failure is available as <see cref="Exception.InnerException"/>.
/// </summary>
public class CallbackSubmitterException : CallbackException
{
    /// <summary>Creates an empty <see cref="CallbackSubmitterException"/>.</summary>
    public CallbackSubmitterException() { }

    /// <summary>Creates a <see cref="CallbackSubmitterException"/> with the given message.</summary>
    public CallbackSubmitterException(string message) : base(message) { }

    /// <summary>Creates a <see cref="CallbackSubmitterException"/> wrapping an inner exception.</summary>
    public CallbackSubmitterException(string message, Exception innerException) : base(message, innerException) { }
}
+ /// + /// + /// Applied when the user function throws (the mapped exception propagates + /// to the caller of RunInChildContextAsync) and on replay of a + /// FAILED child context (the constructed + /// is mapped before being thrown). + /// + public Func? ErrorMapping { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs new file mode 100644 index 000000000..7072f34a2 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs @@ -0,0 +1,415 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution.Internal; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Implementation of . Constructs and dispatches +/// per-operation classes (, ); +/// the replay logic lives in those classes. +/// +internal sealed class DurableContext : IDurableContext +{ + private readonly ExecutionState _state; + private readonly TerminationManager _terminationManager; + private readonly OperationIdGenerator _idGenerator; + private readonly string _durableExecutionArn; + private readonly CheckpointBatcher? _batcher; + + public DurableContext( + ExecutionState state, + TerminationManager terminationManager, + OperationIdGenerator idGenerator, + string durableExecutionArn, + ILambdaContext lambdaContext, + CheckpointBatcher? batcher = null) + { + _state = state; + _terminationManager = terminationManager; + _idGenerator = idGenerator; + _durableExecutionArn = durableExecutionArn; + _batcher = batcher; + LambdaContext = lambdaContext; + } + + // Replay-safe logger ships in a follow-up PR; see IDurableContext.Logger doc. 
+ public ILogger Logger => NullLogger.Instance; + public IExecutionContext ExecutionContext => new DurableExecutionContext(_durableExecutionArn); + public ILambdaContext LambdaContext { get; } + + public Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default) + => RunStep(func, name, config, cancellationToken); + + public async Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default) + { + // Void steps don't carry a meaningful payload — wrap with an object?-typed + // step that always returns null. The serializer isn't actually invoked + // with a non-null value, so any registered ILambdaSerializer suffices. + await RunStep( + async (ctx) => { await func(ctx); return null; }, + name, config, cancellationToken); + } + + private Task RunStep( + Func> func, + string? name, + StepConfig? config, + CancellationToken cancellationToken) + { + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new StepOperation( + operationId, name, _idGenerator.ParentId, func, config, serializer, Logger, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default) + { + // Service timer granularity is 1 second; sub-second waits would round to 0. + // WaitOptions.WaitSeconds is integer in [1, 31_622_400] (1 second to ~1 year). 
+ if (duration < TimeSpan.FromSeconds(1)) + throw new ArgumentOutOfRangeException(nameof(duration), duration, "Wait duration must be at least 1 second."); + + if (duration > TimeSpan.FromSeconds(31_622_400)) + throw new ArgumentOutOfRangeException(nameof(duration), duration, "Wait duration must be at most 31,622,400 seconds (~1 year)."); + + cancellationToken.ThrowIfCancellationRequested(); + + var operationId = _idGenerator.NextId(); + var waitSeconds = (int)Math.Max(1, Math.Ceiling(duration.TotalSeconds)); + var op = new WaitOperation( + operationId, name, _idGenerator.ParentId, waitSeconds, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default) + => RunChildContext(func, name, config, cancellationToken); + + public async Task RunInChildContextAsync( + Func func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default) + { + // Void child contexts don't carry a meaningful payload; the wrapper + // returns null so the registered ILambdaSerializer is never asked to + // serialize a real value. + await RunChildContext( + async (ctx) => { await func(ctx); return null; }, + name, config, cancellationToken); + } + + private Task RunChildContext( + Func> func, + string? name, + ChildContextConfig? config, + CancellationToken cancellationToken) + { + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + + // Capture this DurableContext's collaborators; the child shares state, + // termination, batcher, ARN, and Lambda context — but uses a child + // OperationIdGenerator so its operation IDs are deterministically + // namespaced under the parent op ID. 
+ IDurableContext ChildFactory(string parentOpId) => new DurableContext( + _state, _terminationManager, _idGenerator.CreateChild(parentOpId), + _durableExecutionArn, LambdaContext, _batcher); + + var op = new ChildContextOperation( + operationId, name, _idGenerator.ParentId, func, config, serializer, ChildFactory, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task> CreateCallbackAsync( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default) + => RunCallback(name, config, cancellationToken); + + private Task> RunCallback( + string? name, + CallbackConfig? config, + CancellationToken cancellationToken) + { + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new CallbackOperation( + operationId, name, _idGenerator.ParentId, config, serializer, + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default) + => RunWaitForCallback(submitter, name, config, cancellationToken); + + /// + /// Composes WaitForCallback over RunInChildContextAsync + CreateCallbackAsync + /// + StepAsync(submitter) + callback.GetResultAsync. + /// + /// + /// Sub-operation naming follows kebab-style: "{name}-callback" and + /// "{name}-submitter". When the parent is null, + /// the inner ops are also nameless (no leading hyphen). + /// + /// remaps a submitter + /// to . + /// Callback errors () pass through unchanged. + /// + /// + private Task RunWaitForCallback( + Func submitter, + string? name, + WaitForCallbackConfig? config, + CancellationToken cancellationToken) + { + var callbackName = name == null ? null : $"{name}-callback"; + var submitterName = name == null ? 
null : $"{name}-submitter"; + + var callbackConfig = config == null ? null : new CallbackConfig + { + Timeout = config.Timeout, + HeartbeatTimeout = config.HeartbeatTimeout, + }; + + var stepConfig = config?.RetryStrategy == null + ? null + : new StepConfig { RetryStrategy = config.RetryStrategy }; + + // Delegate to RunInChildContextAsync; the inner CreateCallbackAsync and + // StepAsync calls each pull the registered ILambdaSerializer from + // ILambdaContext.Serializer, so AOT and reflection-based scenarios share + // the same code path. + return RunInChildContextAsync( + async childCtx => + { + var callback = await childCtx.CreateCallbackAsync( + name: callbackName, + config: callbackConfig, + cancellationToken: cancellationToken); + + await childCtx.StepAsync( + async (stepCtx) => + { + var submitterCtx = new WaitForCallbackContext(stepCtx.Logger); + await submitter(callback.CallbackId, submitterCtx); + }, + name: submitterName, + config: stepConfig, + cancellationToken: cancellationToken); + + return await callback.GetResultAsync(cancellationToken); + }, + name, + new ChildContextConfig + { + SubType = OperationSubTypes.WaitForCallback, + ErrorMapping = MapWaitForCallbackException, + }, + cancellationToken); + } + + private static Exception MapWaitForCallbackException(Exception ex) + { + // Callback errors are already user-meaningful (CallbackFailed/Timeout + // from inside the callback await). Pass through. + if (ex is CallbackException) return ex; + + // The ChildContextOperation wraps thrown exceptions in + // ChildContextException; unwrap to surface the underlying cause. + if (ex is ChildContextException childEx) + { + // CallbackException thrown from GetResultAsync (callback completed + // with FAILED/TIMED_OUT) — surface directly. + // + // Fresh-execution path: InnerException is the live exception object. + // Replay path: InnerException is null but ErrorType carries the string. 
+ if (childEx.InnerException is CallbackException nestedLive) + return nestedLive; + if (IsCallbackErrorTypeString(childEx.ErrorType)) + { + // Replay-side reconstruction: preserve subclass fidelity by + // dispatching on the stored ErrorType FullName so a stored + // CallbackTimeoutException remaps to CallbackTimeoutException + // (not the more generic CallbackFailedException). + return BuildCallbackExceptionForReplay(childEx); + } + + // Submitter step exhausted retries → wrap as CallbackSubmitterException. + // Fresh path: InnerException is the live StepException. + if (childEx.InnerException is StepException stepLive) + { + return new CallbackSubmitterException(stepLive.Message, stepLive) + { + ErrorType = stepLive.ErrorType, + ErrorData = stepLive.ErrorData, + OriginalStackTrace = stepLive.OriginalStackTrace, + }; + } + // Replay path: InnerException is null; ErrorType is the type string. + if (childEx.ErrorType == typeof(StepException).FullName) + { + return new CallbackSubmitterException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + } + + // Anything else — surface unchanged so the user sees the original cause. + return ex; + } + + private static CallbackException BuildCallbackExceptionForReplay(ChildContextException childEx) + { + // Dispatch on the stored ErrorType FullName to preserve the original + // subclass across replays. Caller has already verified + // IsCallbackErrorTypeString(childEx.ErrorType) is true. 
+ if (childEx.ErrorType == typeof(CallbackTimeoutException).FullName) + { + return new CallbackTimeoutException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + if (childEx.ErrorType == typeof(CallbackSubmitterException).FullName) + { + return new CallbackSubmitterException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + if (childEx.ErrorType == typeof(CallbackException).FullName) + { + return new CallbackException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + // CallbackFailedException.FullName (or any future callback subtype not + // listed above) defaults to CallbackFailedException — the most general + // "callback failed" surface that preserves user-catchable behavior. + return new CallbackFailedException(childEx.Message, childEx) + { + ErrorType = childEx.ErrorType, + ErrorData = childEx.ErrorData, + OriginalStackTrace = childEx.OriginalStackTrace, + }; + } + + private static bool IsCallbackErrorTypeString(string? errorType) => + errorType == typeof(CallbackFailedException).FullName + || errorType == typeof(CallbackTimeoutException).FullName + || errorType == typeof(CallbackSubmitterException).FullName + || errorType == typeof(CallbackException).FullName; + + public Task InvokeAsync( + string functionName, + TPayload payload, + string? name = null, + InvokeConfig? config = null, + CancellationToken cancellationToken = default) + => RunInvoke( + functionName, payload, + name, config, cancellationToken); + + private Task RunInvoke( + string functionName, + TPayload payload, + string? name, + InvokeConfig? 
/// <summary>
/// Context object handed to the submitter delegate of a wait-for-callback
/// operation; exposes only a logger.
/// </summary>
internal sealed class WaitForCallbackContext : IWaitForCallbackContext
{
    /// <summary>Creates the context around the supplied logger.</summary>
    public WaitForCallbackContext(ILogger logger) => Logger = logger;

    /// <summary>The logger supplied at construction.</summary>
    public ILogger Logger { get; }
}

/// <summary>
/// Immutable <see cref="IExecutionContext"/> carrying the durable execution ARN.
/// </summary>
internal sealed class DurableExecutionContext : IExecutionContext
{
    /// <summary>Creates the context for the given execution ARN.</summary>
    public DurableExecutionContext(string durableExecutionArn) => DurableExecutionArn = durableExecutionArn;

    /// <summary>The ARN supplied at construction.</summary>
    public string DurableExecutionArn { get; }
}

/// <summary>
/// Immutable <see cref="IStepContext"/> carrying an operation ID, an attempt
/// number and a logger.
/// </summary>
internal sealed class StepContext : IStepContext
{
    /// <summary>Creates a step context from its three values.</summary>
    public StepContext(string operationId, int attemptNumber, ILogger logger)
    {
        Logger = logger;
        AttemptNumber = attemptNumber;
        OperationId = operationId;
    }

    /// <summary>The operation ID supplied at construction.</summary>
    public string OperationId { get; }

    /// <summary>The attempt number supplied at construction.</summary>
    public int AttemptNumber { get; }

    /// <summary>The logger supplied at construction.</summary>
    public ILogger Logger { get; }
}
/// <summary>
/// Root of the durable execution exception hierarchy.
/// </summary>
public class DurableExecutionException : Exception
{
    /// <summary>Creates an empty <see cref="DurableExecutionException"/>.</summary>
    public DurableExecutionException() { }

    /// <summary>Creates a <see cref="DurableExecutionException"/> with the given message.</summary>
    public DurableExecutionException(string message) : base(message) { }

    /// <summary>Creates a <see cref="DurableExecutionException"/> wrapping an inner exception.</summary>
    public DurableExecutionException(string message, Exception innerException) : base(message, innerException) { }
}

/// <summary>
/// Thrown when code has changed between invocations, causing a replay mismatch.
/// For example, a step at index 0 was previously a WAIT but is now a STEP.
/// </summary>
public class NonDeterministicExecutionException : DurableExecutionException
{
    /// <summary>Creates an empty <see cref="NonDeterministicExecutionException"/>.</summary>
    public NonDeterministicExecutionException() { }

    /// <summary>Creates a <see cref="NonDeterministicExecutionException"/> with the given message.</summary>
    public NonDeterministicExecutionException(string message) : base(message) { }

    /// <summary>Creates a <see cref="NonDeterministicExecutionException"/> wrapping an inner exception.</summary>
    public NonDeterministicExecutionException(string message, Exception innerException) : base(message, innerException) { }
}

/// <summary>
/// Thrown when user code inside a step fails (after retries exhausted).
/// Contains the original error details from the checkpoint.
/// </summary>
public class StepException : DurableExecutionException
{
    /// <summary>Creates an empty <see cref="StepException"/>.</summary>
    public StepException() { }

    /// <summary>Creates a <see cref="StepException"/> with the given message.</summary>
    public StepException(string message) : base(message) { }

    /// <summary>Creates a <see cref="StepException"/> wrapping an inner exception.</summary>
    public StepException(string message, Exception innerException) : base(message, innerException) { }

    /// <summary>The fully-qualified type name of the original exception.</summary>
    public string? ErrorType { get; init; }

    /// <summary>Optional structured error data attached by the user.</summary>
    public string? ErrorData { get; init; }

    /// <summary>Stack trace of the original exception, captured before serialization.</summary>
    // NOTE(review): element type was lost in extraction; <string> is assumed — confirm.
    public IReadOnlyList<string>? OriginalStackTrace { get; init; }
}

// NOTE(review): the original summary named a specific step-semantics option
// ("a step under ..."); that cross-reference was lost in extraction — restore it.
/// <summary>
/// Thrown when a step is detected to have been interrupted mid-execution on a
/// prior invocation (replay sees a STARTED checkpoint with no terminal record).
/// </summary>
/// <remarks>
/// Surfaces in the step's retry strategy so user-supplied strategies can
/// distinguish "my code threw" from "a previous attempt crashed before it
/// could record a result".
/// </remarks>
public class StepInterruptedException : StepException
{
    /// <summary>Creates an empty <see cref="StepInterruptedException"/>.</summary>
    public StepInterruptedException() { }

    /// <summary>Creates a <see cref="StepInterruptedException"/> with the given message.</summary>
    public StepInterruptedException(string message) : base(message) { }

    /// <summary>Creates a <see cref="StepInterruptedException"/> wrapping an inner exception.</summary>
    public StepInterruptedException(string message, Exception innerException) : base(message, innerException) { }
}

/// <summary>
/// Thrown when a child context's user function fails. Surfaces from
/// <c>RunInChildContextAsync</c>; the underlying error is preserved on the
/// <see cref="ErrorType"/>/<see cref="ErrorData"/>/<see cref="OriginalStackTrace"/>
/// fields. Use <c>ChildContextConfig.ErrorMapping</c> to remap into a
/// domain-specific exception.
/// </summary>
public class ChildContextException : DurableExecutionException
{
    /// <summary>Creates an empty <see cref="ChildContextException"/>.</summary>
    public ChildContextException() { }

    /// <summary>Creates a <see cref="ChildContextException"/> with the given message.</summary>
    public ChildContextException(string message) : base(message) { }

    /// <summary>Creates a <see cref="ChildContextException"/> wrapping an inner exception.</summary>
    public ChildContextException(string message, Exception innerException) : base(message, innerException) { }

    /// <summary>
    /// The child context's <c>ChildContextConfig.SubType</c>, if any.
    /// </summary>
    public string? SubType { get; init; }

    /// <summary>The fully-qualified type name of the original exception.</summary>
    public string? ErrorType { get; init; }

    /// <summary>Optional structured error data attached by the user.</summary>
    public string? ErrorData { get; init; }

    /// <summary>Stack trace of the original exception, captured before serialization.</summary>
    // NOTE(review): element type was lost in extraction; <string> is assumed — confirm.
    public IReadOnlyList<string>? OriginalStackTrace { get; init; }
}
The user's await blocks + /// indefinitely. The escape signal is terminationManager.Terminate(), + /// which Task.WhenAny picks up. We return Pending; the dangling user + /// Task is GC'd. The service flushes checkpoints, fires the wait timer, then + /// re-invokes Lambda — on replay, WaitAsync sees the matching SUCCEED + /// checkpoint and returns Task.CompletedTask normally. + /// + /// + /// The same pattern applies to retries (RetryScheduled), callbacks + /// (CallbackPending), and chained invokes (InvokePending). + /// + /// + /// The workflow return type. + /// Hydrated execution state from prior invocations. + /// Manages the suspension signal. + /// The user's workflow function receiving a DurableContext. + /// The handler result indicating SUCCEEDED, FAILED, or PENDING. + internal static async Task> RunAsync( + ExecutionState executionState, + TerminationManager terminationManager, + Func> userHandler) + { + // Run user code on a threadpool thread so it executes independently of + // the termination signal. When TerminationManager fires (e.g., WaitAsync), + // we need the WhenAny race below to resolve immediately without waiting + // for the user task to reach an await point. + var userTask = Task.Run(userHandler); + + // Race: user code completing vs. termination signal (wait/callback/retry). + // If termination wins, we return PENDING and the abandoned userTask is never awaited. 
+ var winner = await Task.WhenAny(userTask, terminationManager.TerminationTask); + + if (winner == terminationManager.TerminationTask) + { + var terminationResult = await terminationManager.TerminationTask; + + if (terminationResult.Exception != null) + { + return new HandlerResult + { + Status = InvocationStatus.Failed, + Message = terminationResult.Exception.Message, + Exception = terminationResult.Exception + }; + } + + return new HandlerResult + { + Status = InvocationStatus.Pending, + Message = terminationResult.Message + }; + } + + try + { + var result = await userTask; + return new HandlerResult + { + Status = InvocationStatus.Succeeded, + Result = result + }; + } + catch (Exception ex) + { + return new HandlerResult + { + Status = InvocationStatus.Failed, + Message = ex.Message, + Exception = ex + }; + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationInput.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationInput.cs new file mode 100644 index 000000000..9c2b22b41 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationInput.cs @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The service envelope input for a durable execution invocation. +/// This is what Lambda receives from the durable execution service. +/// +public sealed class DurableExecutionInvocationInput +{ + /// + /// The unique ARN identifying this durable execution. + /// + [JsonPropertyName("DurableExecutionArn")] + public required string DurableExecutionArn { get; set; } + + /// + /// Token for optimistic concurrency on checkpoint operations. + /// + [JsonPropertyName("CheckpointToken")] + public string? CheckpointToken { get; set; } + + /// + /// Previously checkpointed operation state for replay. 
Consumed by + /// DurableFunction.WrapAsync for replay correlation; user code + /// should not modify this on a live invocation envelope. + /// + [JsonPropertyName("InitialExecutionState")] + public InitialExecutionState? InitialExecutionState { get; set; } +} + +/// +/// The previously checkpointed execution state provided on replay invocations. +/// +public sealed class InitialExecutionState +{ + /// + /// The list of operations from prior invocations. + /// + [JsonPropertyName("Operations")] + public IReadOnlyList? Operations { get; set; } + + /// + /// If present, indicates that more operations are available. Use this value + /// with GetDurableExecutionState to fetch the next page. + /// + [JsonPropertyName("NextMarker")] + public string? NextMarker { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationOutput.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationOutput.cs new file mode 100644 index 000000000..f02e38a99 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionInvocationOutput.cs @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The service envelope output returned by a durable execution invocation. +/// +public sealed class DurableExecutionInvocationOutput +{ + /// + /// The terminal status of this invocation. + /// + [JsonPropertyName("Status")] + [JsonConverter(typeof(UpperSnakeCaseEnumConverter))] + public required InvocationStatus Status { get; set; } + + /// + /// The serialized result (only present when Status is Succeeded). + /// + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// + /// Error details (only present when Status is Failed). + /// + [JsonPropertyName("Error")] + public ErrorObject? 
Error { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableFunction.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableFunction.cs new file mode 100644 index 000000000..4c076adf9 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableFunction.cs @@ -0,0 +1,247 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using System.Threading; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.DurableExecution.Services; +using Amazon.Lambda.Model; +using Amazon.Runtime; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Static helper that wraps a durable workflow function, handling all envelope +/// translation between DurableExecutionInvocationInput/Output and user types. +/// +/// All four overloads dispatch through the registered +/// on , so AOT-safe and reflection-based +/// callers share a single code path. Callers wire AOT support by registering an +/// AOT-aware serializer with the runtime +/// (e.g., SourceGeneratorLambdaJsonSerializer<TContext>) — no per-call +/// JsonSerializerContext argument is required. +/// +public static class DurableFunction +{ + private static readonly Lazy _cachedLambdaClient = + new(() => new AmazonLambdaClient(), LazyThreadSafetyMode.ExecutionAndPublication); + + /// + /// Wrap a workflow (typed input + output). + /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext) + => WrapAsyncCore(workflow, invocationInput, lambdaContext, _cachedLambdaClient.Value); + + /// + /// Wrap a workflow (typed input + output) with explicit Lambda client. 
+ /// + public static Task WrapAsync( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient) + => WrapAsyncCore(workflow, invocationInput, lambdaContext, lambdaClient); + + /// + /// Wrap a void workflow (typed input, no output). + /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext) + => WrapAsync(workflow, invocationInput, lambdaContext, _cachedLambdaClient.Value); + + /// + /// Wrap a void workflow with explicit Lambda client. + /// + public static Task WrapAsync( + Func workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient) + => WrapAsyncCore( + async (input, ctx) => { await workflow(input, ctx); return null; }, + invocationInput, lambdaContext, lambdaClient); + + private static async Task WrapAsyncCore( + Func> workflow, + DurableExecutionInvocationInput invocationInput, + ILambdaContext lambdaContext, + IAmazonLambda lambdaClient) + { + var serializer = LambdaSerializerHelper.GetRequired(lambdaContext); + + var state = new ExecutionState(); + state.LoadFromCheckpoint(invocationInput.InitialExecutionState); + + var serviceClient = new LambdaDurableServiceClient(lambdaClient); + var checkpointToken = invocationInput.CheckpointToken; + + var nextMarker = invocationInput.InitialExecutionState?.NextMarker; + while (!string.IsNullOrEmpty(nextMarker)) + { + var (operations, marker) = await serviceClient.GetExecutionStateAsync( + invocationInput.DurableExecutionArn, checkpointToken, nextMarker); + state.AddOperations(operations); + nextMarker = marker; + } + + var userPayload = ExtractUserPayload(invocationInput, serializer); + var terminationManager = new TerminationManager(); + var idGenerator = new OperationIdGenerator(); + + await using var batcher = new CheckpointBatcher( + checkpointToken, + (token, ops, ct) => 
serviceClient.CheckpointAsync( + invocationInput.DurableExecutionArn, token, ops, + // The service stamps a freshly-allocated CallbackId onto a started + // CALLBACK op (and may emit terminal-state callbacks/timers); merge + // those back into ExecutionState so the next ExecuteAsync sees them. + onNewOperations: state.AddOperations, + cancellationToken: ct)); + + var context = new DurableContext( + state, terminationManager, idGenerator, + invocationInput.DurableExecutionArn, lambdaContext, batcher); + + HandlerResult result; + try + { + result = await DurableExecutionHandler.RunAsync( + state, terminationManager, + async () => await workflow(userPayload, context)); + + await batcher.DrainAsync(); + } + catch (DurableExecutionException ex) when (ex.InnerException is AmazonServiceException sdkEx && IsTerminalCheckpointError(sdkEx)) + { + return new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Failed, + Error = ErrorObject.FromException(ex) + }; + } + + return MapToOutput(result, serializer); + } + + /// + /// Returns true for checkpoint-flush SDK errors that should fail the workflow + /// (Failed envelope) instead of escaping to the host (Lambda retry). The catch + /// site unwraps a first because + /// wraps every SDK error so + /// user logs show durable-execution context — this method then classifies the + /// inner . + /// + /// + /// Classification rule: + /// - 4xx (except 429) → terminal: permanent caller-side failure (missing ARN/KMS key, + /// IAM denial, validation). Retrying will not fix it, so return Failed. + /// - 429 / 5xx / no status (network or SDK-internal) → not terminal: transient, + /// allow the exception to escape so Lambda retries the invocation. + /// - Carve-out: InvalidParameterValueException with a message starting with + /// "Invalid Checkpoint Token" is treated as transient — the service rejects a + /// stale token but a retry with a fresh token will succeed. 
+ /// + /// Only checkpoint-flush errors flow through this catch. There are two paths: + /// 1. A flush triggered synchronously from inside a user StepAsync call + /// (the user awaits EnqueueAsync → batch flush → SDK throws → service client + /// wraps). + /// 2. The final after the workflow returns. + /// + /// State-hydration errors (GetExecutionStateAsync) propagate as + /// too, but they are NOT caught here — they + /// flow up to the host so Lambda retries. + /// + /// User-code SDK errors (e.g. an SDK call inside a Step body) are caught by + /// StepRunner and surfaced as StepException for the workflow's normal + /// step-failure handling. + /// + private static bool IsTerminalCheckpointError(AmazonServiceException ex) + { + var status = (int)ex.StatusCode; + if (status < 400 || status >= 500 || status == 429) + return false; + + if (ex.ErrorCode == "InvalidParameterValueException" + && ex.Message != null + && ex.Message.StartsWith("Invalid Checkpoint Token", StringComparison.Ordinal)) + { + return false; + } + + return true; + } + + // The user's input payload is stored inside the service envelope as an EXECUTION-type + // operation. This is part of the durable execution wire format — each invocation includes + // its input as a checkpoint record so the service can validate replay consistency. + // A missing EXECUTION op is a malformed envelope: surfacing it as a typed exception here + // gives a clear error instead of letting default!/null bubble into user code as an opaque + // NullReferenceException. 
+ private static TInput ExtractUserPayload( + DurableExecutionInvocationInput input, + ILambdaSerializer serializer) + { + if (input.InitialExecutionState?.Operations != null) + { + foreach (var op in input.InitialExecutionState.Operations) + { + if (op.Type != OperationTypes.Execution || op.ExecutionDetails?.InputPayload == null) + continue; + + var payload = op.ExecutionDetails.InputPayload; + var bytes = Encoding.UTF8.GetBytes(payload); + using var ms = new MemoryStream(bytes); + return serializer.Deserialize(ms); + } + } + + throw new DurableExecutionException( + "Durable execution envelope is malformed: no EXECUTION-type operation with an input payload was found. " + + "The service must include an EXECUTION op carrying the workflow's input on every invocation."); + } + + private static DurableExecutionInvocationOutput MapToOutput( + HandlerResult result, + ILambdaSerializer serializer) + { + return result.Status switch + { + InvocationStatus.Succeeded => new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Succeeded, + Result = SerializeOutput(result.Result, serializer) + }, + InvocationStatus.Failed => new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Failed, + Error = result.Exception != null + ? ErrorObject.FromException(result.Exception) + : new ErrorObject { ErrorMessage = result.Message } + }, + // Pending = workflow suspended (wait/retry/callback). No Result or Error — + // the service will re-invoke with accumulated checkpoints when ready. + InvocationStatus.Pending => new DurableExecutionInvocationOutput + { + Status = InvocationStatus.Pending + }, + _ => throw new InvalidOperationException($"Unexpected status: {result.Status}") + }; + } + + private static string? SerializeOutput(TOutput? 
value, ILambdaSerializer serializer) + { + if (value == null) return null; + + using var ms = new MemoryStream(); + serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Enums.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Enums.cs new file mode 100644 index 000000000..7b8c02402 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Enums.cs @@ -0,0 +1,17 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The terminal status of a durable execution invocation. +/// +public enum InvocationStatus +{ + /// The workflow completed successfully. + Succeeded, + /// The workflow failed with an unhandled exception. + Failed, + /// The workflow suspended (waiting for time, callback, or invocation). + Pending +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ErrorObject.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ErrorObject.cs new file mode 100644 index 000000000..88618f2cb --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ErrorObject.cs @@ -0,0 +1,91 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Serializable error representation stored in checkpoint state. +/// +public sealed class ErrorObject +{ + /// + /// The fully-qualified exception type name. + /// + [JsonPropertyName("ErrorType")] + public string? ErrorType { get; set; } + + /// + /// The exception message. + /// + [JsonPropertyName("ErrorMessage")] + public string? ErrorMessage { get; set; } + + /// + /// Stack trace frames. + /// + [JsonPropertyName("StackTrace")] + public IReadOnlyList? StackTrace { get; set; } + + /// + /// Additional serialized error data. 
+ /// + [JsonPropertyName("ErrorData")] + public string? ErrorData { get; set; } + + /// + /// Creates an ErrorObject from an exception. + /// + /// + /// SDK operation wrappers (, + /// , , + /// ) unwrap to the original error captured + /// from the failed operation — preserving the user-visible + /// ErrorType/ErrorData/StackTrace instead of recording + /// the wrapper's type. This way a chained invoker sees the originating + /// exception (e.g. System.InvalidOperationException) rather than + /// Amazon.Lambda.DurableExecution.StepException. Mirrors the Java + /// SDK's DurableExecutor.buildErrorObject behavior. + /// + public static ErrorObject FromException(Exception exception) + { + return exception switch + { + StepException step => new ErrorObject + { + ErrorType = step.ErrorType, + ErrorMessage = step.Message, + StackTrace = step.OriginalStackTrace, + ErrorData = step.ErrorData + }, + ChildContextException child => new ErrorObject + { + ErrorType = child.ErrorType, + ErrorMessage = child.Message, + StackTrace = child.OriginalStackTrace, + ErrorData = child.ErrorData + }, + InvokeException invoke => new ErrorObject + { + ErrorType = invoke.ErrorType, + ErrorMessage = invoke.Message, + StackTrace = invoke.OriginalStackTrace, + ErrorData = invoke.ErrorData + }, + CallbackException callback => new ErrorObject + { + ErrorType = callback.ErrorType, + ErrorMessage = callback.Message, + StackTrace = callback.OriginalStackTrace, + ErrorData = callback.ErrorData + }, + _ => new ErrorObject + { + ErrorType = exception.GetType().FullName, + ErrorMessage = exception.Message, + StackTrace = exception.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries) + } + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ICallback.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ICallback.cs new file mode 100644 index 000000000..a6484a480 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ICallback.cs @@ -0,0 +1,42 @@ +// 
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// A pending callback created by +/// . +/// Hands back a for external systems to use, plus a +/// hook that +/// suspends the workflow until the external system completes the callback. +/// +/// The callback result type. +public interface ICallback +{ + /// + /// The callback ID generated by the durable execution service. External + /// systems pass this ID to SendDurableExecutionCallbackSuccess / + /// SendDurableExecutionCallbackFailure / + /// SendDurableExecutionCallbackHeartbeat to deliver a result. + /// + string CallbackId { get; } + + /// + /// Suspends the workflow until the callback is completed, then returns the + /// deserialized result. + /// + /// + /// On the first invocation that reaches this call, the workflow suspends + /// (Lambda terminates). When the external system completes the callback + /// the service re-invokes Lambda; this call then returns the cached result + /// without re-executing user code. + /// + /// + /// Thrown when the external system reported a failure result. + /// + /// + /// Thrown when the service timed out the callback (overall timeout or + /// heartbeat timeout elapsed). + /// + Task GetResultAsync(CancellationToken cancellationToken = default); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs new file mode 100644 index 000000000..dbee39335 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs @@ -0,0 +1,204 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// The primary interface for durable execution operations. 
+/// Passed to user workflow functions to access checkpointed steps, waits, +/// child contexts, callbacks, and chained invokes. Additional operations +/// (parallel, map, etc.) are added in follow-up PRs. +/// +public interface IDurableContext +{ + /// + /// A logger scoped to the durable execution. Currently returns + /// ; + /// the replay-safe DurableLogger (suppresses messages during replay) + /// ships in a follow-up PR. + /// + ILogger Logger { get; } + + /// + /// Metadata about the current durable execution. + /// + IExecutionContext ExecutionContext { get; } + + /// + /// The underlying Lambda context. + /// + ILambdaContext LambdaContext { get; } + + /// + /// Execute a step with automatic checkpointing. The step result is serialized + /// to a checkpoint using the registered on + /// . AOT and reflection-based scenarios + /// share this single overload — the AOT story is determined by the registered + /// serializer (e.g., SourceGeneratorLambdaJsonSerializer<TContext>). + /// + Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute a step that returns no value. + /// + Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Suspend execution for the specified duration without consuming compute time. + /// The Lambda is suspended and the service re-invokes it after the wait elapses. + /// Duration must be at least 1 second (service timer granularity). + /// + Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default); + + /// + /// Run a user function inside a logical sub-workflow (a "child context"). + /// The child has its own deterministic operation-ID space; its result is + /// checkpointed as a CONTEXT operation so subsequent invocations + /// replay the cached value without re-executing the func. 
+ /// + /// + /// Use child contexts to group related durable operations (e.g. a step plus + /// a wait plus a step) into a single observability/error-handling boundary. + /// On failure, surfaces as ; supply + /// to remap into a + /// domain-specific exception. + /// The child context's return value is serialized to a checkpoint using the + /// registered on + /// . + /// + Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Run a user function inside a logical sub-workflow (a "child context") + /// that returns no value. The child has its own deterministic operation-ID + /// space and is checkpointed as a CONTEXT operation so subsequent + /// invocations skip re-executing the func. + /// + /// + /// Use child contexts to group related durable operations (e.g. a step plus + /// a wait plus a step) into a single observability/error-handling boundary. + /// On failure, surfaces as ; supply + /// to remap into a + /// domain-specific exception. + /// + Task RunInChildContextAsync( + Func func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Create a callback for an external system to complete. Returns an + /// handle exposing the service-allocated + /// (pass to the external system) and + /// + /// (await to suspend until a result arrives). + /// + /// + /// The callback result is deserialized using the + /// registered on . AOT and reflection-based + /// scenarios share this single overload — the AOT story is determined by the + /// registered serializer (e.g., + /// SourceGeneratorLambdaJsonSerializer<TContext>). + /// + /// Errors are deferred to ; + /// CreateCallbackAsync always returns successfully so user code + /// between CreateCallbackAsync and the result-await runs deterministically + /// across replays. + /// + /// + Task> CreateCallbackAsync( + string? 
name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Composite operation that creates a callback, runs the supplied submitter + /// (which hands the callbackId to an external system), and suspends + /// until the external system delivers a result. Equivalent to manually + /// composing + /// + + /// + + /// inside a child context. + /// + /// + /// Submitter failures (after retries are exhausted) surface as + /// . Callback failures and timeouts + /// surface as / + /// . + /// + Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Invoke another durable Lambda function and await its result. The + /// invocation is checkpointed so it survives parent failures and is not + /// double-fired on replay. The payload and result are serialized to/from + /// a checkpoint using the registered on + /// . + /// + /// + /// must be a qualified identifier (version, + /// alias, or $LATEST); unqualified ARNs are rejected by the durable + /// execution service. + /// + Task InvokeAsync( + string functionName, + TPayload payload, + string? name = null, + InvokeConfig? config = null, + CancellationToken cancellationToken = default); +} + +/// +/// Context passed to step functions. +/// +public interface IStepContext +{ + /// + /// Logger scoped to this step. + /// + ILogger Logger { get; } + + /// + /// The current retry attempt number (1-based). + /// + int AttemptNumber { get; } + + /// + /// The deterministic operation ID for this step. + /// + string OperationId { get; } +} + +/// +/// Metadata about the current execution. +/// +public interface IExecutionContext +{ + /// + /// The ARN of the current durable execution. 
+ /// + string DurableExecutionArn { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IRetryStrategy.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IRetryStrategy.cs new file mode 100644 index 000000000..d871ebb98 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IRetryStrategy.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Determines whether a failed step should be retried and with what delay. +/// +public interface IRetryStrategy +{ + /// + /// Evaluates whether the given exception warrants a retry. + /// + /// The exception that caused the step to fail. + /// The 1-based attempt number that just failed. + /// A decision indicating whether to retry and the delay before the next attempt. + RetryDecision ShouldRetry(Exception exception, int attemptNumber); +} + +/// +/// The outcome of a retry evaluation. +/// +public readonly struct RetryDecision +{ + /// Whether the step should be retried. + public bool ShouldRetry { get; } + + /// The delay before the next retry attempt. + public TimeSpan Delay { get; } + + private RetryDecision(bool shouldRetry, TimeSpan delay) + { + ShouldRetry = shouldRetry; + Delay = delay; + } + + /// Indicates the step should not be retried. + public static RetryDecision DoNotRetry() => new(false, TimeSpan.Zero); + + /// Indicates the step should be retried after the specified delay. + public static RetryDecision RetryAfter(TimeSpan delay) => new(true, delay); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IWaitForCallbackContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IWaitForCallbackContext.cs new file mode 100644 index 000000000..866fb3bab --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IWaitForCallbackContext.cs @@ -0,0 +1,23 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Microsoft.Extensions.Logging; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Context passed to the submitter delegate of +/// . +/// Provides a replay-safe logger scoped to the submitter step. +/// +/// +/// Distinct from so the submitter API can evolve +/// independently. Logger-only surface. +/// +public interface IWaitForCallbackContext +{ + /// + /// Logger scoped to the submitter step. + /// + ILogger Logger { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CallbackOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CallbackOperation.cs new file mode 100644 index 000000000..16b06480c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CallbackOperation.cs @@ -0,0 +1,272 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkCallbackOptions = Amazon.Lambda.Model.CallbackOptions; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable callback operation. Sync-flushes a CALLBACK START checkpoint +/// (the service stamps a freshly-allocated CallbackId onto the response, +/// which the batcher merges back into ), then hands +/// the user an they can later +/// +/// to suspend on. +/// +/// +/// Replay branches — example: +/// +/// var cb = await ctx.CreateCallbackAsync<ApprovalResult>(name: "approval"); +/// // ... external system told to use cb.CallbackId ... +/// var result = await cb.GetResultAsync(); +/// +/// +/// Fresh: no prior state → sync-flush CALLBACK START; +/// the service responds with a CallbackId (merged into state by the +/// batcher); construct the and return it. +/// then suspends. +/// STARTED: a CallbackId is already on the checkpoint; reuse it. 
+/// suspends (the external system hasn't +/// responded yet) — service re-invokes once it does. +/// SUCCEEDED / FAILED / TIMED_OUT: terminal — construct the +/// with the cached state and return. +/// immediately deserializes / throws. +/// +/// CRITICAL: CreateCallbackAsync always succeeds — it returns the +/// handle regardless of terminal state. Errors are +/// deferred to +/// so user code between CreateCallbackAsync and the result-await runs +/// deterministically across replays. +/// +/// LIFETIME: the handle returned to user code IS the operation object, so it +/// transitively roots , , +/// and . This is invocation-scoped by design — +/// do not store an across invocations (e.g. in a +/// static field on a warm Lambda container). The batcher is disposed when the +/// workflow returns and the captured state belongs to that invocation only; +/// re-using the handle later will read disposed/stale machinery. +/// +/// Serialization is delegated to the registered on +/// . AOT-safe and reflection-based callers +/// share the same code path: the AOT story is determined entirely by the serializer +/// the user registered with the runtime (e.g., +/// SourceGeneratorLambdaJsonSerializer<TContext>). +/// +internal sealed class CallbackOperation : DurableOperation>, ICallback +{ + private readonly CallbackConfig? _config; + private readonly ILambdaSerializer _serializer; + + private string? _callbackId; + + public CallbackOperation( + string operationId, + string? name, + string? parentId, + CallbackConfig? config, + ILambdaSerializer serializer, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? 
batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _config = config; + _serializer = serializer; + } + + protected override string OperationType => OperationTypes.Callback; + + /// + /// Set when an existing terminal-state checkpoint was observed during + /// dispatch. reads this directly to short- + /// circuit deserialization (or throw the recorded error) without suspending. + /// + private Operation? _terminalReplay; + + /// + public string CallbackId => _callbackId + ?? throw new InvalidOperationException( + "CallbackId is unavailable. Ensure CreateCallbackAsync has completed before reading CallbackId."); + + protected override async Task> StartAsync(CancellationToken cancellationToken) + { + // Sync-flush the START so the service can allocate a CallbackId for us. + // The batcher's onNewOperations hook merges the service's response into + // ExecutionState, so reading state.GetOperation(OperationId) right after + // the await sees the populated CallbackDetails. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Callback, + Action = OperationAction.START, + SubType = OperationSubTypes.Callback, + Name = Name, + CallbackOptions = BuildCallbackOptions() + }, cancellationToken); + + var stamped = State.GetOperation(OperationId); + var callbackId = stamped?.CallbackDetails?.CallbackId; + if (string.IsNullOrEmpty(callbackId)) + { + // Service didn't return a CallbackId — this is a service-contract + // violation, not user error. Surface as a non-deterministic error + // so the workflow fails fast rather than silently NRE-ing later. + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? 
OperationId}' was started but the service did not return a CallbackId."); + } + + _callbackId = callbackId; + + // If the service already reported a terminal state on the START response + // (the external system replied synchronously, or timeout was instant), + // record it for GetResultAsync to short-circuit on. + if (IsTerminalStatus(stamped?.Status)) + { + _terminalReplay = stamped; + } + + return this; + } + + protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + var callbackId = existing.CallbackDetails?.CallbackId; + if (string.IsNullOrEmpty(callbackId)) + { + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' has no CallbackId on its checkpoint."); + } + + _callbackId = callbackId; + + // CRITICAL: we must NOT raise on terminal state here. + // CreateCallbackAsync always returns the ICallback handle so any user + // code between create and GetResult runs deterministically across + // replays. Defer status inspection to GetResultAsync below. + switch (existing.Status) + { + case OperationStatuses.Succeeded: + case OperationStatuses.Failed: + case OperationStatuses.TimedOut: + _terminalReplay = existing; + break; + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // External system hasn't responded yet — GetResultAsync will + // suspend so the service can re-invoke once it does. + break; + + default: + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + + return Task.FromResult>(this); + } + + /// + public async Task GetResultAsync(CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + + // Terminal-state checkpoint already observed by Start/Replay — return + // (or throw) immediately without suspending. 
+ if (_terminalReplay != null) + { + return ResolveTerminal(_terminalReplay); + } + + // A later checkpoint in this same invocation (e.g. WaitForCallback's + // submitter step flush) may have merged a terminal status into + // ExecutionState via NewExecutionState. Re-read once before suspending + // so we avoid a wasted reinvocation when the answer is already here. + var current = State.GetOperation(OperationId); + if (IsTerminalStatus(current?.Status)) + { + return ResolveTerminal(current!); + } + + // No terminal state yet. Suspend the workflow; the service re-invokes + // when the external system delivers a result. + return await Termination.SuspendAndAwait( + TerminationReason.CallbackPending, + $"callback:{Name ?? OperationId}"); + } + + private T ResolveTerminal(Operation op) + { + switch (op.Status) + { + case OperationStatuses.Succeeded: + var serialized = op.CallbackDetails?.Result; + if (serialized == null) return default!; + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + case OperationStatuses.Failed: + throw BuildFailedException(op); + + case OperationStatuses.TimedOut: + throw BuildTimeoutException(op); + + default: + // Should be unreachable — _terminalReplay is only set for terminal statuses. + throw new NonDeterministicExecutionException( + $"Callback operation '{Name ?? OperationId}' has unexpected status '{op.Status}' on result resolution."); + } + } + + private CallbackFailedException BuildFailedException(Operation op) + { + var err = op.CallbackDetails?.Error; + var message = err?.ErrorMessage ?? 
"Callback failed"; + return new CallbackFailedException(message) + { + CallbackId = op.CallbackDetails?.CallbackId, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace, + }; + } + + private CallbackTimeoutException BuildTimeoutException(Operation op) + { + var err = op.CallbackDetails?.Error; + var message = err?.ErrorMessage ?? "Callback timed out"; + return new CallbackTimeoutException(message) + { + CallbackId = op.CallbackDetails?.CallbackId, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace, + }; + } + + private SdkCallbackOptions? BuildCallbackOptions() // Maps the configured timeouts to whole seconds (rounded up, min 1, max int.MaxValue); null when there is no config or both timeouts are zero. + { + if (_config == null) return null; + if (_config.Timeout == TimeSpan.Zero && _config.HeartbeatTimeout == TimeSpan.Zero) return null; + + var options = new SdkCallbackOptions(); + if (_config.Timeout > TimeSpan.Zero) + options.TimeoutSeconds = (int)Math.Min(int.MaxValue, Math.Max(1, Math.Ceiling(_config.Timeout.TotalSeconds))); // Clamp before the cast: a very large TimeSpan (e.g. TimeSpan.MaxValue) exceeds int range and the unchecked double-to-int cast would produce a garbage value. + if (_config.HeartbeatTimeout > TimeSpan.Zero) + options.HeartbeatTimeoutSeconds = (int)Math.Min(int.MaxValue, Math.Max(1, Math.Ceiling(_config.HeartbeatTimeout.TotalSeconds))); // Same clamp as TimeoutSeconds above. + return options; + } + + private static bool IsTerminalStatus(string? status) => + status == OperationStatuses.Succeeded + || status == OperationStatuses.Failed + || status == OperationStatuses.TimedOut; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs new file mode 100644 index 000000000..800d55bcf --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs @@ -0,0 +1,218 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Runtime.ExceptionServices; +using System.Threading.Channels; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Background batcher for outbound checkpoint updates. Operations are enqueued +/// via ; a single worker drains the queue and flushes +/// each batch via the supplied flushAsync delegate. Each EnqueueAsync +/// call awaits the flush of its containing batch (sync semantics). +/// +/// +/// Fire-and-forget semantics are achieved by simply not awaiting the returned +/// Task. Errors still surface deterministically via _terminalError: the +/// next sync or rethrows. +/// Callers using fire-and-forget should observe the discarded Task's exception +/// (see StepOperation.FireAndForget) so it doesn't trip the runtime's +/// UnobservedTaskException event. +/// +internal sealed class CheckpointBatcher : IAsyncDisposable +{ + private readonly Func, CancellationToken, Task> _flushAsync; + private readonly CheckpointBatcherConfig _config; + private readonly Channel _channel; + private readonly Task _worker; + private readonly CancellationTokenSource _shutdownCts = new(); + + private string? _checkpointToken; + private Exception? _terminalError; + private int _disposed; + + public CheckpointBatcher( + string? initialCheckpointToken, + Func, CancellationToken, Task> flushAsync, + CheckpointBatcherConfig? config = null) + { + _checkpointToken = initialCheckpointToken; + _flushAsync = flushAsync; + _config = config ?? new CheckpointBatcherConfig(); + _channel = Channel.CreateUnbounded(new UnboundedChannelOptions + { + SingleReader = true, + SingleWriter = false + }); + _worker = Task.Run(() => RunWorkerAsync(_shutdownCts.Token)); + } + + /// + /// The most recent checkpoint token returned by the service. Updated after + /// every successful batch flush. + /// + public string? 
CheckpointToken => Volatile.Read(ref _checkpointToken); + + /// + /// Queues for flushing. The returned Task completes + /// when the batch containing this update has been successfully flushed to the + /// service. If the worker has already encountered a terminal error, the + /// exception is rethrown immediately. + /// + public async Task EnqueueAsync(SdkOperationUpdate update, CancellationToken cancellationToken = default) + { + var terminal = Volatile.Read(ref _terminalError); + if (terminal != null) ExceptionDispatchInfo.Throw(terminal); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var item = new BatchItem(update, tcs); + + if (!_channel.Writer.TryWrite(item)) + { + // Writer is completed (terminal error or disposed) — surface the cause. + terminal = Volatile.Read(ref _terminalError); + if (terminal != null) ExceptionDispatchInfo.Throw(terminal); + throw new ObjectDisposedException(nameof(CheckpointBatcher)); + } + + await tcs.Task.WaitAsync(cancellationToken).ConfigureAwait(false); + } + + /// + /// Closes the channel and awaits the worker. Any items already enqueued are + /// flushed; any subsequent call throws. + /// + public async Task DrainAsync() + { + _channel.Writer.TryComplete(); + try + { + await _worker.ConfigureAwait(false); + } + catch + { + // Surfaced via _terminalError below. + } + + var terminal = Volatile.Read(ref _terminalError); + if (terminal != null) ExceptionDispatchInfo.Throw(terminal); + } + + public async ValueTask DisposeAsync() + { + if (Interlocked.Exchange(ref _disposed, 1) != 0) return; + + _channel.Writer.TryComplete(); + _shutdownCts.Cancel(); + try { await _worker.ConfigureAwait(false); } + catch { /* swallow on dispose */ } + _shutdownCts.Dispose(); + } + + private async Task RunWorkerAsync(CancellationToken shutdownToken) + { + // TODO: also enforce _config.MaxBatchBytes here. 
Today we only cap by + // operation count; an item whose serialized size pushes the batch over + // ~750 KB will be sent and rejected service-side. See CheckpointBatcherConfig. + var batch = new List(_config.MaxBatchOperations); + + try + { + while (await _channel.Reader.WaitToReadAsync(shutdownToken).ConfigureAwait(false)) + { + // Drain everything currently queued. + while (_channel.Reader.TryRead(out var item)) + { + batch.Add(item); + if (batch.Count >= _config.MaxBatchOperations) + { + await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + batch.Clear(); + // FlushBatchAsync swallows flush failures after recording + // _terminalError. Items queued before the writer was completed + // are still readable, so stop here: the finally block fails + // them instead of flushing them after a failed batch. + if (Volatile.Read(ref _terminalError) != null) return; + } + } + + // Optionally wait for late arrivals to coalesce into one batch. + if (_config.FlushInterval > TimeSpan.Zero && batch.Count > 0) + { + using var windowCts = CancellationTokenSource.CreateLinkedTokenSource(shutdownToken); + windowCts.CancelAfter(_config.FlushInterval); + try + { + while (await _channel.Reader.WaitToReadAsync(windowCts.Token).ConfigureAwait(false)) + { + while (_channel.Reader.TryRead(out var item)) + { + batch.Add(item); + if (batch.Count >= _config.MaxBatchOperations) + { + await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + batch.Clear(); + // Same rule as above: never flush past a failed batch. + if (Volatile.Read(ref _terminalError) != null) return; + } + } + } + } + catch (OperationCanceledException) when (!shutdownToken.IsCancellationRequested) + { + // Window elapsed; fall through to flush. + } + } + + if (batch.Count > 0) + { + await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + batch.Clear(); + // A failed final flush is terminal too; do not loop back and + // pick up items that were enqueued while it was in flight. + if (Volatile.Read(ref _terminalError) != null) return; + } + } + } + catch (OperationCanceledException) when (shutdownToken.IsCancellationRequested) + { + // Disposed mid-wait; fall through to drain. + } + catch (Exception ex) + { + // FlushBatchAsync's exception path already records _terminalError and + // signals batch members. This catch covers anything else (channel, + // logic). Make sure we still propagate. + Volatile.Write(ref _terminalError, ex); + } + finally + { + // Anything left in the channel after the worker exits — fail it. + var failure = Volatile.Read(ref _terminalError) ?? 
new ObjectDisposedException(nameof(CheckpointBatcher)); + foreach (var leftover in batch) + leftover.Completion.TrySetException(failure); + while (_channel.Reader.TryRead(out var item)) + item.Completion.TrySetException(failure); + + _channel.Writer.TryComplete(); + } + } + + private async Task FlushBatchAsync(IReadOnlyList batch, CancellationToken cancellationToken) + { + var updates = new SdkOperationUpdate[batch.Count]; + for (int i = 0; i < batch.Count; i++) + updates[i] = batch[i].Update; + + try + { + var newToken = await _flushAsync(_checkpointToken, updates, cancellationToken).ConfigureAwait(false); + Volatile.Write(ref _checkpointToken, newToken); + foreach (var item in batch) + item.Completion.TrySetResult(true); + } + catch (Exception ex) + { + Volatile.Write(ref _terminalError, ex); + foreach (var item in batch) + item.Completion.TrySetException(ex); + _channel.Writer.TryComplete(); + // No rethrow: the worker loop exits via the completed channel and + // RunWorkerAsync's finally handles any leftovers. + } + } + + private readonly record struct BatchItem(SdkOperationUpdate Update, TaskCompletionSource Completion); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs new file mode 100644 index 000000000..88913e868 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs @@ -0,0 +1,36 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Tunables for . +/// +internal sealed class CheckpointBatcherConfig +{ + /// + /// How long the worker waits for additional items to coalesce into a single + /// batch before flushing. Default = flush as soon + /// as the queue drains. Increase to reduce API calls when many checkpoints + /// are emitted concurrently (e.g. 
parallel branches, future Map operation). + /// + public TimeSpan FlushInterval { get; init; } = TimeSpan.Zero; + + /// + /// Maximum operations per batch. Service-side limit is 200. + /// + public int MaxBatchOperations { get; init; } = 200; + + /// + /// Maximum batch size in bytes. Service-side limit is ~750 KB. + /// + /// + /// TODO: not enforced today. The worker only checks ; + /// a single oversized item (or a batch whose serialized size exceeds 750 KB) + /// will be sent to the service and rejected there. Wire this in alongside + /// the async-flush operations (Map / Parallel / child-context) since those + /// are the scenarios that can actually fill a batch — today every batch is + /// 1 item with = Zero, so the gap is latent. + /// + internal int MaxBatchBytes { get; init; } = 750 * 1024; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs new file mode 100644 index 000000000..a0abbf99e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -0,0 +1,222 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable child context operation. Runs a user-supplied function inside a +/// nested with its own deterministic operation-ID +/// space, persisting the function's result so subsequent invocations replay +/// the cached value without re-executing. 
+/// +/// +/// Replay branches — example: await ctx.RunInChildContextAsync(child => ..., name: "phase") +/// +/// Fresh: no prior state → sync-flush CONTEXT START → run user +/// func → on success emit CONTEXT SUCCEED → on failure emit CONTEXT FAIL +/// and throw . +/// SUCCEEDED: return cached deserialized result; user func is +/// NOT re-executed. +/// FAILED: throw with the +/// recorded error; if is +/// set, the mapped exception is thrown instead. +/// STARTED / PENDING: re-run the user func without +/// re-checkpointing START. The child's own operations recover from their +/// own checkpoints, so this is replay propagation; if a wait/callback +/// inside the child is still pending, the user func re-suspends. +/// +/// Unlike , child contexts have no retry strategy: +/// failure is terminal and surfaces immediately via +/// . +/// +internal sealed class ChildContextOperation : DurableOperation +{ + private readonly Func> _func; + private readonly ChildContextConfig? _config; + private readonly ILambdaSerializer _serializer; + private readonly Func _childContextFactory; + + public ChildContextOperation( + string operationId, + string? name, + string? parentId, + Func> func, + ChildContextConfig? config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _func = func; + _config = config; + _serializer = serializer; + _childContextFactory = childContextFactory; + } + + protected override string OperationType => OperationTypes.Context; + + protected override async Task StartAsync(CancellationToken cancellationToken) + { + // Sync-flush CONTEXT START before user code so the service has a record + // of the parent context if the inner func suspends (e.g. a Wait inside + // the child terminates the workflow before SUCCEED is reached). 
+ await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = _config?.SubType, + Name = Name + }, cancellationToken); + + return await ExecuteFunc(cancellationToken); + } + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Side-effecting code runs at most once: replay returns the + // cached result without invoking the user func. + return Task.FromResult(DeserializeResult(existing.ContextDetails?.Result)); + + case OperationStatuses.Failed: + throw MapFailureException(BuildChildContextException(existing)); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Re-run the user func: the child's own operations replay from + // their own checkpoints. Do NOT re-checkpoint START — the + // original is still authoritative. If something inside the + // child is still pending (Wait, callback, retry) the user func + // will re-suspend on its own. + return ExecuteFunc(cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"Child context operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private async Task ExecuteFunc(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var childContext = _childContextFactory(OperationId); + + T result; + try + { + result = await _func(childContext); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (NonDeterministicExecutionException) + { + // Replay-mismatch from an inner operation means the entire execution + // is corrupt — checkpointing this as CONTEXT FAIL would freeze the + // mismatch into history and prevent future invocations from + // re-detecting it. Bubble up untouched. 
+ throw; + } + catch (StepInterruptedException) + { + // AtMostOncePerRetry crash recovery: a step inside the child saw a + // STARTED checkpoint with no terminal record and routed through its + // retry strategy. The step has already checkpointed its own outcome; + // wrapping this as CONTEXT FAIL would mask that. Bubble up so the + // step's strategy / replay flow stays authoritative. + throw; + } + catch (Exception ex) + { + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.FAIL, + SubType = _config?.SubType, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + + throw MapFailureException(new ChildContextException(ex.Message, ex) + { + SubType = _config?.SubType, + ErrorType = ex.GetType().FullName, + OriginalStackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }); + } + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.SUCCEED, + SubType = _config?.SubType, + Name = Name, + Payload = SerializeResult(result) + }, cancellationToken); + + return result; + } + + private Exception MapFailureException(ChildContextException ex) + { + var mapper = _config?.ErrorMapping; + if (mapper == null) return ex; + + var mapped = mapper(ex); + return mapped ?? ex; + } + + private ChildContextException BuildChildContextException(Operation failedOp) + { + var err = failedOp.ContextDetails?.Error; + return new ChildContextException(err?.ErrorMessage ?? "Child context failed") + { + SubType = failedOp.SubType ?? _config?.SubType, + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private T DeserializeResult(string? 
serialized) + { + if (serialized == null) return default!; + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + private string SerializeResult(T value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private static SdkErrorObject ToSdkError(Exception ex) => new() + { + ErrorType = ex.GetType().FullName, + ErrorMessage = ex.Message, + StackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableOperation.cs new file mode 100644 index 000000000..9f3570fac --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableOperation.cs @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Abstract base for durable operations (Step, Wait, ...). Subclasses implement +/// (no prior checkpoint) and +/// (some checkpoint exists); the base handles lookup and dispatch. +/// +/// The operation's result type. +internal abstract class DurableOperation +{ + protected readonly ExecutionState State; + protected readonly TerminationManager Termination; + protected readonly string OperationId; + protected readonly string? Name; + protected readonly string? ParentId; + protected readonly string DurableExecutionArn; + protected readonly CheckpointBatcher? Batcher; + + protected DurableOperation( + string operationId, + string? name, + string? parentId, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? 
batcher = null) + { + OperationId = operationId; + Name = name; + ParentId = parentId; + State = state; + Termination = termination; + DurableExecutionArn = durableExecutionArn; + Batcher = batcher; + } + + /// The wire-format operation type (e.g. "STEP", "WAIT"). + protected abstract string OperationType { get; } + + /// + /// Looks up any prior checkpoint for this op and dispatches to + /// (none) or (some). + /// + public Task ExecuteAsync(CancellationToken cancellationToken) + { + State.ValidateReplayConsistency(OperationId, OperationType, Name); + + // Record that the workflow has reached this op. If every completed + // checkpointed op has now been visited, the state flips out of replay. + State.TrackReplay(OperationId); + + var existing = State.GetOperation(OperationId); + return existing == null + ? StartAsync(cancellationToken) + : ReplayAsync(existing, cancellationToken); + } + + /// First-time execution path: no prior checkpoint exists. + protected abstract Task StartAsync(CancellationToken cancellationToken); + + /// + /// Replay path: a checkpoint from a prior invocation exists. Subclasses + /// switch on . + /// against constants. + /// + protected abstract Task ReplayAsync(Operation existing, CancellationToken cancellationToken); + + /// + /// Enqueues an outbound checkpoint and awaits its batch flush. No-op when + /// no batcher is wired (e.g. unit tests that don't exercise flushing). + /// + protected Task EnqueueAsync(SdkOperationUpdate update, CancellationToken cancellationToken = default) + => Batcher?.EnqueueAsync(update, cancellationToken) ?? Task.CompletedTask; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs new file mode 100644 index 000000000..989749d9b --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs @@ -0,0 +1,154 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Collections.Concurrent; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// In-memory store of the operations replayed from +/// plus replay-mode tracking. Outbound checkpoints are owned by +/// ; this type is the inbound side only. +/// +/// +/// +/// At construction the workflow is "replaying" if and only if at least one +/// completed (terminal-status) user op is present; in-progress ops +/// (READY/PENDING/STARTED) alone do not start replay mode. The service always sends one EXECUTION-type op +/// carrying the input payload — that's bookkeeping, not user history, +/// so it doesn't count. +/// is called by every DurableOperation.ExecuteAsync +/// at the top of the call. Once every checkpointed completed +/// non-EXECUTION op has been visited, the workflow has caught up +/// to the replay frontier and flips to false +/// for the rest of the invocation. +/// +/// +/// is invoked from the 's +/// background worker (via the onNewOperations hook) while the workflow thread +/// concurrently reads via / — +/// e.g. the fire-and-forget StepOperation path where the workflow is not +/// awaiting the flush. _operations is therefore a . +/// The replay-tracking fields (_visitedOperations, _isReplaying, +/// _remainingReplayOps) are touched only on the workflow thread. +/// +/// +internal sealed class ExecutionState +{ + private readonly ConcurrentDictionary _operations = new(); + private readonly HashSet _visitedOperations = new(); + private bool _isReplaying; + private int _remainingReplayOps; + + public int CheckpointedOperationCount => _operations.Count; + + /// + /// True when the workflow is re-deriving prior operations from checkpointed + /// state. False when running fresh (not-yet-checkpointed) code. + /// + public bool IsReplaying => _isReplaying; + + public void LoadFromCheckpoint(InitialExecutionState? 
initialState) + { + if (initialState?.Operations != null) + { + AddOperations(initialState.Operations); + } + + // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, + // CANCELLED, STOPPED, TIMED_OUT) we need to re-derive before resuming live work. + // The service-side EXECUTION op (input payload bookkeeping) is always + // present and doesn't count. If the only ops are in-progress + // (READY/PENDING/STARTED), there's nothing to re-derive — the next + // user call IS the next thing to run — so IsReplaying starts false. + var (_, terminalCount) = ScanReplayable(); + _remainingReplayOps = terminalCount; + _isReplaying = terminalCount > 0; + } + + public void AddOperations(IEnumerable operations) + { + foreach (var op in operations) + { + if (op.Id == null) continue; + _operations[op.Id] = op; + } + } + + /// + /// Returns the checkpointed record for , or null + /// if none. Callers should switch on against + /// constants to decide replay behavior. + /// + public Operation? GetOperation(string operationId) + { + _operations.TryGetValue(operationId, out var op); + return op; + } + + public bool HasOperation(string operationId) => _operations.ContainsKey(operationId); + + /// + /// Records that the workflow has reached . + /// Once every checkpointed completed non-EXECUTION op has been + /// visited the workflow has caught up to the replay frontier and + /// flips to false. Idempotent: calling more than + /// once with the same id has no additional effect. + /// + public void TrackReplay(string operationId) + { + if (!_isReplaying) return; + if (!_visitedOperations.Add(operationId)) return; + if (!_operations.TryGetValue(operationId, out var op)) return; + if (op.Type == OperationTypes.Execution) return; + if (!IsTerminalStatus(op.Status)) return; + + if (--_remainingReplayOps <= 0) + _isReplaying = false; + } + + public void ValidateReplayConsistency(string operationId, string expectedType, string? 
expectedName) + { + // Independent of IsReplaying: as long as a checkpoint record exists + // for this id, its type/name must match what user code is asking for. + // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), + // IsReplaying is false but the records still exist and code drift can + // still produce a mismatch. + if (!_operations.TryGetValue(operationId, out var op)) return; + + if (op.Type != null && op.Type != expectedType) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected type '{expectedType}' but found '{op.Type}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } + + if (expectedName != null && op.Name != null && op.Name != expectedName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected name '{expectedName}' but found '{op.Name}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } + } + + private (bool HasReplayable, int TerminalCount) ScanReplayable() + { + var has = false; + var count = 0; + foreach (var op in _operations.Values) + { + if (op.Type == OperationTypes.Execution) continue; + has = true; + if (IsTerminalStatus(op.Status)) count++; + } + return (has, count); + } + + private static bool IsTerminalStatus(string? 
status) => + status == OperationStatuses.Succeeded + || status == OperationStatuses.Failed + || status == OperationStatuses.Cancelled + || status == OperationStatuses.Stopped + || status == OperationStatuses.TimedOut; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/InvokeOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/InvokeOperation.cs new file mode 100644 index 000000000..bc27ff7a3 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/InvokeOperation.cs @@ -0,0 +1,185 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda.Core; +using SdkChainedInvokeOptions = Amazon.Lambda.Model.ChainedInvokeOptions; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable chained-invoke operation. Schedules an asynchronous invocation of +/// another durable Lambda function via the durable execution service and +/// suspends the parent workflow until the chained execution reaches a terminal +/// state. The service drives the chained function and re-invokes the parent +/// with an updated operation status. +/// +/// +/// Replay branches — example: +/// await ctx.InvokeAsync<Req, Resp>("arn:...:fn:prod", req, "process_payment") +/// +/// Fresh: serialize payload → sync-flush CHAINED_INVOKE START +/// (carrying ) → suspend with +/// . +/// SUCCEEDED: deserialize and return cached result from +/// ChainedInvokeDetails.Result; the chained function is NOT +/// re-invoked. +/// FAILED: throw populated +/// from the recorded error. +/// TIMED_OUT: throw . +/// STOPPED: throw . +/// STARTED / PENDING: chained execution is still in +/// flight; re-suspend without re-checkpointing — the original +/// START remains authoritative. 
+/// +/// Mirrors 's "sync-flush START → suspend" idiom; +/// the chained function executes out-of-process so there is nothing to run +/// locally on either fresh or replay paths besides the suspend wiring. +/// Serialization is delegated to the registered +/// on ; AOT-safe and reflection-based +/// callers share the same code path (the AOT story is determined by the +/// registered serializer). +/// +internal sealed class InvokeOperation : DurableOperation +{ + private readonly string _functionName; + private readonly TPayload _payload; + private readonly InvokeConfig? _config; + private readonly ILambdaSerializer _serializer; + + public InvokeOperation( + string operationId, + string? name, + string? parentId, + string functionName, + TPayload payload, + InvokeConfig? config, + ILambdaSerializer serializer, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _functionName = functionName; + _payload = payload; + _config = config; + _serializer = serializer; + } + + protected override string OperationType => OperationTypes.ChainedInvoke; + + protected override async Task StartAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var serializedPayload = SerializeValue(_payload); + + // The service is what actually invokes the chained function, so it + // must receive this START before we suspend. If we only batched it + // locally and the parent process were recycled at suspend, the START + // would be lost and the chained function would never run. 
+ await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.ChainedInvoke, + Action = OperationAction.START, + SubType = OperationSubTypes.ChainedInvoke, + Name = Name, + Payload = serializedPayload, + ChainedInvokeOptions = new SdkChainedInvokeOptions + { + FunctionName = _functionName, + TenantId = _config?.TenantId + } + }, cancellationToken); + + return await Termination.SuspendAndAwait( + TerminationReason.InvokePending, $"invoke:{Name ?? _functionName}"); + } + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + return Task.FromResult(DeserializeResult(existing.ChainedInvokeDetails?.Result)); + + case OperationStatuses.Failed: + throw BuildFailed(existing); + + case OperationStatuses.TimedOut: + throw BuildTimedOut(existing); + + case OperationStatuses.Stopped: + throw BuildStopped(existing); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Chained function is still running. Just suspend again — + // the original START is already on the service, so don't + // re-checkpoint it. Whenever the service re-invokes us next, + // it will include the updated status. + return Termination.SuspendAndAwait( + TerminationReason.InvokePending, $"invoke:{Name ?? _functionName}"); + + default: + throw new NonDeterministicExecutionException( + $"Chained invoke operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private string SerializeValue(TPayload value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private TResult DeserializeResult(string? 
/// <summary>
/// Resolves the <see cref="ILambdaSerializer"/> registered on an
/// <see cref="ILambdaContext"/>, failing with an actionable message when
/// none is present.
/// </summary>
internal static class LambdaSerializerHelper
{
    private const string MissingSerializerMessage =
        "No ILambdaSerializer is registered on ILambdaContext.Serializer. " +
        "In the class library programming model, register one with " +
        "[assembly: LambdaSerializer(typeof(...))]. In an executable / custom " +
        "runtime, pass it to LambdaBootstrapBuilder.Create(handler, serializer). " +
        "In tests, set TestLambdaContext.Serializer.";

    /// <summary>
    /// Returns the serializer attached to <paramref name="lambdaContext"/>.
    /// </summary>
    /// <param name="lambdaContext">The context to read the serializer from.</param>
    /// <returns>The non-null serializer registered on the context.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="lambdaContext"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="InvalidOperationException">
    /// The context has no serializer registered.
    /// </exception>
    public static ILambdaSerializer GetRequired(ILambdaContext lambdaContext)
    {
        // Without this guard a null context surfaces as a NullReferenceException
        // from the property access below, which hides the real mistake.
        ArgumentNullException.ThrowIfNull(lambdaContext);

        return lambdaContext.Serializer
            ?? throw new InvalidOperationException(MissingSerializerMessage);
    }
}
parentId) + { + _counter = 0; + ParentId = parentId; + _prefix = parentId != null ? parentId + "-" : string.Empty; + } + + /// + /// Gets the parent operation ID, if any. + /// + public string? ParentId { get; } + + /// + /// Generates the next operation ID. The counter is pre-incremented so the + /// first ID is hash("1"). + /// + /// + /// Uses so concurrent callers + /// (e.g. user code that wraps multiple StepAsync calls in + /// Task.WhenAll with Task.Run, or future ParallelAsync/ + /// MapAsync branches that fan out before awaiting) cannot collide + /// on the same ID. Determinism still requires that calls happen in a + /// deterministic order — atomicity prevents duplicate IDs but not + /// reordering between replays. + /// + public string NextId() + { + var counter = Interlocked.Increment(ref _counter); + return HashOperationId(_prefix + counter.ToString(System.Globalization.CultureInfo.InvariantCulture)); + } + + /// + /// SHA-256 hashes and returns a 64-char lowercase + /// hex digest. Public so tests and child-context construction can reproduce + /// the same hashing logic. + /// + public static string HashOperationId(string rawId) + { + var bytes = Encoding.UTF8.GetBytes(rawId); + var hash = SHA256.HashData(bytes); + return AWSSDKUtils.ToHex(hash, lowercase: true); + } + + /// + /// Creates a child generator scoped under an operation ID from this generator. + /// + public OperationIdGenerator CreateChild(string operationId) + { + return new OperationIdGenerator(operationId); + } + + /// + /// Resets the counter (used for testing only). Not safe to call concurrently + /// with ; tests must quiesce before resetting. 
+ /// + internal void Reset() + { + Interlocked.Exchange(ref _counter, 0); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/StepOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/StepOperation.cs new file mode 100644 index 000000000..71c1ff3d5 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/StepOperation.cs @@ -0,0 +1,341 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Microsoft.Extensions.Logging; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkStepOptions = Amazon.Lambda.Model.StepOptions; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable step operation. Runs the user's function (with retry support), +/// persisting its result so subsequent invocations replay the cached value +/// without re-executing. +/// +/// +/// Replay branches — example: await ctx.StepAsync(ChargeCard, "charge") +/// +/// Fresh: no prior state → run func → emit SUCCEED → return. +/// SUCCEEDED: return cached result; func is NOT re-executed. +/// FAILED: re-throw the recorded exception. +/// PENDING (retry timer not yet fired): re-suspend without +/// running func; service re-invokes once NextAttemptTimestamp elapses. +/// STARTED + AtMostOncePerRetry: crash recovery — treat as a +/// failed attempt, route through retry strategy. +/// READY: service has post-PENDING re-invoked us; the retry +/// timer fired and the next attempt is up. Run it. +/// +/// Serialization is delegated to the registered on +/// . AOT-safe and reflection-based callers +/// share the same code path: the AOT story is determined entirely by the serializer +/// the user registered with the runtime (e.g., +/// SourceGeneratorLambdaJsonSerializer<TContext>). 
+/// +internal sealed class StepOperation : DurableOperation +{ + private readonly Func> _func; + private readonly StepConfig? _config; + private readonly ILambdaSerializer _serializer; + private readonly ILogger _logger; + + public StepOperation( + string operationId, + string? name, + string? parentId, + Func> func, + StepConfig? config, + ILambdaSerializer serializer, + ILogger logger, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _func = func; + _config = config; + _serializer = serializer; + _logger = logger; + } + + protected override string OperationType => OperationTypes.Step; + + protected override Task StartAsync(CancellationToken cancellationToken) + => ExecuteFunc(attemptNumber: 1, cancellationToken); + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Side-effecting code runs at most once: replay returns the + // cached result without invoking func. + return Task.FromResult(DeserializeResult(existing.StepDetails?.Result)); + + case OperationStatuses.Failed: + // Retries were exhausted or never configured — re-throw so the + // user's catch-block flow matches the original execution. + throw CreateStepException(existing); + + case OperationStatuses.Pending: + return ReplayPending(existing, cancellationToken); + + case OperationStatuses.Started: + return ReplayStarted(existing, cancellationToken); + + case OperationStatuses.Ready: + return ReplayReady(existing, cancellationToken); + + default: + // CANCELLED / STOPPED / unrecognized status. Re-running the + // step would re-execute side effects and silently mask a + // service-state we don't know how to interpret. Fail loud. + throw new NonDeterministicExecutionException( + $"Step operation '{Name ?? 
OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + /// + /// READY means the service has post-PENDING re-invoked us — the retry + /// timer fired and the step is eligible to run its next attempt. No + /// timer check is needed (the service has already decided we're up); + /// just advance the attempt counter and execute. + /// + private Task ReplayReady(Operation ready, CancellationToken cancellationToken) + { + var attemptNumber = (ready.StepDetails?.Attempt ?? 0) + 1; + return ExecuteFunc(attemptNumber, cancellationToken); + } + + /// + /// PENDING means a retry was scheduled (RETRY checkpoint). The service's + /// transition to READY when the timer fires is the authoritative "timer + /// fired" signal; we still get re-invoked in PENDING only if the service + /// re-invokes slightly early. The wall-clock check below is a safety net + /// for that case — clock skew can't cause a missed retry because if our + /// clock is fast we just run early, and if it's slow we re-suspend and + /// the service's READY transition takes over. + /// + private Task ReplayPending(Operation pending, CancellationToken cancellationToken) + { + var nextAttemptTs = pending.StepDetails?.NextAttemptTimestamp; + var attemptNumber = (pending.StepDetails?.Attempt ?? 0) + 1; + + if (nextAttemptTs is { } scheduledMs && + DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() < scheduledMs) + { + // Retry timer hasn't fired yet — re-suspend so we don't bill compute + // while the timer ticks. Service re-invokes once the timer elapses. + return Termination.SuspendAndAwait( + TerminationReason.RetryScheduled, $"retry:{Name ?? OperationId}"); + } + + return ExecuteFunc(attemptNumber, cancellationToken); + } + + /// + /// STARTED means a START checkpoint was written but no SUCCEED/FAIL exists. + /// For AtMostOncePerRetry this signals a crash mid-step — treat as failure + /// and route through retry. For AtLeastOncePerRetry just re-execute. 
+ /// + private Task ReplayStarted(Operation started, CancellationToken cancellationToken) + { + var attemptNumber = (started.StepDetails?.Attempt ?? 0) + 1; + + if (_config?.Semantics == StepSemantics.AtMostOncePerRetry) + { + // Re-running func would risk a duplicate side effect (e.g. double + // charge). Treat the lost result as a failure; let the retry + // strategy decide whether to try again or give up. + // + // Surface as StepInterruptedException so user strategies can + // distinguish "my code threw" from "a prior attempt crashed before + // recording a terminal record". + var error = started.StepDetails?.Error; + var ex = error != null + ? new StepInterruptedException(error.ErrorMessage ?? "Step failed on previous attempt") { ErrorType = error.ErrorType } + : new StepInterruptedException("Step result lost during AtMostOncePerRetry replay"); + return HandleStepFailureAsync(ex, attemptNumber, cancellationToken); + } + + return ExecuteFunc(attemptNumber, cancellationToken); + } + + private async Task ExecuteFunc(int attemptNumber, CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + // Emit a START checkpoint before running user code, unless we're already + // resuming a STARTED record (which means an earlier attempt wrote it). + // + // AtMostOncePerRetry: SYNC flush. If Lambda crashes before SUCCEED is + // flushed, ReplayStarted routes through retry instead of re-executing. + // A queued-but-unflushed START is indistinguishable from "never ran" if + // we die, so the sync flush is correctness-load-bearing here. + // + // AtLeastOncePerRetry (default): FIRE-AND-FORGET. Replay correctness + // doesn't depend on the START — SUCCEED alone is sufficient — so this + // is purely telemetry (attempt timing, retry count visible in history). 
+ if (State.GetOperation(OperationId)?.Status != OperationStatuses.Started) + { + var startUpdate = new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.START, + SubType = OperationSubTypes.Step, + Name = Name + }; + + if (_config?.Semantics == StepSemantics.AtMostOncePerRetry) + { + await EnqueueAsync(startUpdate, cancellationToken); + } + else + { + FireAndForget(EnqueueAsync(startUpdate, cancellationToken)); + } + } + + + try + { + var stepContext = new StepContext(OperationId, attemptNumber, _logger); + var result = await _func(stepContext); + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.SUCCEED, + SubType = OperationSubTypes.Step, + Name = Name, + Payload = SerializeResult(result) + }, cancellationToken); + + return result; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + // Funnel into the retry/fail decision tree. May checkpoint RETRY and + // suspend (Pending), or checkpoint FAIL and rethrow to user. + return await HandleStepFailureAsync(ex, attemptNumber, cancellationToken); + } + } + + /// + /// Funnels a step failure into the retry/fail decision. May checkpoint + /// RETRY and suspend (Pending), or checkpoint FAIL and rethrow. + /// + private async Task HandleStepFailureAsync(Exception ex, int attemptNumber, CancellationToken cancellationToken) + { + var retryStrategy = _config?.RetryStrategy; + if (retryStrategy != null) + { + var decision = retryStrategy.ShouldRetry(ex, attemptNumber); + if (decision.ShouldRetry) + { + // Service requires NextAttemptDelaySeconds >= 1. Built-in + // strategies already produce >=1s delays; this guard only + // matters for user-supplied IRetryStrategy / FromDelegate. 
+ var requestedSeconds = decision.Delay.TotalSeconds; + var delaySeconds = (int)Math.Max(1, Math.Ceiling(requestedSeconds)); + if (requestedSeconds < 1) + { + _logger.LogWarning( + "Retry delay for step '{StepName}' attempt {Attempt} was {Requested:F3}s (< 1s); coerced to {Coerced}s.", + Name ?? OperationId, attemptNumber, requestedSeconds, delaySeconds); + } + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.RETRY, + SubType = OperationSubTypes.Step, + Name = Name, + Error = ToSdkError(ex), + StepOptions = new SdkStepOptions { NextAttemptDelaySeconds = delaySeconds } + }, cancellationToken); + return await Termination.SuspendAndAwait( + TerminationReason.RetryScheduled, $"retry:{Name ?? OperationId}"); + } + } + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Step, + Action = OperationAction.FAIL, + SubType = OperationSubTypes.Step, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + + throw new StepException(ex.Message, ex) + { + ErrorType = ex.GetType().FullName + }; + } + + private T DeserializeResult(string? serialized) + { + if (serialized == null) return default!; + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + private string SerializeResult(T value) + { + using var ms = new MemoryStream(); + _serializer.Serialize(value, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + private static StepException CreateStepException(Operation failedOp) + { + var err = failedOp.StepDetails?.Error; + return new StepException(err?.ErrorMessage ?? 
"Step failed") + { + ErrorType = err?.ErrorType, + ErrorData = err?.ErrorData, + OriginalStackTrace = err?.StackTrace + }; + } + + private static SdkErrorObject ToSdkError(Exception ex) => new() + { + ErrorType = ex.GetType().FullName, + ErrorMessage = ex.Message, + StackTrace = ex.StackTrace?.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList() + }; + + /// + /// Discards a Task but observes any exception so it doesn't surface as an + /// UnobservedTaskException. Used for fire-and-forget START checkpoints + /// under AtLeastOncePerRetry semantics. The actual error still propagates + /// via CheckpointBatcher._terminalError: the next sync EnqueueAsync + /// or DrainAsync will rethrow with the original cause. + /// + private static void FireAndForget(Task task) + { + _ = task.ContinueWith( + static t => _ = t.Exception, + CancellationToken.None, + TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously, + TaskScheduler.Default); + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/TerminationManager.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/TerminationManager.cs new file mode 100644 index 000000000..1218f6ceb --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/TerminationManager.cs @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// The reason the execution was terminated. +/// +internal enum TerminationReason +{ + WaitScheduled, + RetryScheduled, + CallbackPending, + InvokePending, + CheckpointFailed +} + +/// +/// The result of a termination signal. +/// +internal sealed class TerminationResult +{ + public required TerminationReason Reason { get; init; } + public string? Message { get; init; } + public Exception? Exception { get; init; } +} + +/// +/// Manages the suspension signal for durable execution. 
/// <summary>
/// Holds the one-shot suspension signal for a durable execution. The signal
/// is a <see cref="TaskCompletionSource{TResult}"/> that completes on the
/// first <see cref="Terminate"/> call; later calls have no effect.
/// </summary>
internal sealed class TerminationManager
{
    // Continuations run asynchronously so that whoever awaits the signal does
    // not execute inline on the thread that calls Terminate().
    private readonly TaskCompletionSource<TerminationResult> _signal =
        new(TaskCreationOptions.RunContinuationsAsynchronously);

    // 0 = not terminated, 1 = terminated. Only ever flipped 0 -> 1.
    private int _tripped;

    /// <summary>
    /// Completes with the <see cref="TerminationResult"/> of the first
    /// <see cref="Terminate"/> call.
    /// </summary>
    public Task<TerminationResult> TerminationTask => _signal.Task;

    /// <summary>Whether <see cref="Terminate"/> has been called.</summary>
    public bool IsTerminated => Volatile.Read(ref _tripped) == 1;

    /// <summary>
    /// Requests suspension of the execution.
    /// </summary>
    /// <param name="reason">Why the execution is suspending.</param>
    /// <param name="message">Optional detail carried on the result.</param>
    /// <param name="exception">Optional exception carried on the result.</param>
    /// <returns>
    /// <c>true</c> if this call tripped the signal; <c>false</c> if it had
    /// already been tripped.
    /// </returns>
    public bool Terminate(TerminationReason reason, string? message = null, Exception? exception = null)
    {
        // Atomic 0 -> 1 flip: exactly one caller observes the previous value 0.
        var wonRace = Interlocked.CompareExchange(ref _tripped, 1, 0) == 0;
        if (wonRace)
        {
            var outcome = new TerminationResult
            {
                Reason = reason,
                Message = message,
                Exception = exception
            };
            _signal.TrySetResult(outcome);
        }

        return wonRace;
    }

    /// <summary>
    /// Trips the termination signal and returns a task that never completes,
    /// so the caller's await parks while <see cref="TerminationTask"/>
    /// completes instead.
    /// </summary>
    /// <typeparam name="T">Result type expected by the awaiting caller.</typeparam>
    /// <returns>A task backed by a completion source that is never completed.</returns>
    public Task<T> SuspendAndAwait<T>(TerminationReason reason, string? message = null, Exception? exception = null)
    {
        _ = Terminate(reason, message, exception);

        // Deliberately abandoned: nothing holds the source, so it is never set.
        var neverCompleted = new TaskCompletionSource<T>();
        return neverCompleted.Task;
    }
}
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkWaitOptions = Amazon.Lambda.Model.WaitOptions; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable wait operation. Suspends the workflow for a given duration without +/// consuming compute time; the service schedules a timer and re-invokes Lambda +/// when it fires. +/// +/// +/// Replay semantics — example: await ctx.WaitAsync(TimeSpan.FromHours(1)) +/// +/// Fresh: emit WAIT START → flush → suspend → service schedules timer. +/// Replay (SUCCEEDED): timer fired, return CompletedTask. +/// Replay (STARTED/PENDING): timer still ticking → re-suspend (or +/// short-circuit if the deadline already elapsed but SUCCEEDED hasn't +/// been stamped yet). +/// +/// See for the +/// suspension mechanics (Task.WhenAny race against TerminationManager). +/// +internal sealed class WaitOperation : DurableOperation +{ + private readonly int _waitSeconds; + + public WaitOperation( + string operationId, + string? name, + string? parentId, + int waitSeconds, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _waitSeconds = waitSeconds; + } + + protected override string OperationType => OperationTypes.Wait; + + protected override async Task StartAsync(CancellationToken cancellationToken) + { + // Sync-flush WAIT START before suspending — the service can't schedule + // a timer for a checkpoint it hasn't received. 
+ await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Wait, + Action = OperationAction.START, + SubType = OperationSubTypes.Wait, + Name = Name, + WaitOptions = new SdkWaitOptions { WaitSeconds = _waitSeconds } + }, cancellationToken); + + return await Termination.SuspendAndAwait( + TerminationReason.WaitScheduled, $"wait:{Name ?? OperationId}"); + } + + protected override Task ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + // Common post-timer case: service stamped the wait as SUCCEEDED + // and re-invoked Lambda. Workflow proceeds to the next step. + return Task.FromResult(null); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Service hasn't marked the wait complete yet. Either the timer + // is still ticking, or the deadline elapsed but SUCCEEDED hasn't + // been stamped yet — treat elapsed deadlines as "done" to avoid + // a pointless extra round-trip. + var expiresAtMs = existing.WaitDetails?.ScheduledEndTimestamp; + if (expiresAtMs is { } ts && DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() >= ts) + { + return Task.FromResult(null); + } + + // Timer still ticking — re-suspend without re-checkpointing. + // The original WAIT START is still authoritative. + return Termination.SuspendAndAwait( + TerminationReason.WaitScheduled, $"wait:{Name ?? OperationId}"); + + default: + throw new NonDeterministicExecutionException( + $"Wait operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/InvokeConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/InvokeConfig.cs new file mode 100644 index 000000000..b58f810a1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/InvokeConfig.cs @@ -0,0 +1,27 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
/// <summary>
/// Per-call settings for a chained invoke operation.
/// </summary>
/// <remarks>
/// This type carries no serializer settings: payload and result
/// serialization use the <c>ILambdaSerializer</c> registered on
/// <c>ILambdaContext.Serializer</c>.
/// </remarks>
public sealed class InvokeConfig
{
    /// <summary>
    /// Optional tenant identifier, sent with the chained invocation as
    /// <c>ChainedInvokeOptions.TenantId</c>. Defaults to <c>null</c>.
    /// </summary>
    public string? TenantId { get; set; }
}
/// <summary>
/// Base exception for chained invoke operations that end in a non-success
/// terminal state.
/// </summary>
/// <remarks>
/// Catch this type to handle every chained-invoke failure uniformly, or catch
/// <see cref="InvokeFailedException"/>, <see cref="InvokeTimedOutException"/>
/// or <see cref="InvokeStoppedException"/> to react to a specific outcome.
/// The class is not abstract, so it can also be thrown directly.
/// </remarks>
public class InvokeException : DurableExecutionException
{
    /// <summary>Creates an empty <see cref="InvokeException"/>.</summary>
    public InvokeException() { }

    /// <summary>Creates an <see cref="InvokeException"/> with the given message.</summary>
    /// <param name="message">The error message.</param>
    public InvokeException(string message) : base(message) { }

    /// <summary>Creates an <see cref="InvokeException"/> that wraps another exception.</summary>
    /// <param name="message">The error message.</param>
    /// <param name="innerException">The exception being wrapped.</param>
    public InvokeException(string message, Exception innerException) : base(message, innerException) { }

    /// <summary>Name of the invoked function (ARN, alias, or version).</summary>
    public string? FunctionName { get; init; }

    /// <summary>Type name of the original error, when known.</summary>
    public string? ErrorType { get; init; }

    /// <summary>Optional structured error data attached by the invoked function.</summary>
    public string? ErrorData { get; init; }

    /// <summary>Stack trace lines of the original error, when recorded.</summary>
    // NOTE(review): element type was not visible in the reviewed text and is
    // written here as string — confirm against the original declaration.
    public IReadOnlyList<string>? OriginalStackTrace { get; init; }
}
+ public InvokeTimedOutException(string message) : base(message) { } + /// Creates an wrapping an inner exception. + public InvokeTimedOutException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a chained invoke operation completes with status STOPPED +/// — the invocation was stopped administratively by the durable execution +/// service before reaching a normal terminal state. +/// +public class InvokeStoppedException : InvokeException +{ + /// Creates an empty . + public InvokeStoppedException() { } + /// Creates an with the given message. + public InvokeStoppedException(string message) : base(message) { } + /// Creates an wrapping an inner exception. + public InvokeStoppedException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs new file mode 100644 index 000000000..3b55cfa86 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs @@ -0,0 +1,229 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// One operation in the durable execution service's invocation envelope. +/// Property names mirror the wire format exactly so System.Text.Json can +/// populate this type declaratively. +/// +public sealed class Operation +{ + /// The operation's unique identifier. + [JsonPropertyName("Id")] + public string? Id { get; set; } + + /// Operation type — see . + [JsonPropertyName("Type")] + public string? Type { get; set; } + + /// Operation status — see . + [JsonPropertyName("Status")] + public string? Status { get; set; } + + /// User-supplied operation name (e.g., the step name). + [JsonPropertyName("Name")] + public string? 
Name { get; set; } + + /// Identifier of the parent operation, if any (used for nested contexts). + [JsonPropertyName("ParentId")] + public string? ParentId { get; set; } + + /// Operation sub-type, if any (e.g., for child contexts). + [JsonPropertyName("SubType")] + public string? SubType { get; set; } + + /// Unix-epoch milliseconds at which the operation started. + [JsonPropertyName("StartTimestamp")] + public long? StartTimestamp { get; set; } + + /// Unix-epoch milliseconds at which the operation ended. + [JsonPropertyName("EndTimestamp")] + public long? EndTimestamp { get; set; } + + /// Step-specific details (present when is STEP). + [JsonPropertyName("StepDetails")] + public StepDetails? StepDetails { get; set; } + + /// Wait-specific details (present when is WAIT). + [JsonPropertyName("WaitDetails")] + public WaitDetails? WaitDetails { get; set; } + + /// Execution-specific details (present when is EXECUTION). + [JsonPropertyName("ExecutionDetails")] + public ExecutionDetails? ExecutionDetails { get; set; } + + /// Callback-specific details (present when is CALLBACK). + [JsonPropertyName("CallbackDetails")] + public CallbackDetails? CallbackDetails { get; set; } + + /// Chained-invoke details (present when is CHAINED_INVOKE). + [JsonPropertyName("ChainedInvokeDetails")] + public ChainedInvokeDetails? ChainedInvokeDetails { get; set; } + + /// Child-context details (present when is CONTEXT). + [JsonPropertyName("ContextDetails")] + public ContextDetails? ContextDetails { get; set; } +} + +/// Details for a STEP operation. +public sealed class StepDetails +{ + /// Serialized step result. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error from the most recent attempt, if it failed. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } + + /// The attempt number (1-based). + [JsonPropertyName("Attempt")] + public int? 
Attempt { get; set; } + + /// Unix-epoch milliseconds at which the next retry attempt is scheduled. + [JsonPropertyName("NextAttemptTimestamp")] + public long? NextAttemptTimestamp { get; set; } +} + +/// Details for a WAIT operation. +public sealed class WaitDetails +{ + /// Unix-epoch milliseconds at which the wait is scheduled to end. + [JsonPropertyName("ScheduledEndTimestamp")] + public long? ScheduledEndTimestamp { get; set; } +} + +/// Details for an EXECUTION operation. +public sealed class ExecutionDetails +{ + /// The serialized user input payload for this invocation. + [JsonPropertyName("InputPayload")] + public string? InputPayload { get; set; } +} + +/// Details for a CALLBACK operation. +public sealed class CallbackDetails +{ + /// The callback identifier returned to the external system. + [JsonPropertyName("CallbackId")] + public string? CallbackId { get; set; } + + /// Serialized callback result. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error returned by the external system, if any. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} + +/// Details for a CHAINED_INVOKE operation. +public sealed class ChainedInvokeDetails +{ + /// Serialized result from the invoked function. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error returned by the invoked function, if any. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} + +/// Details for a CONTEXT operation (child contexts). +public sealed class ContextDetails +{ + /// Serialized result of the child context. + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// Error from the child context, if any. + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} + +/// +/// Wire-format string constants. +/// Plural name avoids collision with Amazon.Lambda.OperationType. +/// +public static class OperationTypes +{ + /// Step operation. 
+ public const string Step = "STEP"; + + /// Wait/timer operation. + public const string Wait = "WAIT"; + + /// Callback (external-system signal) operation. + public const string Callback = "CALLBACK"; + + /// Chained-invoke (durable-to-durable call) operation. + public const string ChainedInvoke = "CHAINED_INVOKE"; + + /// Child-context operation. + public const string Context = "CONTEXT"; + + /// Top-level execution operation carrying the user input payload. + public const string Execution = "EXECUTION"; +} + +/// +/// Wire-format string constants. SubType is a +/// finer-grained classifier sent alongside for +/// observability — the values are PascalCase ("Step", "Wait") and distinct +/// from the uppercase values. +/// +public static class OperationSubTypes +{ + /// Step sub-type. + public const string Step = "Step"; + + /// Wait sub-type. + public const string Wait = "Wait"; + + /// Callback sub-type. + public const string Callback = "Callback"; + + /// Wait-for-callback sub-type. + public const string WaitForCallback = "WaitForCallback"; + + /// Chained-invoke sub-type. + public const string ChainedInvoke = "ChainedInvoke"; + + /// Child-context sub-type. + public const string Context = "Context"; +} + +/// +/// Wire-format string constants. +/// Plural name avoids collision with Amazon.Lambda.OperationStatus. +/// +public static class OperationStatuses +{ + /// The operation has started. + public const string Started = "STARTED"; + + /// The operation completed successfully. + public const string Succeeded = "SUCCEEDED"; + + /// The operation failed. + public const string Failed = "FAILED"; + + /// The operation is pending (waiting for time, callback, or invocation). + public const string Pending = "PENDING"; + + /// The operation was cancelled. + public const string Cancelled = "CANCELLED"; + + /// The operation is ready to resume. + public const string Ready = "READY"; + + /// The operation was stopped. 
+ public const string Stopped = "STOPPED"; + + /// The operation timed out (e.g. callback or chained invoke timeout). + public const string TimedOut = "TIMED_OUT"; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/README.md b/Libraries/src/Amazon.Lambda.DurableExecution/README.md new file mode 100644 index 000000000..81f229ca8 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/README.md @@ -0,0 +1,106 @@ +# AWS Lambda Durable Execution SDK for .NET + +> **Preview.** `Amazon.Lambda.DurableExecution` is in active development (0.x). Public APIs may change before 1.0. + +`Amazon.Lambda.DurableExecution` is the .NET SDK for building resilient, long-running AWS Lambda functions that automatically checkpoint progress and resume after failures. Workflows can run for up to one year, with charges only for active compute time. + +## Key Features + +- **Automatic checkpointing** — progress is saved after each step; failures resume from the last checkpoint. +- **Cost-effective waits** — suspend execution for minutes, hours, or days without compute charges. +- **Configurable retries** — built-in retry strategies with exponential backoff and jitter. +- **Replay safety** — functions deterministically resume from checkpoints after interruptions. +- **Type safety** — full generic type support for step results. +- **AOT-friendly** — pluggable `ILambdaSerializer` so you can register `SourceGeneratorLambdaJsonSerializer` for trimmed / Native AOT functions. + +## How It Works + +Your handler delegates to `DurableFunction.WrapAsync`, which gives your workflow function an `IDurableContext`. The context is your interface to durable operations: + +- `ctx.StepAsync` — run code and checkpoint the result. ([docs](docs/core/steps.md)) +- `ctx.WaitAsync` — suspend execution without compute charges. ([docs](docs/core/wait.md)) +- `ctx.CreateCallbackAsync` / `ctx.WaitForCallbackAsync` — wait for external events (approvals, webhooks). 
([docs](docs/core/callbacks.md)) +- `ctx.RunInChildContextAsync` — run an isolated child context with its own checkpoint log. ([docs](docs/core/child-contexts.md)) + +## Quick Start + +### Installation + +```bash +dotnet add package Amazon.Lambda.DurableExecution +``` + +### Your first durable function + +> **Programming model:** the preview only supports the **executable programming model** — your function is an executable assembly that hosts its own bootstrap loop and passes the serializer to the runtime in code. Class-library handlers on the managed runtime will be supported once the changes made to Amazon.Lambda.RuntimeSupport to support durable functions have been deployed to the managed runtime. This README will be updated then. + +A complete order-processing workflow with two steps and a wait, deployed as an executable assembly on the `dotnet10` runtime. `Main` builds a `LambdaBootstrap` with your handler and an `ILambdaSerializer`, and `DurableFunction.WrapAsync` uses that serializer to checkpoint step inputs and outputs. 
+ +```csharp +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace OrderProcessor; + +public class OrderProcessor +{ + public static async Task Main() + { + var handler = new OrderProcessor(); + var serializer = new DefaultLambdaJsonSerializer(); + using var wrapper = HandlerWrapper.GetHandlerWrapper( + handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(wrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(Order order, IDurableContext ctx) + { + var reservation = await ctx.StepAsync( + async _ => await InventoryService.ReserveAsync(order.Items), + name: "reserve-inventory"); + + var payment = await ctx.StepAsync( + async _ => await PaymentService.ChargeAsync(order.PaymentMethod, order.Total), + name: "process-payment"); + + await ctx.WaitAsync(TimeSpan.FromHours(2), name: "warehouse-processing"); + + var shipment = await ctx.StepAsync( + async _ => await ShippingService.ShipAsync(reservation, order.Address), + name: "confirm-shipment"); + + return new OrderResult(order.Id, shipment.TrackingNumber); + } +} + +public record Order(string Id, IReadOnlyList Items, PaymentMethod PaymentMethod, decimal Total, Address Address); +public record OrderResult(string OrderId, string TrackingNumber); +``` + +For AOT or trim-friendly serialization, swap `DefaultLambdaJsonSerializer` for `SourceGeneratorLambdaJsonSerializer` and register your `JsonSerializerContext`. + +## Documentation + +**Core operations** + +- [Steps](docs/core/steps.md) — execute code with automatic checkpointing, retry strategies, and at-least/at-most-once semantics. +- [Wait](docs/core/wait.md) — pause execution without compute charges. 
+- [Callbacks](docs/core/callbacks.md) — wait for external systems to respond. +- [Child Contexts](docs/core/child-contexts.md) — group related operations into isolated, checkpointed units. + +**Examples** + +End-to-end test functions (each paired with an integration test) live under `Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/`. + +## Related SDKs + +- [aws-durable-execution-sdk-java](https://github.com/aws/aws-durable-execution-sdk-java) — Java SDK +- [aws-durable-execution-sdk-js](https://github.com/aws/aws-durable-execution-sdk-js) — JavaScript / TypeScript SDK +- [aws-durable-execution-sdk-python](https://github.com/aws/aws-durable-execution-sdk-python) — Python SDK diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/RetryStrategy.cs b/Libraries/src/Amazon.Lambda.DurableExecution/RetryStrategy.cs new file mode 100644 index 000000000..8e9c3e8a1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/RetryStrategy.cs @@ -0,0 +1,204 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.RegularExpressions; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Jitter strategy for exponential backoff to prevent thundering-herd scenarios. +/// +public enum JitterStrategy +{ + /// No randomization — delay is exactly the calculated backoff value. + None, + /// Random delay between 0 and the calculated backoff value (recommended). + Full, + /// Random delay between 50% and 100% of the calculated backoff value. + Half +} + +/// +/// Controls whether a step re-executes if the Lambda is re-invoked mid-attempt. +/// +public enum StepSemantics +{ + /// + /// Default. The step may re-execute if the Lambda is re-invoked during execution. + /// Use for idempotent operations. + /// + AtLeastOncePerRetry, + + /// + /// The step executes at most once per retry attempt. 
A START checkpoint is written + /// before execution; on replay with an existing START, the SDK skips re-execution + /// and proceeds to the retry handler. + /// + AtMostOncePerRetry +} + +/// +/// Factory methods for common retry strategies. +/// +public static class RetryStrategy +{ + /// 6 attempts, 2x backoff, 5s initial delay, 60s max, Full jitter. + public static IRetryStrategy Default { get; } = Exponential( + maxAttempts: 6, + initialDelay: TimeSpan.FromSeconds(5), + maxDelay: TimeSpan.FromSeconds(60), + backoffRate: 2.0, + jitter: JitterStrategy.Full); + + /// 3 attempts, 2x backoff, 1s initial delay, 5s max, Half jitter. + public static IRetryStrategy Transient { get; } = Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(5), + backoffRate: 2.0, + jitter: JitterStrategy.Half); + + /// No retry — 1 attempt only. + public static IRetryStrategy None { get; } = Exponential(maxAttempts: 1); + + /// + /// Creates an exponential backoff retry strategy. + /// + /// + /// Thrown if < 1, < 1, + /// is non-positive, is non-positive, + /// or > . + /// + public static IRetryStrategy Exponential( + int maxAttempts = 3, + TimeSpan? initialDelay = null, + TimeSpan? maxDelay = null, + double backoffRate = 2.0, + JitterStrategy jitter = JitterStrategy.Full, + Type[]? retryableExceptions = null, + string[]? retryableMessagePatterns = null) + { + return new ExponentialRetryStrategy( + maxAttempts, + initialDelay ?? TimeSpan.FromSeconds(5), + maxDelay ?? TimeSpan.FromSeconds(300), + backoffRate, + jitter, + retryableExceptions, + retryableMessagePatterns); + } + + /// + /// Creates a retry strategy from a delegate. + /// + /// Thrown if is null. 
+ public static IRetryStrategy FromDelegate(Func strategy) + { + if (strategy == null) throw new ArgumentNullException(nameof(strategy)); + return new DelegateRetryStrategy(strategy); + } +} + +internal sealed class ExponentialRetryStrategy : IRetryStrategy +{ + private readonly int _maxAttempts; + private readonly TimeSpan _initialDelay; + private readonly TimeSpan _maxDelay; + private readonly double _backoffRate; + private readonly JitterStrategy _jitter; + private readonly Type[]? _retryableExceptions; + private readonly Regex[]? _retryableMessagePatterns; + + public ExponentialRetryStrategy( + int maxAttempts, + TimeSpan initialDelay, + TimeSpan maxDelay, + double backoffRate, + JitterStrategy jitter, + Type[]? retryableExceptions, + string[]? retryableMessagePatterns) + { + if (maxAttempts < 1) + throw new ArgumentOutOfRangeException(nameof(maxAttempts), maxAttempts, "must be >= 1"); + if (initialDelay <= TimeSpan.Zero) + throw new ArgumentOutOfRangeException(nameof(initialDelay), initialDelay, "must be > 0"); + if (maxDelay <= TimeSpan.Zero) + throw new ArgumentOutOfRangeException(nameof(maxDelay), maxDelay, "must be > 0"); + if (initialDelay > maxDelay) + throw new ArgumentOutOfRangeException(nameof(initialDelay), initialDelay, $"must be <= maxDelay ({maxDelay})"); + if (backoffRate < 1.0 || double.IsNaN(backoffRate) || double.IsInfinity(backoffRate)) + throw new ArgumentOutOfRangeException(nameof(backoffRate), backoffRate, "must be a finite value >= 1.0"); + + _maxAttempts = maxAttempts; + _initialDelay = initialDelay; + _maxDelay = maxDelay; + _backoffRate = backoffRate; + _jitter = jitter; + _retryableExceptions = retryableExceptions; + _retryableMessagePatterns = retryableMessagePatterns? 
+ .Select(p => new Regex(p)) + .ToArray(); + } + + public RetryDecision ShouldRetry(Exception exception, int attemptNumber) + { + if (attemptNumber >= _maxAttempts) + return RetryDecision.DoNotRetry(); + + if (!IsRetryable(exception)) + return RetryDecision.DoNotRetry(); + + var delay = CalculateDelay(attemptNumber); + return RetryDecision.RetryAfter(delay); + } + + private bool IsRetryable(Exception exception) + { + if (_retryableExceptions == null && _retryableMessagePatterns == null) + return true; + + if (_retryableExceptions != null) + { + var exType = exception.GetType(); + if (_retryableExceptions.Any(t => t.IsAssignableFrom(exType))) + return true; + } + + if (_retryableMessagePatterns != null) + { + var message = exception.Message; + if (_retryableMessagePatterns.Any(p => p.IsMatch(message))) + return true; + } + + return false; + } + + internal TimeSpan CalculateDelay(int attemptNumber) + { + var baseDelay = _initialDelay.TotalSeconds * Math.Pow(_backoffRate, attemptNumber - 1); + var cappedDelay = Math.Min(baseDelay, _maxDelay.TotalSeconds); + + var finalDelay = _jitter switch + { + JitterStrategy.Full => Random.Shared.NextDouble() * cappedDelay, + JitterStrategy.Half => cappedDelay * (0.5 + 0.5 * Random.Shared.NextDouble()), + _ => cappedDelay + }; + + return TimeSpan.FromSeconds(Math.Max(1, Math.Ceiling(finalDelay))); + } +} + +internal sealed class DelegateRetryStrategy : IRetryStrategy +{ + private readonly Func _strategy; + + public DelegateRetryStrategy(Func strategy) + { + _strategy = strategy ?? 
throw new ArgumentNullException(nameof(strategy)); + } + + public RetryDecision ShouldRetry(Exception exception, int attemptNumber) + => _strategy(exception, attemptNumber); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs new file mode 100644 index 000000000..a38dda31b --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Model; +using Amazon.Runtime; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; +using SdkOperation = Amazon.Lambda.Model.Operation; +using Operation = Amazon.Lambda.DurableExecution.Operation; +using StepDetails = Amazon.Lambda.DurableExecution.StepDetails; +using WaitDetails = Amazon.Lambda.DurableExecution.WaitDetails; +using ExecutionDetails = Amazon.Lambda.DurableExecution.ExecutionDetails; +using ContextDetails = Amazon.Lambda.DurableExecution.ContextDetails; +using CallbackDetails = Amazon.Lambda.DurableExecution.CallbackDetails; +using ChainedInvokeDetails = Amazon.Lambda.DurableExecution.ChainedInvokeDetails; + +namespace Amazon.Lambda.DurableExecution.Services; + +/// +/// Calls the real AWS Lambda Durable Execution APIs via the AWSSDK.Lambda client. +/// +internal sealed class LambdaDurableServiceClient +{ + private readonly IAmazonLambda _lambdaClient; + + public LambdaDurableServiceClient(IAmazonLambda lambdaClient) + { + _lambdaClient = lambdaClient; + } + + /// + /// Flushes pending checkpoint operations to the durable execution service. 
+ /// SDK errors are wrapped in so user logs + /// show the durable-execution context (which API call, which ARN) alongside the + /// underlying SDK message — instead of a bare AWSSDK stack trace with no clue + /// about what was being called. + /// When is supplied, any + /// NewExecutionState.Operations the service returns (e.g. a freshly + /// allocated CallbackId after a callback START checkpoint, or a + /// timer-fired SUCCEEDED) are forwarded to the callback so the caller can + /// merge them into its in-memory . + /// + public async Task CheckpointAsync( + string durableExecutionArn, + string? checkpointToken, + IReadOnlyList pendingOperations, + Action>? onNewOperations = null, + CancellationToken cancellationToken = default) + { + if (pendingOperations.Count == 0) + return checkpointToken; + + var request = new CheckpointDurableExecutionRequest + { + DurableExecutionArn = durableExecutionArn, + CheckpointToken = checkpointToken ?? "", + Updates = pendingOperations is List list ? list : pendingOperations.ToList() + }; + + CheckpointDurableExecutionResponse response; + try + { + response = await _lambdaClient.CheckpointDurableExecutionAsync(request, cancellationToken); + } + catch (AmazonServiceException ex) + { + throw new DurableExecutionException( + $"Failed to checkpoint operations for durable execution '{durableExecutionArn}': {ex.Message}", + ex); + } + + // The service returns NewExecutionState carrying any operations updated + // since the last checkpoint — most importantly, the callback ID stamped + // onto a freshly-started CALLBACK op, plus any externally-completed + // callbacks/timers. Hand them to the caller (DurableFunction wires this + // back into ExecutionState) so subsequent replay-style lookups see the + // updated state immediately. 
+ var updated = response.NewExecutionState?.Operations; + if (onNewOperations != null && updated != null && updated.Count > 0) + { + var mapped = new List(updated.Count); + foreach (var sdkOp in updated) + mapped.Add(MapFromSdkOperation(sdkOp)); + onNewOperations(mapped); + } + + return response.CheckpointToken; + } + + /// + /// Fetches additional pages of execution state when the initial state is paginated. + /// SDK errors are wrapped in for the same + /// reason as . + /// + public async Task<(List Operations, string? NextMarker)> GetExecutionStateAsync( + string durableExecutionArn, + string? checkpointToken, + string marker, + CancellationToken cancellationToken = default) + { + var request = new GetDurableExecutionStateRequest + { + DurableExecutionArn = durableExecutionArn, + CheckpointToken = checkpointToken ?? "", + Marker = marker + }; + + GetDurableExecutionStateResponse response; + try + { + response = await _lambdaClient.GetDurableExecutionStateAsync(request, cancellationToken); + } + catch (AmazonServiceException ex) + { + throw new DurableExecutionException( + $"Failed to fetch execution state for durable execution '{durableExecutionArn}' (marker '{marker}'): {ex.Message}", + ex); + } + + var operations = new List(); + if (response.Operations != null) + { + foreach (var sdkOp in response.Operations) + { + operations.Add(MapFromSdkOperation(sdkOp)); + } + } + + return (operations, response.NextMarker); + } + + private static Operation MapFromSdkOperation(SdkOperation sdkOp) + { + return new Operation + { + Id = sdkOp.Id, + Type = sdkOp.Type, + Status = sdkOp.Status, + Name = sdkOp.Name, + ParentId = sdkOp.ParentId, + SubType = sdkOp.SubType, + StepDetails = sdkOp.StepDetails != null ? new StepDetails + { + Result = sdkOp.StepDetails.Result, + Error = MapError(sdkOp.StepDetails.Error), + Attempt = sdkOp.StepDetails.Attempt, + NextAttemptTimestamp = sdkOp.StepDetails.NextAttemptTimestamp.HasValue + ? 
new DateTimeOffset(sdkOp.StepDetails.NextAttemptTimestamp.Value, TimeSpan.Zero).ToUnixTimeMilliseconds() + : null + } : null, + WaitDetails = sdkOp.WaitDetails != null ? new WaitDetails + { + ScheduledEndTimestamp = sdkOp.WaitDetails.ScheduledEndTimestamp.HasValue + ? new DateTimeOffset(sdkOp.WaitDetails.ScheduledEndTimestamp.Value, TimeSpan.Zero).ToUnixTimeMilliseconds() + : null + } : null, + ExecutionDetails = sdkOp.ExecutionDetails != null ? new ExecutionDetails + { + InputPayload = sdkOp.ExecutionDetails.InputPayload + } : null, + ContextDetails = sdkOp.ContextDetails != null ? new ContextDetails + { + Result = sdkOp.ContextDetails.Result, + Error = MapError(sdkOp.ContextDetails.Error) + } : null, + CallbackDetails = sdkOp.CallbackDetails != null ? new CallbackDetails + { + CallbackId = sdkOp.CallbackDetails.CallbackId, + Result = sdkOp.CallbackDetails.Result, + Error = MapError(sdkOp.CallbackDetails.Error) + } : null, + ChainedInvokeDetails = sdkOp.ChainedInvokeDetails != null ? new ChainedInvokeDetails + { + Result = sdkOp.ChainedInvokeDetails.Result, + Error = MapError(sdkOp.ChainedInvokeDetails.Error) + } : null + }; + } + + /// + /// Maps an SDK into the + /// internal . Carries every field the wire object + /// exposes — ErrorType, ErrorMessage, ErrorData, and + /// StackTrace — so the durable execution exception builders + /// (, , and + /// the tree) can rehydrate the original + /// failure faithfully on real-service replay. + /// + private static ErrorObject? MapError(Amazon.Lambda.Model.ErrorObject? sdkError) + { + if (sdkError == null) return null; + return new ErrorObject + { + ErrorType = sdkError.ErrorType, + ErrorMessage = sdkError.ErrorMessage, + ErrorData = sdkError.ErrorData, + // SDK exposes List; assigning into IReadOnlyList? + // is reference-identical. A null list (SDK 4.x default when the + // field isn't set on the wire) propagates as null on our side. 
+ StackTrace = sdkError.StackTrace + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/StepConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/StepConfig.cs new file mode 100644 index 000000000..eea3dc791 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/StepConfig.cs @@ -0,0 +1,21 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for step execution. +/// +public sealed class StepConfig +{ + /// + /// Retry strategy for failed steps. When null (default), failures are not retried. + /// + public IRetryStrategy? RetryStrategy { get; set; } + + /// + /// Controls whether a step may re-execute if the Lambda is re-invoked mid-attempt. + /// Default is . + /// + public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/UpperSnakeCaseEnumConverter.cs b/Libraries/src/Amazon.Lambda.DurableExecution/UpperSnakeCaseEnumConverter.cs new file mode 100644 index 000000000..1ebfe58a1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/UpperSnakeCaseEnumConverter.cs @@ -0,0 +1,66 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Converts between UPPER_SNAKE_CASE wire format (e.g., CHAINED_INVOKE) +/// and PascalCase enum values (e.g., ChainedInvoke). 
+///
+public sealed class UpperSnakeCaseEnumConverter<T> : JsonConverter<T> where T : struct, Enum
+{
+    /// <summary>Reads an UPPER_SNAKE_CASE (or plain enum-name) JSON string as <typeparamref name="T"/>; JSON null yields <c>default</c>, an unrecognized name throws <see cref="JsonException"/>.</summary>
+    public override T Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
+    {
+        if (reader.TokenType == JsonTokenType.Null)
+            return default;
+
+        var value = reader.GetString();
+        if (value == null)
+            return default;
+
+        // Convert UPPER_SNAKE_CASE to PascalCase for enum lookup (e.g. "CHAINED_INVOKE" -> "ChainedInvoke")
+        var pascalCase = SnakeToPascal(value);
+
+        if (Enum.TryParse<T>(pascalCase, ignoreCase: true, out var result))
+            return result;
+
+        // Fallback: try direct case-insensitive parse of the raw value (covers input that is already an enum member name)
+        if (Enum.TryParse(value, ignoreCase: true, out result))
+            return result;
+
+        throw new JsonException($"Unable to parse '{value}' as {typeof(T).Name}.");
+    }
+
+    /// <summary>Writes the enum member name of <paramref name="value"/> as an UPPER_SNAKE_CASE JSON string.</summary>
+    public override void Write(Utf8JsonWriter writer, T value, JsonSerializerOptions options)
+    {
+        writer.WriteStringValue(PascalToSnake(value.ToString()));
+    }
+
+    private static string SnakeToPascal(string snake) // "CHAINED_INVOKE" -> "ChainedInvoke"; empty segments (doubled/leading '_') are dropped by the join
+    {
+        var parts = snake.Split('_');
+        for (int i = 0; i < parts.Length; i++)
+        {
+            if (parts[i].Length > 0) // invariant casing: culture rules (e.g. tr-TR 'I' -> dotless 'ı') must never alter wire names
+                parts[i] = char.ToUpperInvariant(parts[i][0]) + parts[i][1..].ToLowerInvariant();
+        }
+        return string.Join("", parts);
+    }
+
+    private static string PascalToSnake(string pascal) // "ChainedInvoke" -> "CHAINED_INVOKE"; '_' is inserted before every upper-case char except the first
+    {
+        var result = new System.Text.StringBuilder();
+        for (int i = 0; i < pascal.Length; i++)
+        {
+            if (i > 0 && char.IsUpper(pascal[i]))
+                result.Append('_');
+            result.Append(char.ToUpperInvariant(pascal[i])); // invariant: tr-TR would turn 'i' into 'İ'
+        }
+        return result.ToString();
+    }
+}
diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/WaitForCallbackConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForCallbackConfig.cs
new file mode 100644
index 000000000..90cf1f420
--- /dev/null
+++ b/Libraries/src/Amazon.Lambda.DurableExecution/WaitForCallbackConfig.cs
@@ -0,0 +1,21 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for the composite +/// +/// operation. Inherits the callback's and +/// ; adds a +/// for the submitter step. +/// +public class WaitForCallbackConfig : CallbackConfig +{ + /// + /// Retry strategy applied to the submitter step. When null (default), + /// submitter failures are not retried — the submitter step fails terminally + /// and surfaces as . + /// + public IRetryStrategy? RetryStrategy { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/callbacks.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/callbacks.md new file mode 100644 index 000000000..573ad17e3 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/callbacks.md @@ -0,0 +1,185 @@ +# Callbacks + +Callbacks let a workflow suspend until an external system (a human approver, a webhook, another service) delivers a result. The external system completes the callback by calling `SendDurableExecutionCallbackSuccess`, `SendDurableExecutionCallbackFailure`, or `SendDurableExecutionCallbackHeartbeat` with the `callbackId` you handed it. + +Two APIs are available: + +- `WaitForCallbackAsync` — composite operation; create the callback, hand it to the external system inside a submitter delegate, and suspend until the result arrives. +- `CreateCallbackAsync` — lower-level; allocate the callback yourself, hand the ID out in your own steps, and `await` the result separately. + +## `WaitForCallbackAsync` + +```csharp +Task WaitForCallbackAsync( + Func submitter, + string? name = null, + WaitForCallbackConfig? config = null, + CancellationToken cancellationToken = default); +``` + +The submitter receives the freshly allocated `callbackId` and an `IWaitForCallbackContext` (logger-only). 
Submitter failures (after retries are exhausted) surface as `CallbackSubmitterException`; callback failures and timeouts surface as `CallbackFailedException` / `CallbackTimeoutException`. + +## `CreateCallbackAsync` + +```csharp +Task> CreateCallbackAsync( + string? name = null, + CallbackConfig? config = null, + CancellationToken cancellationToken = default); +``` + +The returned `ICallback` exposes: + +- `string CallbackId` — give this to the external system. +- `Task GetResultAsync(CancellationToken)` — `await` to suspend until the external system completes the callback. + +The result is deserialized using the registered `ILambdaSerializer`. Throws `CallbackFailedException` or `CallbackTimeoutException` on failure. + +## End-to-end example + +Two Lambdas: a workflow that suspends on a callback, and a separate approver Lambda that resolves it. The workflow hands its `callbackId` to the approver via `Event` invocation (fire-and-forget), then suspends. The approver runs in its own Lambda and signals completion by calling `SendDurableExecutionCallbackSuccessAsync`. + +### 1. 
Workflow Lambda — `WaitForCallbackAsync` + +```csharp +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace OrderApprovalWorkflow; + +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main() + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var wrapper = HandlerWrapper.GetHandlerWrapper( + handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(wrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(OrderInput input, IDurableContext ctx) + { + var approverFunctionName = Environment.GetEnvironmentVariable("APPROVER_FUNCTION_NAME") + ?? throw new InvalidOperationException("APPROVER_FUNCTION_NAME env var not set"); + + // Suspend until the approver Lambda calls SendDurableExecutionCallbackSuccessAsync + // with this callback ID. The submitter is invoked once with a freshly-allocated + // ID; it hands the ID to the approver and returns immediately. + var result = await ctx.WaitForCallbackAsync( + submitter: async (callbackId, cbCtx) => + { + var payload = $$"""{"callbackId":"{{callbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = approverFunctionName, + InvocationType = InvocationType.Event, // fire-and-forget + Payload = payload + }); + }, + name: "approve"); + + return result; + } +} + +public record OrderInput(string OrderId); +public record ApprovalResult(string Status, string ApprovedBy); +``` + +### 2. Approver Lambda — completes the callback + +A plain Lambda — no durable execution wrapper. 
It receives the callback ID, performs whatever logic the external system needs, and calls `SendDurableExecutionCallbackSuccessAsync` to resume the workflow. + +```csharp +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace OrderApprovalWorkflow; + +public class ApproverFunction +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main() + { + var handler = new ApproverFunction(); + var serializer = new DefaultLambdaJsonSerializer(); + using var wrapper = HandlerWrapper.GetHandlerWrapper( + handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(wrapper); + await bootstrap.RunAsync(); + } + + public async Task Handler(ApproverInput input, ILambdaContext context) + { + // The result JSON must match the T in WaitForCallbackAsync — here, ApprovalResult. + var resultJson = $$"""{"Status":"approved","ApprovedBy":"{{input.OrderId}}"}"""; + await LambdaClient.SendDurableExecutionCallbackSuccessAsync( + new SendDurableExecutionCallbackSuccessRequest + { + CallbackId = input.CallbackId, + Result = new MemoryStream(Encoding.UTF8.GetBytes(resultJson)) + }); + return null; + } +} + +public record ApproverInput(string CallbackId, string OrderId); +``` + +To signal failure instead, call `SendDurableExecutionCallbackFailureAsync` — the workflow throws `CallbackFailedException`. To extend the heartbeat deadline (when `HeartbeatTimeout` is configured), call `SendDurableExecutionCallbackHeartbeatAsync`. + +### `CreateCallbackAsync` variant + +When you need to allocate the ID before deciding how to hand it out — e.g. several steps run between callback creation and submission — use `CreateCallbackAsync` and a separate `StepAsync` for the submission. Wrapping the hand-off in a step prevents replays from re-invoking the approver. 
+ +```csharp +private async Task Workflow(OrderInput input, IDurableContext ctx) +{ + var cb = await ctx.CreateCallbackAsync(name: "approve"); + + await ctx.StepAsync(async _ => + { + var payload = $$"""{"callbackId":"{{cb.CallbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = approverFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, name: "submit"); + + return await cb.GetResultAsync(); +} +``` + +## Configuration + +```csharp +public class CallbackConfig +{ + public TimeSpan Timeout { get; set; } // overall callback timeout, ≥ 1s or Zero (default = no timeout) + public TimeSpan HeartbeatTimeout { get; set; } // heartbeat-gap timeout, ≥ 1s or Zero (default = no timeout) +} + +public class WaitForCallbackConfig : CallbackConfig +{ + public IRetryStrategy? RetryStrategy { get; set; } // applied to the submitter step only +} +``` diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/child-contexts.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/child-contexts.md new file mode 100644 index 000000000..4a664e11e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/child-contexts.md @@ -0,0 +1,46 @@ +# Child Contexts + +`RunInChildContextAsync` runs a sub-workflow inside its own deterministic operation-ID space. The child's return value is checkpointed as a single `CONTEXT` operation, so subsequent invocations replay the cached value without re-executing the contained operations. Use it to group related steps under a shared error/observability boundary. + +## Signatures + +```csharp +Task RunInChildContextAsync( + Func> func, + string? name = null, + ChildContextConfig? config = null, + CancellationToken cancellationToken = default); + +Task RunInChildContextAsync( + Func func, + string? name = null, + ChildContextConfig? 
config = null, + CancellationToken cancellationToken = default); +``` + +## Example + +```csharp +var phaseResult = await ctx.RunInChildContextAsync( + async childCtx => + { + var validated = await childCtx.StepAsync(async _ => Validate(input), name: "validate"); + await childCtx.WaitAsync(TimeSpan.FromSeconds(2), name: "short_wait"); + var processed = await childCtx.StepAsync(async _ => Process(validated), name: "process"); + return processed; + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); +``` + +## Configuration + +```csharp +public sealed class ChildContextConfig +{ + public string? SubType { get; set; } // observability label + public Func? ErrorMapping { get; set; } // remap thrown exceptions +} +``` + +`ErrorMapping` lets you translate exceptions thrown inside the child context into a domain-specific exception type before they propagate to the parent. diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/steps.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/steps.md new file mode 100644 index 000000000..c7f9e9f22 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/steps.md @@ -0,0 +1,148 @@ +# Steps + +`StepAsync` runs a unit of work whose result is checkpointed. On replay, completed steps return their cached result without re-executing. + +## Signatures + +```csharp +Task StepAsync( + Func> func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); + +Task StepAsync( + Func func, + string? name = null, + StepConfig? config = null, + CancellationToken cancellationToken = default); +``` + +The `IStepContext` parameter exposes the current `AttemptNumber`, the deterministic `OperationId`, and a scoped `Logger`. Returned values are serialized via the `ILambdaSerializer` registered on `ILambdaContext.Serializer`. 
+ +## Basic step + +```csharp +var user = await ctx.StepAsync( + async _ => await userService.GetUserAsync(userId), + name: "fetch-user"); +``` + +## Multiple steps + +```csharp +var a = await ctx.StepAsync(async _ => $"a-{input.OrderId}", name: "step_1"); +var b = await ctx.StepAsync(async _ => $"{a}-b", name: "step_2"); +var c = await ctx.StepAsync(async _ => $"{b}-c", name: "step_3"); +``` + +## Step configuration + +Configure step behavior with `StepConfig`: + +```csharp +public sealed class StepConfig +{ + public IRetryStrategy? RetryStrategy { get; set; } // null = no retry + public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry; +} +``` + +### Retry strategies + +When a step throws, the configured `IRetryStrategy` decides whether to retry and after what delay. + +```csharp +public interface IRetryStrategy +{ + RetryDecision ShouldRetry(Exception exception, int attemptNumber); +} + +public readonly struct RetryDecision +{ + public bool ShouldRetry { get; } + public TimeSpan Delay { get; } + + public static RetryDecision DoNotRetry(); + public static RetryDecision RetryAfter(TimeSpan delay); +} +``` + +Built-in strategies on the `RetryStrategy` static class: + +| Member | Behavior | +| --- | --- | +| `RetryStrategy.Default` | 6 attempts, 2× backoff, 5s initial, 60s max, Full jitter. | +| `RetryStrategy.Transient` | 3 attempts, 2× backoff, 1s initial, 5s max, Half jitter. | +| `RetryStrategy.None` | 1 attempt only — no retry. | +| `RetryStrategy.Exponential(...)` | Builder for custom exponential strategies. | +| `RetryStrategy.FromDelegate(Func)` | Wrap a custom decision function. | + +`Exponential` parameters: + +```csharp +public static IRetryStrategy Exponential( + int maxAttempts = 3, + TimeSpan? initialDelay = null, // default 5s + TimeSpan? maxDelay = null, // default 300s + double backoffRate = 2.0, + JitterStrategy jitter = JitterStrategy.Full, + Type[]? retryableExceptions = null, + string[]? 
retryableMessagePatterns = null); + +public enum JitterStrategy { None, Full, Half } +``` + +When `retryableExceptions` and `retryableMessagePatterns` are both null (default), every exception is retried up to `maxAttempts`. If either is set, only matching exceptions are retried. + +#### Step with retries + +```csharp +var result = await ctx.StepAsync( + async stepCtx => + { + if (stepCtx.AttemptNumber < 3) + throw new InvalidOperationException($"flake on attempt {stepCtx.AttemptNumber}"); + return $"ok on attempt {stepCtx.AttemptNumber}"; + }, + name: "flaky_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); +``` + +### Step semantics + +Control how a step behaves when interrupted mid-execution: + +```csharp +public enum StepSemantics +{ + AtLeastOncePerRetry, // default — body may re-execute if Lambda is re-invoked mid-attempt + AtMostOncePerRetry // body executes at most once per retry attempt +} +``` + +| Semantic | Behavior | Use case | +| --- | --- | --- | +| `AtLeastOncePerRetry` (default) | Re-executes the step if interrupted before completion. | Idempotent operations (database upserts, API calls with idempotency keys). | +| `AtMostOncePerRetry` | Never re-executes; throws if interrupted. | Non-idempotent operations (sending email, charging payments). | + +These semantics apply *per retry attempt*, not per overall execution. 
To achieve true at-most-once across the whole workflow, combine with `RetryStrategy.None`: + +```csharp +var result = await ctx.StepAsync( + async _ => await paymentService.ChargeAsync(amount), + name: "charge-payment", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.None + }); +``` diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait.md b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait.md new file mode 100644 index 000000000..d7d2679f4 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/docs/core/wait.md @@ -0,0 +1,28 @@ +# Wait + +`WaitAsync` suspends the workflow for a duration. The Lambda terminates and is re-invoked when the timer fires — you are not billed for compute time while the workflow is suspended, only once it resumes. + +## Signature + +```csharp +Task WaitAsync( + TimeSpan duration, + string? name = null, + CancellationToken cancellationToken = default); +``` + +`duration` must be at least 1 second and at most 31,622,400 seconds (~1 year). 
+ +## Example + +```csharp +await ctx.WaitAsync(TimeSpan.FromHours(2), name: "warehouse-processing"); +``` + +## Step + Wait + Step + +```csharp +var validated = await ctx.StepAsync(async _ => Validate(input), name: "validate"); +await ctx.WaitAsync(TimeSpan.FromSeconds(3), name: "short_wait"); +var processed = await ctx.StepAsync(async _ => Process(validated), name: "process"); +``` diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Amazon.Lambda.DurableExecution.AotPublishTest.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Amazon.Lambda.DurableExecution.AotPublishTest.csproj new file mode 100644 index 000000000..ec4d0ffd0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Amazon.Lambda.DurableExecution.AotPublishTest.csproj @@ -0,0 +1,24 @@ + + + + Exe + net8.0 + enable + enable + true + true + full + false + true + IL2026,IL2067,IL2075,IL3050 + false + + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Program.cs b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Program.cs new file mode 100644 index 000000000..41404ca96 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.AotPublishTest/Program.cs @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text.Json.Serialization; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace Amazon.Lambda.DurableExecution.AotPublishTest; + +/// +/// AOT publish smoke check. This program must publish under NativeAOT with +/// zero IL2026/IL3050 warnings (promoted to errors by the csproj). The serializer +/// registered with is the same one DurableExecution +/// reads via , so AOT-safety is fully determined +/// by the user's choice of serializer (here, ). 
+/// +public class Program +{ + public static async Task Main() + { + var serializer = new SourceGeneratorLambdaJsonSerializer(); + Func> handler = HandlerAsync; + await LambdaBootstrapBuilder + .Create(handler, serializer) + .Build() + .RunAsync(); + } + + public static Task HandlerAsync( + DurableExecutionInvocationInput input, ILambdaContext context) => + DurableFunction.WrapAsync(WorkflowAsync, input, context); + + private static async Task WorkflowAsync(OrderEvent input, IDurableContext context) + { + var validation = await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + return new ValidationResult { IsValid = true }; + }, + name: "validate"); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + + return new OrderResult { Status = validation.IsValid ? "approved" : "rejected", OrderId = input.OrderId }; + } + + public class OrderEvent + { + public string? OrderId { get; set; } + } + + public class OrderResult + { + public string? Status { get; set; } + public string? 
OrderId { get; set; } + } + + public class ValidationResult + { + public bool IsValid { get; set; } + } +} + +[JsonSerializable(typeof(DurableExecutionInvocationInput))] +[JsonSerializable(typeof(DurableExecutionInvocationOutput))] +[JsonSerializable(typeof(Program.OrderEvent))] +[JsonSerializable(typeof(Program.OrderResult))] +[JsonSerializable(typeof(Program.ValidationResult))] +public partial class AotJsonContext : JsonSerializerContext +{ +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj new file mode 100644 index 000000000..0ef2e561d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj @@ -0,0 +1,43 @@ + + + + + + + $(DefaultPackageTargets) + enable + enable + false + true + $(NoWarn);NU1903;CS1591 + + + + + + + + + + + + + PreserveNewest + + + + + + + + + + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/AtMostOnceCrashReplayTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/AtMostOnceCrashReplayTest.cs new file mode 100644 index 000000000..ae36c22dd --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/AtMostOnceCrashReplayTest.cs @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class AtMostOnceCrashReplayTest +{ + private readonly ITestOutputHelper _output; + public AtMostOnceCrashReplayTest(ITestOutputHelper output) => _output = output; + + /// + /// Validates the AtMostOncePerRetry crash-recovery wire path: the Lambda + /// process is killed mid-step on attempt 1 (after START flush, before + /// SUCCEED). On re-invocation the SDK sees a STARTED checkpoint with no + /// terminal record and routes through the retry strategy rather than + /// re-executing the step. Attempt 2 succeeds. + /// + /// This is the only path that exercises the StepInterruptedException + /// synthesis — the unit-test analogue + /// (StepAsync_AtMostOnce_StartedReplay_TriggersRetryHandler) fakes the + /// STARTED state in-memory and never proves the service actually delivers + /// it on a real crash. + /// + [Fact] + public async Task AtMostOnce_StepCrashesMidExecution_RecoversViaRetry() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("AtMostOnceCrashFunction"), + "amocrash", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 2s retry delay + initial-attempt cold-start + recovery invoke. Generous headroom. 
+ var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.StepSucceededDetails != null && e.Name == "crash_then_recover") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Attempt 1 was crashed (no SUCCEED), attempt 2 recovered. + // We expect exactly one StepSucceeded carrying "recovered on attempt 2". + var succeeded = events.SingleOrDefault(e => e.StepSucceededDetails != null && e.Name == "crash_then_recover"); + Assert.NotNull(succeeded); + Assert.Equal("\"recovered on attempt 2\"", succeeded!.StepSucceededDetails.Result?.Payload); + + // Two StepStarted events: one per invocation. + Assert.True( + events.Count(e => e.EventType == EventType.StepStarted) >= 2, + "Expected at least 2 StepStarted events (attempt 1 crashed, attempt 2 recovered)."); + + // The crash-recovery branch records the synthesized StepInterruptedException + // as a StepFailed event for attempt 1, with a message identifying the lost + // attempt rather than a user exception type. + var failures = events + .Where(e => e.StepFailedDetails != null && e.Name == "crash_then_recover") + .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty) + .ToList(); + Assert.NotEmpty(failures); + Assert.Contains(failures, m => m.Contains("Step result lost", StringComparison.OrdinalIgnoreCase) + || m.Contains("interrupted", StringComparison.OrdinalIgnoreCase) + || m.Contains("previous attempt", StringComparison.OrdinalIgnoreCase)); + + // The execution actually crossed at least one invocation boundary + // (otherwise replay wasn't exercised at all). 
+ var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected at least 2 InvocationCompleted events (proves crash + replay), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackFailedTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackFailedTest.cs new file mode 100644 index 000000000..3a1e6c2c9 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackFailedTest.cs @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class CallbackFailedTest +{ + private readonly ITestOutputHelper _output; + public CallbackFailedTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end failure path for CreateCallbackAsync: + /// a paired RejecterFunction Lambda (Event-invoked from the workflow) + /// reports a failure via SendDurableExecutionCallbackFailure. The SDK + /// raises from GetResultAsync, + /// and the workflow surfaces FAILED with that exception type recorded. + /// + /// + /// The callback delivery has to come from a separate Lambda — not from the + /// test process — because the test's synchronous InvokeAsync blocks + /// until the durable execution reaches a terminal state. If the test tried + /// to deliver the callback itself, it would deadlock against its own + /// blocked Invoke. 
+ /// + [Fact] + public async Task CallbackFailed_SurfacesAsCallbackFailedException() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("CallbackFailedFunction"), + "cb-failed", _output, + externalFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("RejecterFunction")); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + // The workflow's surfaced exception is CallbackFailedException — the SDK + // wraps the external error message into the exception's Message. Verify + // the recorded error type is the SDK's CallbackFailedException and that + // the original failure message survives. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Equal(typeof(CallbackFailedException).FullName, execution.Error.ErrorType); + Assert.Contains("rejected", execution.Error.ErrorMessage); + + // History records both Started and Failed for the same callback. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.CallbackFailed) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted)); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackFailed)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackTimeoutTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackTimeoutTest.cs new file mode 100644 index 000000000..7f50091c9 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/CallbackTimeoutTest.cs @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class CallbackTimeoutTest +{ + private readonly ITestOutputHelper _output; + public CallbackTimeoutTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end timeout path for CreateCallbackAsync: + /// the workflow waits on a callback whose + /// elapses before any result is delivered. The service marks the callback as + /// TIMED_OUT, the SDK throws , and the + /// workflow surfaces FAILED with that exception type recorded. 
+ /// + [Fact] + public async Task CallbackTimeout_SurfacesAsCallbackTimeoutException() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("CallbackTimeoutFunction"), + "cb-timeout", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Capture the CallbackId before the timeout fires so we can assert it + // on the surfaced exception. CallbackStarted has the ID; CallbackTimedOut + // typically does not echo it back on the event. + var callbackId = await WaitForCallbackIdAsync(deployment, arn!, TimeSpan.FromSeconds(30)); + Assert.False(string.IsNullOrEmpty(callbackId)); + _output.WriteLine($"Service-allocated CallbackId: {callbackId}"); + + // The configured timeout is 10s; allow generous headroom for service + // latency (timer scheduling + re-invoke + Lambda cold start). + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + // The execution surfaces the SDK's CallbackTimeoutException to the user. + // ErrorObject.FromException records ErrorType as the FullName; verify both + // the type and that the recorded message mentions "timed out". + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Equal(typeof(CallbackTimeoutException).FullName, execution.Error.ErrorType); + Assert.Contains("timed out", execution.Error.ErrorMessage, StringComparison.OrdinalIgnoreCase); + + // History records both Started and TimedOut for the same callback. 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.CallbackTimedOut) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted)); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackTimedOut)); + } + + private static async Task WaitForCallbackIdAsync( + DurableFunctionDeployment deployment, string arn, TimeSpan timeout) + { + var history = await deployment.WaitForHistoryAsync( + arn, + h => h.Events?.Any(e => + e.CallbackStartedDetails != null + && !string.IsNullOrEmpty(e.CallbackStartedDetails.CallbackId)) ?? false, + timeout); + return history.Events? + .Where(e => e.CallbackStartedDetails != null + && !string.IsNullOrEmpty(e.CallbackStartedDetails.CallbackId)) + .Select(e => e.CallbackStartedDetails.CallbackId) + .FirstOrDefault(); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextFailsTest.cs new file mode 100644 index 000000000..b2bcc76f8 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextFailsTest.cs @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ChildContextFailsTest +{ + private readonly ITestOutputHelper _output; + public ChildContextFailsTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end RunInChildContextAsync failure path: the user func inside the + /// child throws, the SDK emits a CONTEXT FAIL checkpoint, the child's prior + /// inner step is preserved, and the workflow is marked FAILED with the + /// original exception details surfaced via ContextFailedDetails.Error. + /// + [Fact] + public async Task ChildContext_FailureSurfacesAsContextFailed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ChildContextFailsFunction"), + "childctxfail", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-fail"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload; locate the execution by name. + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Contains("intentional child context failure", execution.Error.ErrorMessage); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ContextStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? 
false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + var contextStarted = events.SingleOrDefault(e => e.EventType == EventType.ContextStarted && e.Name == "phase"); + Assert.NotNull(contextStarted); + Assert.Equal("OrderProcessing", contextStarted!.SubType); + // The child context op itself is at root — its boundary opens at the parent scope. + Assert.Null(contextStarted.ParentId); + + // The CONTEXT FAIL record carries the original exception details and + // closes the boundary back at the parent scope (root, ParentId=null). + var contextFailed = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed && e.Name == "phase"); + Assert.NotNull(contextFailed); + Assert.Null(contextFailed!.ParentId); + var error = contextFailed.ContextFailedDetails.Error?.Payload; + Assert.NotNull(error); + Assert.Contains("intentional child context failure", error!.ErrorMessage ?? string.Empty); + Assert.Equal(typeof(InvalidOperationException).FullName, error.ErrorType); + // The wire ErrorObject preserves StackTrace from ToSdkError end-to-end — + // the service stores it and returns it on replay (or directly in the + // history event), so user-facing ChildContextException.OriginalStackTrace + // is populated rather than dropped. + Assert.NotNull(error.StackTrace); + Assert.NotEmpty(error.StackTrace); + + // The step that ran before the throw was checkpointed under the child. + var contextOpId = contextStarted.Id; + var innerStep = events.SingleOrDefault( + e => e.StepSucceededDetails != null && e.Name == "prepare" && e.ParentId == contextOpId); + Assert.NotNull(innerStep); + Assert.Equal("\"prepared-integ-test-fail\"", innerStep!.StepSucceededDetails.Result?.Payload); + + // Every inner step/wait event for this workflow is parented under the + // child context — the child is a single observability boundary. 
+ var innerOpEvents = events + .Where(e => e.StepStartedDetails != null + || e.StepSucceededDetails != null + || e.StepFailedDetails != null + || e.WaitStartedDetails != null + || e.WaitSucceededDetails != null) + .ToList(); + Assert.NotEmpty(innerOpEvents); + Assert.All(innerOpEvents, e => Assert.Equal(contextOpId, e.ParentId)); + + // The child never reached SUCCEED; the workflow body past the throw is unreachable. + Assert.DoesNotContain(events, e => e.EventType == EventType.ContextSucceeded); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextRetryFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextRetryFailsTest.cs new file mode 100644 index 000000000..90fafcd61 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ChildContextRetryFailsTest.cs @@ -0,0 +1,114 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ChildContextRetryFailsTest +{ + private readonly ITestOutputHelper _output; + public ChildContextRetryFailsTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end: a step inside a child context retries until exhausted, then + /// the child closes as ContextFailed. Validates the child is a single + /// retry/error boundary — every per-attempt StepStarted/StepFailed (and the + /// terminal ContextFailed's surfaced exception) reflect the same logical + /// failure under the same parent op id. 
+ /// + [Fact] + public async Task ChildContext_RetryExhaustionInsideChild_AllAttemptsParentedUnderChild() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ChildContextRetryFailsFunction"), + "childctxretry", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-retry"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 3 attempts with 2s + 4s retry delays plus service-driven re-invokes. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Contains("always-fails", execution.Error.ErrorMessage); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ContextStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false) + && (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 3, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + var contextStarted = events.SingleOrDefault(e => e.EventType == EventType.ContextStarted && e.Name == "phase"); + Assert.NotNull(contextStarted); + var contextOpId = contextStarted!.Id; + Assert.NotNull(contextOpId); + + // All 3 step attempts (with their per-attempt StepFailed records) ran + // inside the child boundary. 
+ var stepStarted = events.Where(e => e.EventType == EventType.StepStarted && e.Name == "always_fails").ToList(); + Assert.Equal(3, stepStarted.Count); + Assert.All(stepStarted, e => Assert.Equal(contextOpId, e.ParentId)); + + var stepFailed = events.Where(e => e.StepFailedDetails != null && e.Name == "always_fails").ToList(); + Assert.Equal(3, stepFailed.Count); + Assert.All(stepFailed, e => Assert.Equal(contextOpId, e.ParentId)); + + // The per-attempt failure messages reflect the user's exception. + var failureMessages = stepFailed + .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty) + .ToList(); + Assert.Contains(failureMessages, m => m.Contains("attempt 1")); + Assert.Contains(failureMessages, m => m.Contains("attempt 2")); + Assert.Contains(failureMessages, m => m.Contains("attempt 3")); + + // Each StepFailed event preserves StackTrace through the wire — proves + // StepDetails.Error mapping doesn't drop frames. + Assert.All(stepFailed, e => + { + var stack = e.StepFailedDetails.Error?.Payload?.StackTrace; + Assert.NotNull(stack); + Assert.NotEmpty(stack); + }); + + // The child closes the boundary at the parent scope (root) and surfaces + // the underlying exception type — a single retry/error envelope. + var contextFailed = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed && e.Name == "phase"); + Assert.NotNull(contextFailed); + Assert.Null(contextFailed!.ParentId); + var contextError = contextFailed.ContextFailedDetails.Error?.Payload; + Assert.NotNull(contextError); + Assert.Contains("always-fails", contextError!.ErrorMessage ?? string.Empty); + // StackTrace round-trips end-to-end — the service preserves it from the + // checkpointed FAIL update and returns it on replay/history. 
public class ChildContextTest
{
    private readonly ITestOutputHelper _output;
    public ChildContextTest(ITestOutputHelper output) => _output = output;

    /// <summary>
    /// End-to-end RunInChildContextAsync: the workflow runs a child context that
    /// performs step + wait + step and returns a typed result. The unit tests
    /// fake state transitions in-memory; this test verifies the service actually
    /// round-trips CONTEXT START/SUCCEED records, parents the inner step/wait
    /// events under the context op, and persists the child's return value as
    /// the ContextSucceeded payload.
    /// </summary>
    [Fact]
    public async Task ChildContext_CompletesViaService()
    {
        await using var deployment = await DurableFunctionDeployment.CreateAsync(
            DurableFunctionDeployment.FindTestFunctionDir("ChildContextFunction"),
            "childctx", _output);

        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-456"}""");
        Assert.Equal(200, invokeResponse.StatusCode);

        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
        _output.WriteLine($"Response: {responsePayload}");

        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
        Assert.NotNull(arn);

        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
        Assert.Equal("SUCCEEDED", status, ignoreCase: true);

        // Keep polling history until every event this test asserts on is present:
        // context start/succeed, both step successes and the wait success.
        var history = await deployment.WaitForHistoryAsync(
            arn!,
            h => (h.Events?.Any(e => e.EventType == EventType.ContextStarted) ?? false)
                 && (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded) ?? false)
                 && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2
                 && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false),
            TimeSpan.FromSeconds(60));
        var events = history.Events ?? new List<Event>();

        // Exactly one child context was opened and closed successfully.
        var contextStarted = events.SingleOrDefault(e => e.EventType == EventType.ContextStarted && e.Name == "phase");
        Assert.NotNull(contextStarted);
        Assert.Equal("OrderProcessing", contextStarted!.SubType);

        // The child boundary opens and closes at the parent scope (root, ParentId=null).
        Assert.Null(contextStarted.ParentId);

        var contextSucceeded = events.SingleOrDefault(e => e.EventType == EventType.ContextSucceeded && e.Name == "phase");
        Assert.NotNull(contextSucceeded);
        Assert.Null(contextSucceeded!.ParentId);

        // The child's return value was checkpointed as the CONTEXT SUCCEED payload.
        Assert.Equal(
            "\"processed-validated-integ-test-456\"",
            contextSucceeded.ContextSucceededDetails.Result?.Payload);

        // Inner operations are parented to the context op so the service
        // visualizes them nested under the child.
        var contextOpId = contextStarted.Id;
        Assert.NotNull(contextOpId);

        var innerStepEvents = events
            .Where(e => e.EventType == EventType.StepStarted && e.ParentId == contextOpId)
            .OrderBy(e => e.EventTimestamp)
            .ToList();
        Assert.Equal(2, innerStepEvents.Count);
        Assert.Equal("validate", innerStepEvents[0].Name);
        Assert.Equal("process", innerStepEvents[1].Name);

        var innerWaitStarted = events.SingleOrDefault(
            e => e.WaitStartedDetails != null && e.Name == "short_wait" && e.ParentId == contextOpId);
        Assert.NotNull(innerWaitStarted);
        Assert.Equal(2, innerWaitStarted!.WaitStartedDetails.Duration);

        // Inner step results chain: validate -> wait -> process.
        // Payloads are compared with the surrounding JSON quotes trimmed.
        var stepResults = events
            .Where(e => e.StepSucceededDetails != null && e.ParentId == contextOpId)
            .OrderBy(e => e.EventTimestamp)
            .Select(e => (Name: e.Name, Payload: e.StepSucceededDetails.Result?.Payload?.Trim('"')))
            .ToList();
        Assert.Equal(2, stepResults.Count);
        Assert.Equal("validate", stepResults[0].Name);
        Assert.Equal("validated-integ-test-456", stepResults[0].Payload);
        Assert.Equal("process", stepResults[1].Name);
        Assert.Equal("processed-validated-integ-test-456", stepResults[1].Payload);

        // Every inner step/wait event for this workflow is parented under the
        // child context — the child is a single observability boundary.
        var innerOpEvents = events
            .Where(e => e.StepStartedDetails != null
                        || e.StepSucceededDetails != null
                        || e.StepFailedDetails != null
                        || e.WaitStartedDetails != null
                        || e.WaitSucceededDetails != null)
            .ToList();
        Assert.NotEmpty(innerOpEvents);
        Assert.All(innerOpEvents, e => Assert.Equal(contextOpId, e.ParentId));
    }
}
+ /// + [Fact] + public async Task CreateCallback_DeliversResultViaSendSuccess() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("CreateCallbackHappyPathFunction"), + "cb-happy", _output, + externalFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("ApproverFunction")); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId":"integ-test"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The execution result mirrors the payload the approver sent — proves + // GetResultAsync deserialized the wire-level callback Result and the + // workflow returned it. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Result); + Assert.Contains("approved", execution.Result); + Assert.Contains("integ-test", execution.Result); + + // History shows the canonical callback lifecycle: Started then Succeeded. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.CallbackSucceeded) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted)); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackSucceeded)); + + var succeeded = events.First(e => e.CallbackSucceededDetails != null); + Assert.Equal("approve", succeeded.Name); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs new file mode 100644 index 000000000..e3247e26c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs @@ -0,0 +1,859 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text; +using System.Text.Json; +using Amazon; +using Amazon.ECR; +using Amazon.ECR.Model; +using Amazon.IdentityManagement; +using Amazon.IdentityManagement.Model; +using Amazon.Lambda; +using Amazon.Lambda.Model; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +/// +/// Builds, deploys, and invokes a single durable Lambda function for an integration test. +/// Manages the full lifecycle: IAM role, ECR repo, Docker image, Lambda function. +/// All resources are torn down on DisposeAsync. +/// +internal sealed class DurableFunctionDeployment : IAsyncDisposable +{ + private readonly ITestOutputHelper _output; + private readonly IAmazonLambda _lambdaClient; + private readonly IAmazonECR _ecrClient; + private readonly IAmazonIdentityManagementService _iamClient; + + private readonly string _functionName; + private readonly string _repoName; + private readonly string _roleName; + private string? _roleArn; + private string? _imageUri; + private string? 
_functionArn; + private bool _functionCreated; + private bool _ecrRepoCreated; + private readonly List _inlinePolicyNames = new(); + + // Optional paired "external system" Lambda — a plain (non-durable) function + // that the workflow's submitter invokes. Models a real-world callback flow + // where an out-of-band service resolves the durable execution. + private readonly string _externalFunctionName; + private readonly string _externalRepoName; + private readonly string _externalRoleName; + private string? _externalRoleArn; + private bool _externalFunctionCreated; + private bool _externalEcrRepoCreated; + + public string FunctionName => _functionName; + public string? ExternalFunctionName => _externalFunctionCreated ? _externalFunctionName : null; + + /// + /// The fully-qualified function ARN (unqualified). Available after + /// or completes. Use $"{FunctionArn}:$LATEST" + /// when constructing a qualified identifier for chained invocation. + /// + public string FunctionArn => _functionArn + ?? throw new InvalidOperationException("Function ARN is not available until the function has been created."); + + public IAmazonLambda LambdaClient => _lambdaClient; + + private DurableFunctionDeployment(ITestOutputHelper output, string suffix) + { + _output = output; + _lambdaClient = new AmazonLambdaClient(RegionEndpoint.USEast1); + _ecrClient = new AmazonECRClient(RegionEndpoint.USEast1); + _iamClient = new AmazonIdentityManagementServiceClient(RegionEndpoint.USEast1); + + // Truncate the GUID (not the suffix) so CloudTrail entries stay readable. + // Keep the GUID short enough that the total stays well under 40 chars even for long suffixes. 
+ static string ShortId() => Guid.NewGuid().ToString("N")[..Math.Min(8, 32)]; + _functionName = $"durable-integ-{suffix}-{ShortId()}"; + _repoName = $"durable-integ-{suffix}-{ShortId()}"; + _roleName = $"durable-integ-{suffix}-{ShortId()}"; + _externalFunctionName = $"durable-integ-{suffix}-ext-{ShortId()}"; + _externalRepoName = $"durable-integ-{suffix}-ext-{ShortId()}"; + _externalRoleName = $"durable-integ-{suffix}-ext-{ShortId()}"; + } + + public static async Task CreateAsync( + string testFunctionDir, + string scenarioSuffix, + ITestOutputHelper output, + string? externalFunctionDir = null, + IDictionary? environment = null, + IReadOnlyList? invokeAllowedFunctionArns = null, + bool enableTenancy = false) + { + var deployment = new DurableFunctionDeployment(output, scenarioSuffix); + try + { + await deployment.InitializeAsync(testFunctionDir, externalFunctionDir, environment, invokeAllowedFunctionArns, enableTenancy); + } + catch + { + // Tear down anything that did get created (IAM role, ECR repo) so we + // don't leak resources when init fails part-way through. + await deployment.DisposeAsync(); + throw; + } + return deployment; + } + + /// + /// Two-step deployment for chained-invoke scenarios: deploys the downstream (callee) + /// function first, captures its ARN, then deploys the parent (caller) with + /// DOWNSTREAM_FUNCTION_ARN set in the parent's environment and the parent's + /// role granted lambda:InvokeFunction on the downstream's ARN. + /// + /// + /// The parent and downstream are independent + /// instances; both are returned so the caller can dispose them in the right order + /// (parent first, then downstream — the caller is the one in flight when the test ends). + /// The DOWNSTREAM_FUNCTION_ARN env var carries a qualified identifier + /// (arn:...:function:name:$LATEST) so the parent can pass it directly to + /// ctx.InvokeAsync(...) without further manipulation. 
+ /// + public static async Task<(DurableFunctionDeployment Parent, DurableFunctionDeployment Downstream)> + CreateWithDownstreamAsync( + string parentTestFunctionDir, + string downstreamTestFunctionDir, + string scenarioSuffix, + ITestOutputHelper output, + IDictionary? extraParentEnvironment = null, + bool enableDownstreamTenancy = false) + { + // Deploy downstream first so we can pass its ARN to the parent's environment. + var downstream = await CreateAsync( + downstreamTestFunctionDir, + scenarioSuffix + "-d", + output, + enableTenancy: enableDownstreamTenancy); + + DurableFunctionDeployment? parent = null; + try + { + // Use a qualified identifier — the durable execution service rejects + // unqualified ARNs. $LATEST is fine for integration tests; production + // should use a version or alias. + var qualifiedDownstreamArn = downstream.FunctionArn + ":$LATEST"; + var parentEnv = new Dictionary(StringComparer.Ordinal) + { + ["DOWNSTREAM_FUNCTION_ARN"] = qualifiedDownstreamArn, + }; + if (extraParentEnvironment != null) + { + foreach (var kv in extraParentEnvironment) + parentEnv[kv.Key] = kv.Value; + } + + parent = await CreateAsync( + parentTestFunctionDir, + scenarioSuffix + "-p", + output, + environment: parentEnv, + invokeAllowedFunctionArns: new[] { downstream.FunctionArn }); + } + catch + { + // Parent failed to deploy — tear down the downstream we already created + // so we don't leak resources. + await downstream.DisposeAsync(); + throw; + } + + return (parent!, downstream); + } + + private const string LambdaAssumeRolePolicy = """ + { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Principal": {"Service": "lambda.amazonaws.com"}, + "Action": "sts:AssumeRole" + }] + } + """; + + private async Task InitializeAsync( + string testFunctionDir, + string? externalFunctionDir, + IDictionary? environment, + IReadOnlyList? invokeAllowedFunctionArns, + bool enableTenancy) + { + // 1. Create the workflow's IAM role. 
+ _output.WriteLine($"Creating IAM role: {_roleName}"); + var createRoleResponse = await _iamClient.CreateRoleAsync(new CreateRoleRequest + { + RoleName = _roleName, + AssumeRolePolicyDocument = LambdaAssumeRolePolicy + }); + _roleArn = createRoleResponse.Role.Arn; + + await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest + { + RoleName = _roleName, + PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + }); + + await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest + { + RoleName = _roleName, + PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy" + }); + + // 2. (optional) Create the external function's IAM role up front so its + // sts:AssumeRole and lambda:SendDurableExecutionCallbackSuccess + // permissions propagate alongside the workflow role's permissions + // (single 10-second sleep covers both). + if (externalFunctionDir != null) + { + _output.WriteLine($"Creating external IAM role: {_externalRoleName}"); + var extRoleResponse = await _iamClient.CreateRoleAsync(new CreateRoleRequest + { + RoleName = _externalRoleName, + AssumeRolePolicyDocument = LambdaAssumeRolePolicy + }); + _externalRoleArn = extRoleResponse.Role.Arn; + + await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest + { + RoleName = _externalRoleName, + PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + }); + + // Inline policy lets the external function call the durable callback API. + // Resource "*" because we don't yet know the workflow's ARN at this point — + // the external function only resolves callbacks belonging to executions the + // workflow created, so the blast radius is bounded by the role's lifetime. 
+ await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest + { + RoleName = _externalRoleName, + PolicyName = "SendDurableExecutionCallback", + PolicyDocument = """ + { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": [ + "lambda:SendDurableExecutionCallbackSuccess", + "lambda:SendDurableExecutionCallbackFailure" + ], + "Resource": "*" + }] + } + """ + }); + + // Workflow function will Invoke the external function — grant via inline policy. + // Scoped to the external function name we just minted. + await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest + { + RoleName = _roleName, + PolicyName = "InvokeExternalFunction", + PolicyDocument = $$""" + { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": "lambda:InvokeFunction", + "Resource": "arn:aws:lambda:*:*:function:{{_externalFunctionName}}" + }] + } + """ + }); + _inlinePolicyNames.Add("InvokeExternalFunction"); + } + + // Grant cross-Lambda invoke when the parent of a chained-invoke scenario + // needs to call out to a downstream function. The durable execution service + // is the one that actually drives the chained invocation in production — + // attaching this directly to the parent's role keeps the parent role + // capable of being used in non-durable contexts (e.g. for diagnostic + // direct invokes from the test harness). + if (invokeAllowedFunctionArns != null && invokeAllowedFunctionArns.Count > 0) + { + // Allow both the unqualified ARN and any qualifier (alias/version/$LATEST). 
+ var resources = new List(invokeAllowedFunctionArns.Count * 2); + foreach (var arn in invokeAllowedFunctionArns) + { + resources.Add(arn); + resources.Add(arn + ":*"); + } + var resourceJson = "[" + string.Join(",", resources.Select(r => $"\"{r}\"")) + "]"; + var policyDoc = $$""" + { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": ["lambda:InvokeFunction"], + "Resource": {{resourceJson}} + }] + } + """; + const string PolicyName = "AllowChainedInvoke"; + await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest + { + RoleName = _roleName, + PolicyName = PolicyName, + PolicyDocument = policyDoc + }); + _inlinePolicyNames.Add(PolicyName); + } + + // Wait for IAM propagation. + await Task.Delay(TimeSpan.FromSeconds(10)); + + // 3. Create the workflow ECR repo + image. + _output.WriteLine($"Creating ECR repository: {_repoName}"); + var createRepoResponse = await _ecrClient.CreateRepositoryAsync(new CreateRepositoryRequest + { + RepositoryName = _repoName + }); + _ecrRepoCreated = true; + var repositoryUri = createRepoResponse.Repository.RepositoryUri; + + _output.WriteLine($"Building and pushing Docker image from {testFunctionDir}..."); + _imageUri = await BuildAndPushImage(testFunctionDir, repositoryUri); + _output.WriteLine($"Image pushed: {_imageUri}"); + + // 4. (optional) Create + push the external function image and create the Lambda. + // Done before the workflow Lambda so the workflow function's environment can + // reference the external function name (which is already known from the ctor). 
+ if (externalFunctionDir != null) + { + _output.WriteLine($"Creating external ECR repository: {_externalRepoName}"); + var extRepoResponse = await _ecrClient.CreateRepositoryAsync(new CreateRepositoryRequest + { + RepositoryName = _externalRepoName + }); + _externalEcrRepoCreated = true; + var extRepositoryUri = extRepoResponse.Repository.RepositoryUri; + + _output.WriteLine($"Building external Docker image from {externalFunctionDir}..."); + var extImageUri = await BuildAndPushImage(externalFunctionDir, extRepositoryUri); + _output.WriteLine($"External image pushed: {extImageUri}"); + + _output.WriteLine($"Creating external Lambda function: {_externalFunctionName}"); + await _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest + { + FunctionName = _externalFunctionName, + PackageType = PackageType.Image, + Role = _externalRoleArn, + Code = new FunctionCode { ImageUri = extImageUri }, + Timeout = 30, + MemorySize = 256 + // No DurableConfig — this is a plain function. + }); + _externalFunctionCreated = true; + + _output.WriteLine("Waiting for external function to become Active..."); + await WaitForFunctionActive(_externalFunctionName); + } + + // 5. Create the workflow Lambda. + _output.WriteLine($"Creating Lambda function: {_functionName}"); + var createFunctionRequest = new CreateFunctionRequest + { + FunctionName = _functionName, + PackageType = PackageType.Image, + Role = _roleArn, + Code = new FunctionCode { ImageUri = _imageUri }, + Timeout = 30, + MemorySize = 256, + DurableConfig = new DurableConfig { ExecutionTimeout = 60 } + }; + + // Tenant isolation must be set at function-creation time (Lambda rejects + // post-create modification). Without it, the durable execution service + // refuses chained invokes that carry a TenantId — so the tenant-routing + // integration test needs the *callee* deployed with PER_TENANT. 
+ if (enableTenancy) + { + createFunctionRequest.TenancyConfig = new TenancyConfig + { + TenantIsolationMode = TenantIsolationMode.PER_TENANT + }; + } + + // Build the function's environment: start with the caller-supplied vars, then + // tack on EXTERNAL_FUNCTION_NAME if a paired external function exists. + var envVars = new Dictionary(StringComparer.Ordinal); + if (environment != null) + { + foreach (var kv in environment) + envVars[kv.Key] = kv.Value; + } + if (externalFunctionDir != null) + { + envVars["EXTERNAL_FUNCTION_NAME"] = _externalFunctionName; + } + if (envVars.Count > 0) + { + createFunctionRequest.Environment = new Amazon.Lambda.Model.Environment + { + Variables = envVars + }; + } + + var createFunctionResponse = await _lambdaClient.CreateFunctionAsync(createFunctionRequest); + _functionCreated = true; + _functionArn = createFunctionResponse.FunctionArn; + + _output.WriteLine($"Waiting for function to become Active... (ARN: {_functionArn})"); + await WaitForFunctionActive(_functionName); + } + + public async Task<(InvokeResponse Response, string ExecutionName)> InvokeAsync(string payload, string? executionName = null) + { + var name = executionName ?? $"integ-test-{Guid.NewGuid():N}"; + var response = await _lambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = _functionName, + Qualifier = "$LATEST", + Payload = payload, + DurableExecutionName = name + }); + return (response, name); + } + + /// + /// Polls ListDurableExecutionsByFunction until an execution with the given name appears. + /// Useful when the synchronous Invoke response gives no ARN (e.g., failed workflows return null). 
+ /// + public async Task FindDurableExecutionArnByNameAsync(string executionName, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + var attempt = 0; + _output.WriteLine($"[FindArn] Starting search for execution name '{executionName}' on function '{_functionName}' (timeout: {timeout.TotalSeconds}s)"); + + while (DateTime.UtcNow < deadline) + { + attempt++; + try + { + var resp = await _lambdaClient.ListDurableExecutionsByFunctionAsync( + new ListDurableExecutionsByFunctionRequest + { + FunctionName = _functionName, + DurableExecutionName = executionName // server-side exact match + }); + + var count = resp.DurableExecutions?.Count ?? 0; + _output.WriteLine($"[FindArn] attempt {attempt}: List returned {count} executions"); + + if (count > 0) + { + foreach (var e in resp.DurableExecutions!) + { + _output.WriteLine($"[FindArn] - name='{e.DurableExecutionName}' status={e.Status} arn={e.DurableExecutionArn}"); + } + var match = resp.DurableExecutions.FirstOrDefault(e => e.DurableExecutionName == executionName); + if (match != null) + { + _output.WriteLine($"[FindArn] matched on attempt {attempt}"); + return match.DurableExecutionArn; + } + } + } + catch (Exception ex) + { + _output.WriteLine($"[FindArn] attempt {attempt} error (will retry): {ex.Message}"); + } + await Task.Delay(TimeSpan.FromSeconds(2)); + } + _output.WriteLine($"[FindArn] gave up after {attempt} attempts ({timeout.TotalSeconds}s)"); + return null; + } + + public async Task PollForCompletionAsync(string durableExecutionArn, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + + while (DateTime.UtcNow < deadline) + { + try + { + var resp = await _lambdaClient.GetDurableExecutionAsync( + new GetDurableExecutionRequest { DurableExecutionArn = durableExecutionArn }); + + var status = resp.Status?.ToString(); + if (status == "SUCCEEDED" || status == "FAILED" || + status == "TIMED_OUT" || status == "STOPPED") + { + return status; + } + } + catch (Exception ex) + { + 
_output.WriteLine($"Poll error (will retry): {ex.Message}"); + } + + await Task.Delay(TimeSpan.FromSeconds(2)); + } + + return "TIMEOUT"; + } + + public async Task GetExecutionAsync(string durableExecutionArn) + => await _lambdaClient.GetDurableExecutionAsync( + new GetDurableExecutionRequest { DurableExecutionArn = durableExecutionArn }); + + public async Task GetHistoryAsync(string durableExecutionArn, bool includeExecutionData = true) + => await _lambdaClient.GetDurableExecutionHistoryAsync( + new GetDurableExecutionHistoryRequest + { + DurableExecutionArn = durableExecutionArn, + IncludeExecutionData = includeExecutionData + }); + + /// + /// Repeatedly fetches history until is satisfied or the + /// timeout elapses. Needed because the history endpoint is eventually consistent — + /// the execution status can flip to SUCCEEDED before all events are indexed. + /// + public async Task WaitForHistoryAsync( + string durableExecutionArn, + Func predicate, + TimeSpan timeout, + bool includeExecutionData = true) + { + var deadline = DateTime.UtcNow + timeout; + GetDurableExecutionHistoryResponse? last = null; + var attempt = 0; + + while (DateTime.UtcNow < deadline) + { + attempt++; + try + { + last = await GetHistoryAsync(durableExecutionArn, includeExecutionData); + var eventCount = last.Events?.Count ?? 0; + var typeCounts = last.Events? + .GroupBy(e => e.EventType?.Value ?? "") + .Select(g => $"{g.Key}:{g.Count()}") + .OrderBy(s => s); + _output.WriteLine($"[WaitForHistory] attempt {attempt}: {eventCount} events [{string.Join(",", typeCounts ?? Enumerable.Empty())}]"); + if (predicate(last)) + { + DumpEvents(last); + return last; + } + } + catch (Exception ex) + { + _output.WriteLine($"[WaitForHistory] attempt {attempt} error (will retry): {ex.Message}"); + } + await Task.Delay(TimeSpan.FromSeconds(2)); + } + + _output.WriteLine($"[WaitForHistory] gave up after {attempt} attempts; returning last response with {last?.Events?.Count ?? 
0} events"); + if (last != null) DumpEvents(last); + return last ?? throw new TimeoutException($"GetDurableExecutionHistory never succeeded within {timeout.TotalSeconds}s"); + } + + private void DumpEvents(GetDurableExecutionHistoryResponse history) + { + var events = history.Events ?? new List(); + _output.WriteLine($"[WaitForHistory] event dump ({events.Count} total):"); + for (int i = 0; i < events.Count; i++) + { + var e = events[i]; + _output.WriteLine($" [{i}] type={e.EventType?.Value ?? ""} name={e.Name ?? ""} ts={e.EventTimestamp:O}"); + } + } + + public string? ExtractDurableExecutionArn(string responsePayload) + { + try + { + var doc = JsonDocument.Parse(responsePayload); + if (doc.RootElement.TryGetProperty("durableExecutionArn", out var arnProp)) + return arnProp.GetString(); + } + catch { } + return null; + } + + private async Task WaitForFunctionActive(string functionName) + { + for (int i = 0; i < 60; i++) + { + try + { + var config = await _lambdaClient.GetFunctionConfigurationAsync( + new GetFunctionConfigurationRequest { FunctionName = functionName }); + if (config.State == State.Active) return; + if (config.State == State.Failed) + throw new Exception($"Function '{functionName}' creation failed: {config.StateReasonCode} - {config.StateReason}"); + } + catch (ResourceNotFoundException) { } + await Task.Delay(TimeSpan.FromSeconds(2)); + } + throw new TimeoutException($"Function '{functionName}' did not become Active within 120 seconds"); + } + + private async Task BuildAndPushImage(string testFunctionDir, string repositoryUri) + { + // `dotnet test` spins up one testhost per TargetFramework (net8.0 + net10.0) and + // runs them concurrently. Both testhosts invoke the same test classes, which means + // two processes can race on the same TestFunctions// source dir — wiping bin/ + // and obj/ under each other's feet. Symptom: MSB3030 "Could not copy bootstrap.dll" + // because one process deleted obj/ while the other was mid-publish. 
Serialize the + // per-source-dir build with a cross-process file lock so different test functions + // can still build in parallel. (A Mutex would have thread-affinity issues across + // awaits; an exclusive FileStream avoids that.) Lock file goes under temp — keeping + // it out of the source tree avoids polluting git status across worktrees. + var lockKey = Convert.ToHexString(System.Security.Cryptography.SHA256.HashData( + Encoding.UTF8.GetBytes(testFunctionDir.ToLowerInvariant())))[..16]; + var lockPath = Path.Combine(Path.GetTempPath(), $"durable-integ-build-{lockKey}.lock"); + using var lockHandle = await AcquireExclusiveFileLockAsync(lockPath, TimeSpan.FromMinutes(10)); + + var publishDir = Path.Combine(testFunctionDir, "bin", "publish"); + if (Directory.Exists(publishDir)) Directory.Delete(publishDir, true); + + // MSBuild's up-to-date check leaves stale .Up2Date markers under obj/ that + // make `dotnet publish` skip the copy-to-output step on a second run after + // we've wiped bin/publish/. Result: empty publish dir → empty Docker build + // context → "COPY bin/publish/ … not found". Nuking obj/ guarantees a real + // publish each time the helper is invoked. Cheap (each test function is small). 
+        var objDir = Path.Combine(testFunctionDir, "obj");
+        if (Directory.Exists(objDir)) Directory.Delete(objDir, true);
+        var binDir = Path.Combine(testFunctionDir, "bin");
+        if (Directory.Exists(binDir)) Directory.Delete(binDir, true);
+
+        await RunProcess("dotnet",
+            $"publish -c Release -r linux-x64 --self-contained true -o \"{publishDir}\"",
+            testFunctionDir);
+
+        var imageTag = $"{repositoryUri}:latest";
+        await RunProcess("docker",
+            $"build --platform linux/amd64 --provenance=false -t {imageTag} .",
+            testFunctionDir);
+
+        var authResponse = await _ecrClient.GetAuthorizationTokenAsync(new GetAuthorizationTokenRequest());
+        var authData = authResponse.AuthorizationData[0];
+        var token = Encoding.UTF8.GetString(Convert.FromBase64String(authData.AuthorizationToken));
+        var parts = token.Split(':');
+        var registryUrl = authData.ProxyEndpoint;
+
+        await RunProcess("docker",
+            $"login --username {parts[0]} --password-stdin {registryUrl}",
+            testFunctionDir,
+            stdin: parts[1]);
+
+        await RunProcess("docker", $"push {imageTag}", testFunctionDir);
+
+        return imageTag;
+    }
+
+    /// <summary>
+    /// Opens <paramref name="lockPath"/> (creating it if needed) with <see cref="FileShare.None"/>,
+    /// retrying every 500 ms while the open fails with an <see cref="IOException"/>.
+    /// The returned stream holds the file open exclusively until the caller disposes it.
+    /// </summary>
+    /// <param name="lockPath">Path of the lock file.</param>
+    /// <param name="timeout">Maximum time to keep retrying.</param>
+    /// <returns>The open, exclusively-held stream.</returns>
+    /// <exception cref="TimeoutException">The file could not be opened before <paramref name="timeout"/> elapsed.</exception>
+    private static async Task<FileStream> AcquireExclusiveFileLockAsync(string lockPath, TimeSpan timeout)
+    {
+        var deadline = DateTime.UtcNow + timeout;
+        while (true)
+        {
+            try
+            {
+                return new FileStream(lockPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.None);
+            }
+            catch (IOException)
+            {
+                if (DateTime.UtcNow >= deadline)
+                    throw new TimeoutException($"Timed out waiting for build lock '{lockPath}' after {timeout.TotalSeconds:F0}s");
+                await Task.Delay(TimeSpan.FromMilliseconds(500));
+            }
+        }
+    }
+
+    /// <summary>
+    /// Runs an external command in <paramref name="workingDir"/>, optionally feeding
+    /// <paramref name="stdin"/> to it, and waits up to five minutes for it to exit.
+    /// The command line is always logged; stdout is logged truncated to 1000 characters
+    /// on success and both streams are logged in full on failure.
+    /// </summary>
+    /// <param name="fileName">Executable to launch.</param>
+    /// <param name="arguments">Command-line arguments, passed as a single string.</param>
+    /// <param name="workingDir">Working directory for the child process.</param>
+    /// <param name="stdin">Text written to the child's standard input, which is then closed; <c>null</c> leaves stdin unredirected.</param>
+    /// <exception cref="TimeoutException">The process was still running after five minutes; its process tree is killed first.</exception>
+    /// <exception cref="Exception">The process exited with a non-zero code; the message carries stderr, or stdout when stderr is blank.</exception>
+    private async Task RunProcess(string fileName, string arguments, string workingDir, string? stdin = null)
+    {
+        _output.WriteLine($"Running: {fileName} {arguments}");
+        var psi = new System.Diagnostics.ProcessStartInfo
+        {
+            FileName = fileName,
+            Arguments = arguments,
+            WorkingDirectory = workingDir,
+            RedirectStandardOutput = true,
+            RedirectStandardError = true,
+            RedirectStandardInput = stdin != null,
+            UseShellExecute = false
+        };
+
+        // Dispose the Process so its OS handle and redirected pipes are released after every call.
+        using var process = System.Diagnostics.Process.Start(psi)!;
+
+        if (stdin != null)
+        {
+            await process.StandardInput.WriteAsync(stdin);
+            // Closing stdin signals end-of-input to the child (e.g. `docker login --password-stdin`).
+            process.StandardInput.Close();
+        }
+
+        // Drain both pipes concurrently so the child cannot block on a full pipe buffer.
+        var stdoutTask = process.StandardOutput.ReadToEndAsync();
+        var stderrTask = process.StandardError.ReadToEndAsync();
+
+        // A cancellable wait instead of racing against Task.Delay: no 5-minute timer is left
+        // pending once the process has exited.
+        using var timeoutCts = new System.Threading.CancellationTokenSource(TimeSpan.FromMinutes(5));
+        try
+        {
+            await process.WaitForExitAsync(timeoutCts.Token);
+        }
+        catch (OperationCanceledException)
+        {
+            // If the process exited at the same moment the timeout fired, treat it as a normal exit.
+            if (!process.HasExited)
+            {
+                // Kill the whole tree: dotnet and docker spawn child processes that would
+                // otherwise outlive the test run and keep the redirected pipes open.
+                process.Kill(entireProcessTree: true);
+                throw new TimeoutException($"{fileName} timed out after 5 minutes");
+            }
+        }
+
+        var stdout = await stdoutTask;
+        var stderr = await stderrTask;
+
+        if (process.ExitCode != 0)
+        {
+            // Dump the FULL streams on failure — diagnosing build errors with
+            // truncated output is painful, and these only fire on test failure.
+            _output.WriteLine($"stdout: {stdout}");
+            _output.WriteLine($"stderr: {stderr}");
+            var detail = !string.IsNullOrWhiteSpace(stderr) ? stderr : stdout;
+            throw new Exception($"{fileName} failed (exit {process.ExitCode}): {detail}");
+        }
+
+        if (!string.IsNullOrWhiteSpace(stdout))
+            _output.WriteLine($"stdout: {stdout[..Math.Min(stdout.Length, 1000)]}");
+    }
+
+    /// <summary>
+    /// Best-effort teardown of the AWS resources this deployment created: the two Lambda
+    /// functions, the two ECR repositories, then the two IAM roles (managed policies detached
+    /// and inline policies deleted first). Each step catches and logs its own failure so one
+    /// failed delete does not prevent the remaining ones from being attempted.
+    /// </summary>
+    public async ValueTask DisposeAsync()
+    {
+        if (_functionCreated)
+        {
+            try
+            {
+                _output.WriteLine($"Deleting function: {_functionName}");
+                await _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _functionName });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (function): {ex.Message}"); }
+        }
+
+        if (_externalFunctionCreated)
+        {
+            try
+            {
+                _output.WriteLine($"Deleting external function: {_externalFunctionName}");
+                await _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _externalFunctionName });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (external function): {ex.Message}"); }
+        }
+
+        if (_ecrRepoCreated)
+        {
+            try
+            {
+                _output.WriteLine($"Deleting ECR repository: {_repoName}");
+                await _ecrClient.DeleteRepositoryAsync(new DeleteRepositoryRequest
+                {
+                    RepositoryName = _repoName,
+                    Force = true
+                });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (ECR): {ex.Message}"); }
+        }
+
+        if (_externalEcrRepoCreated)
+        {
+            try
+            {
+                _output.WriteLine($"Deleting external ECR repository: {_externalRepoName}");
+                await _ecrClient.DeleteRepositoryAsync(new DeleteRepositoryRequest
+                {
+                    RepositoryName = _externalRepoName,
+                    Force = true
+                });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (external ECR): {ex.Message}"); }
+        }
+
+        if (_roleArn != null)
+        {
+            // Detach each policy independently — if one detach fails (e.g., the
+            // policy was never attached because init bailed out early) we still
+            // want to attempt the others and the final DeleteRole.
+            await TryDetachManaged(_roleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole");
+            await TryDetachManaged(_roleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy");
+
+            // Inline policies must be deleted (not detached) before DeleteRole succeeds.
+            foreach (var inline in _inlinePolicyNames)
+            {
+                await TryDeleteInline(_roleName, inline);
+            }
+
+            try
+            {
+                await _iamClient.DeleteRoleAsync(new DeleteRoleRequest { RoleName = _roleName });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteRole): {ex.Message}"); }
+        }
+
+        if (_externalRoleArn != null)
+        {
+            await TryDetachManaged(_externalRoleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole");
+            await TryDeleteInline(_externalRoleName, "SendDurableExecutionCallback");
+            try
+            {
+                await _iamClient.DeleteRoleAsync(new DeleteRoleRequest { RoleName = _externalRoleName });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteRole external): {ex.Message}"); }
+        }
+
+        // Detaches one managed policy; any failure is logged and swallowed.
+        async Task TryDetachManaged(string roleName, string policyArn)
+        {
+            try
+            {
+                await _iamClient.DetachRolePolicyAsync(new DetachRolePolicyRequest
+                {
+                    RoleName = roleName,
+                    PolicyArn = policyArn
+                });
+            }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM Detach {policyArn}): {ex.Message}"); }
+        }
+
+        // Deletes one inline policy; a missing policy is ignored silently, other failures are logged.
+        async Task TryDeleteInline(string roleName, string policyName)
+        {
+            try
+            {
+                await _iamClient.DeleteRolePolicyAsync(new DeleteRolePolicyRequest
+                {
+                    RoleName = roleName,
+                    PolicyName = policyName
+                });
+            }
+            catch (NoSuchEntityException) { /* policy was never attached — fine */ }
+            catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteInline {policyName}): {ex.Message}"); }
+        }
+    }
+
+    /// <summary>
+    /// Locates the source directory of a test function by walking up from
+    /// <see cref="AppContext.BaseDirectory"/>. At each ancestor it accepts either
+    /// <c>TestFunctions/&lt;functionDirName&gt;</c> or a sibling directory named
+    /// <paramref name="functionDirName"/> that contains <c>&lt;functionDirName&gt;.csproj</c>.
+    /// </summary>
+    /// <param name="functionDirName">Directory (and project) name of the test function.</param>
+    /// <returns>Path of the first matching directory.</returns>
+    /// <exception cref="DirectoryNotFoundException">No matching directory was found.</exception>
+    public static string FindTestFunctionDir(string functionDirName)
+    {
+        var dir = AppContext.BaseDirectory;
+        while (dir != null)
+        {
+            var candidate = Path.Combine(dir, "TestFunctions", functionDirName);
+            if (Directory.Exists(candidate))
+                return candidate;
+
+            // Also check legacy "TestFunction" location for backwards compat
+            var legacy = Path.Combine(dir, functionDirName);
+            if (Directory.Exists(legacy) && File.Exists(Path.Combine(legacy, $"{functionDirName}.csproj")))
+                return legacy;
+
+            dir = Path.GetDirectoryName(dir);
+        }
+
+        // Fallback: three levels above the base directory. NOTE(review): the walk above
+        // already visits every ancestor, so this looks redundant — confirm before removing.
+        var fallback = Path.GetFullPath(
+            Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "TestFunctions", functionDirName));
+        if (Directory.Exists(fallback))
+            return fallback;
+
+        throw new DirectoryNotFoundException(
+            $"Could not find TestFunctions/{functionDirName}/ directory. Looked up from: {AppContext.BaseDirectory}");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeFailureTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeFailureTest.cs
new file mode 100644
index 000000000..1b967588d
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeFailureTest.cs
@@ -0,0 +1,80 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class InvokeFailureTest
+{
+    private readonly ITestOutputHelper _output;
+    public InvokeFailureTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task InvokeAsync_ChildThrows_ParentSurfacesInvokeFailedException()
+    {
+        var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync(
+            parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeFailureParentFunction"),
+            downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeFailureChildFunction"),
+            scenarioSuffix: "invokefail",
+            output: _output);
+
+        await using (downstream)
+        await using (parent)
+        {
+            var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "invoke-fail"}""");
+            var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+            _output.WriteLine($"Parent response: {responsePayload}");
+
+            var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+            Assert.NotNull(arn);
+
+            // The parent catches InvokeFailedException and returns normally —
+            // the parent execution itself SUCCEEDS even though the chained
+            // invocation FAILED. This is the value of the SDK's exception
+            // surface: failure is observable but not necessarily fatal.
+            var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+            Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+            var history = await parent.WaitForHistoryAsync(
+                arn!,
+                h => (h.Events?.Any(e => e.EventType == EventType.ChainedInvokeStarted) ?? false)
+                     && (h.Events?.Any(e => e.ChainedInvokeFailedDetails != null) ?? false),
+                TimeSpan.FromSeconds(60));
+            var events = history.Events ?? new List<Event>();
+
+            // Exactly one chained invoke was issued and it FAILED — the parent
+            // did not retry the invoke (no retry semantics for InvokeAsync yet).
+            Assert.Equal(1, events.Count(e => e.EventType == EventType.ChainedInvokeStarted));
+            var failed = events.FirstOrDefault(e => e.ChainedInvokeFailedDetails != null);
+            Assert.NotNull(failed);
+            Assert.Equal("call_failing_child", failed!.Name);
+
+            var error = failed.ChainedInvokeFailedDetails.Error?.Payload;
+            Assert.NotNull(error);
+            // The child's exception type and message propagate through the
+            // service into the parent's history. Some service implementations
+            // record only the simple type name and others the fully-qualified
+            // one — match either by checking for the substring.
+            Assert.Contains("InvalidOperationException", error!.ErrorType ?? string.Empty);
+            Assert.Contains("intentional child failure", error.ErrorMessage ?? string.Empty);
+
+            // The parent's terminal result encodes "parent-saw-<ErrorType>" — confirms
+            // the parent's catch block ran AND the exception's ErrorType field
+            // was populated by the SDK on resume from the FAILED chained invoke.
+            // Without the Result assertions, a regression that left ErrorType
+            // null would still produce a SUCCEEDED execution (parent-saw-unknown)
+            // and silently pass.
+            var execution = await parent.GetExecutionAsync(arn!);
+            Assert.Null(execution.Error);
+            Assert.NotNull(execution.Result);
+            Assert.Contains("parent-saw-", execution.Result);
+            Assert.DoesNotContain("parent-saw-unknown", execution.Result);
+            Assert.Contains("InvalidOperationException", execution.Result);
+        }
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeHappyPathTest.cs
new file mode 100644
index 000000000..4d884d24e
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeHappyPathTest.cs
@@ -0,0 +1,70 @@
+// Copyright Amazon.com, Inc.
or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class InvokeHappyPathTest
+{
+    private readonly ITestOutputHelper _output;
+    public InvokeHappyPathTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task InvokeAsync_HappyPath_ChildResultPropagatesToParent()
+    {
+        var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync(
+            parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeHappyPathParentFunction"),
+            downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeHappyPathChildFunction"),
+            scenarioSuffix: "invokehappy",
+            output: _output);
+
+        await using (downstream)
+        await using (parent)
+        {
+            var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "invoke-happy"}""");
+            Assert.Equal(200, invokeResponse.StatusCode);
+
+            var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+            _output.WriteLine($"Parent response: {responsePayload}");
+
+            // Locate the parent execution and wait for terminal status. Chained
+            // invoke suspends the parent — the synchronous Invoke response
+            // carries no data — so we drive completion via the listing API.
+            var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+            Assert.NotNull(arn);
+
+            var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+            Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+            // The chained invoke's result surfaces in the parent's history as a
+            // ChainedInvokeSucceeded event. The parent then returns that result
+            // verbatim from its workflow.
+            var history = await parent.WaitForHistoryAsync(
+                arn!,
+                h => (h.Events?.Any(e => e.EventType == EventType.ChainedInvokeStarted) ?? false)
+                     && (h.Events?.Any(e => e.ChainedInvokeSucceededDetails != null) ?? false),
+                TimeSpan.FromSeconds(60));
+            var events = history.Events ?? new List<Event>();
+
+            var started = events.FirstOrDefault(e => e.EventType == EventType.ChainedInvokeStarted);
+            Assert.NotNull(started);
+            Assert.Equal(downstream.FunctionArn + ":$LATEST", started!.ChainedInvokeStartedDetails.FunctionName);
+
+            var succeeded = events.FirstOrDefault(e => e.ChainedInvokeSucceededDetails != null);
+            Assert.NotNull(succeeded);
+            // The child returned the JSON-encoded string "got-42".
+            var childPayload = succeeded!.ChainedInvokeSucceededDetails.Result?.Payload?.Trim('"');
+            Assert.Equal("got-42", childPayload);
+
+            // Name on the event is the operation name ("call_child"), not the target
+            // function — the target was already cross-checked through FunctionName
+            // above. This ties the succeeded result to that named operation.
+            Assert.Equal("call_child", succeeded.Name);
+        }
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeReplayDeterminismTest.cs
new file mode 100644
index 000000000..9be5eeecb
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeReplayDeterminismTest.cs
@@ -0,0 +1,122 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class InvokeReplayDeterminismTest
+{
+    private readonly ITestOutputHelper _output;
+    public InvokeReplayDeterminismTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task InvokeAsync_ReplayDeterminism_OperationIdsStableAcrossInvocations()
+    {
+        var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync(
+            parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeReplayDeterminismParentFunction"),
+            downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeReplayDeterminismChildFunction"),
+            scenarioSuffix: "invokerply",
+            output: _output);
+
+        await using (downstream)
+        await using (parent)
+        {
+            var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "invoke-replay"}""");
+            var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+            _output.WriteLine($"Parent response: {responsePayload}");
+
+            var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+            Assert.NotNull(arn);
+
+            var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180));
+            Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+            // History is eventually consistent — wait until both step-succeeded
+            // events AND the chained-invoke-succeeded event are visible.
+            var history = await parent.WaitForHistoryAsync(
+                arn!,
+                h => (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2
+                     && (h.Events?.Any(e => e.ChainedInvokeSucceededDetails != null) ?? false),
+                TimeSpan.FromSeconds(60));
+            var events = history.Events ?? new List<Event>();
+
+            // Each step ran exactly once across the entire workflow — proves
+            // the chained invoke's suspend/resume cycle did NOT cause the
+            // pre-invoke step to re-execute. (Replay returned the cached
+            // checkpoint instead.)
+            var stepSucceededByName = events
+                .Where(e => e.StepSucceededDetails != null)
+                .GroupBy(e => e.Name)
+                .ToDictionary(g => g.Key!, g => g.Count());
+            Assert.Equal(1, stepSucceededByName["before_invoke"]);
+            Assert.Equal(1, stepSucceededByName["after_invoke"]);
+
+            // Exactly ONE chained invoke fired — replay didn't double-fire
+            // the InvokeAsync. Same invariant we check for steps.
+            Assert.Equal(1, events.Count(e => e.EventType == EventType.ChainedInvokeStarted));
+            Assert.Equal(1, events.Count(e => e.ChainedInvokeSucceededDetails != null));
+
+            var beforeInvokeEvent = events.First(e => e.StepSucceededDetails != null && e.Name == "before_invoke");
+            var generatedGuid = beforeInvokeEvent.StepSucceededDetails.Result?.Payload?.Trim('"');
+            Assert.NotNull(generatedGuid);
+            Assert.True(Guid.TryParse(generatedGuid, out _),
+                $"before_invoke should produce a valid GUID, got: {generatedGuid}");
+
+            // The downstream's echo carries through to after_invoke verbatim,
+            // proving the cached chained-invoke result was used on resume.
+            var chainedSucceeded = events.First(e => e.ChainedInvokeSucceededDetails != null);
+            var chainedPayload = chainedSucceeded.ChainedInvokeSucceededDetails.Result?.Payload?.Trim('"');
+            Assert.Equal($"echoed:{generatedGuid}", chainedPayload);
+
+            var afterInvokeEvent = events.First(e => e.StepSucceededDetails != null && e.Name == "after_invoke");
+            var afterPayload = afterInvokeEvent.StepSucceededDetails.Result?.Payload?.Trim('"');
+            Assert.Equal($"final:echoed:{generatedGuid}", afterPayload);
+
+            // The chained invoke's suspend/resume forced at least 2 invocations
+            // of the parent — proves replay actually happened (not just a
+            // single straight-through execution that skipped suspension).
+            var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList();
+            Assert.True(
+                invocations.Count >= 2,
+                $"Expected at least 2 InvocationCompleted events (proves replay happened), got {invocations.Count}");
+
+            // Operation IDs are stable across all replays of the same logical
+            // position. The Started event and the corresponding Succeeded event
+            // for each operation share the same ID — that's the clearest
+            // observable proof the SDK's deterministic ID generator is working.
+            // The SDK hashes "<counter>" at the root, so each ID is a
+            // 64-char lowercase hex SHA-256 digest.
+            var startedIds = events
+                .Where(e => e.EventType == EventType.StepStarted || e.EventType == EventType.ChainedInvokeStarted)
+                .Select(e => (e.Name, Id: e.Id))
+                .ToList();
+            var succeededIds = events
+                .Where(e => e.StepSucceededDetails != null || e.ChainedInvokeSucceededDetails != null)
+                .Select(e => (e.Name, Id: e.Id))
+                .ToList();
+
+            // All operation IDs are populated and look like SHA-256 hex digests.
+            foreach (var (name, id) in startedIds)
+            {
+                Assert.False(string.IsNullOrEmpty(id), $"Operation '{name}' has no Id on its Started event");
+                Assert.Equal(64, id!.Length);
+                Assert.Matches("^[0-9a-f]{64}$", id);
+            }
+
+            // Every started operation ID must appear in a succeeded event —
+            // proves the deterministic IDs from the Start path matched the IDs
+            // the service used to record the terminal event.
+            foreach (var (name, id) in startedIds)
+            {
+                Assert.True(
+                    succeededIds.Any(s => s.Name == name && s.Id == id),
+                    $"Operation '{name}' (id={id}) started but did not produce a matching SUCCEEDED event with the same ID");
+            }
+        }
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeWithTenantIdTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeWithTenantIdTest.cs
new file mode 100644
index 000000000..6fbfcb27f
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/InvokeWithTenantIdTest.cs
@@ -0,0 +1,66 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class InvokeWithTenantIdTest
+{
+    private readonly ITestOutputHelper _output;
+    public InvokeWithTenantIdTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task InvokeAsync_WithTenantId_PropagatesToChainedInvokeOptions()
+    {
+        var (parent, downstream) = await DurableFunctionDeployment.CreateWithDownstreamAsync(
+            parentTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeWithTenantIdFunction"),
+            downstreamTestFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("InvokeChildTenantFunction"),
+            scenarioSuffix: "invoketenant",
+            output: _output,
+            // The downstream must be PER_TENANT for the service to accept a
+            // chained invoke carrying a TenantId. The parent stays default.
+            enableDownstreamTenancy: true);
+
+        await using (downstream)
+        await using (parent)
+        {
+            var (invokeResponse, executionName) = await parent.InvokeAsync("""{"orderId": "tenant-test"}""");
+            Assert.Equal(200, invokeResponse.StatusCode);
+
+            var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+            _output.WriteLine($"Parent response: {responsePayload}");
+
+            var arn = await parent.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+            Assert.NotNull(arn);
+
+            var status = await parent.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
+            Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+            var history = await parent.WaitForHistoryAsync(
+                arn!,
+                h => h.Events?.Any(e => e.EventType == EventType.ChainedInvokeStarted) ?? false,
+                TimeSpan.FromSeconds(60));
+            var events = history.Events ?? new List<Event>();
+
+            var started = events.FirstOrDefault(e => e.EventType == EventType.ChainedInvokeStarted);
+            Assert.NotNull(started);
+
+            // The tenant ID flows through ChainedInvokeOptions -> service ->
+            // ChainedInvokeStartedDetails. This is the load-bearing assertion:
+            // it proves the SDK's InvokeConfig.TenantId reaches the wire.
+            Assert.Equal("test-tenant", started!.ChainedInvokeStartedDetails.TenantId);
+
+            // The chained call still produced a result — proves nothing in the
+            // tenant-routing path silently dropped the invocation.
+            var succeeded = events.FirstOrDefault(e => e.ChainedInvokeSucceededDetails != null);
+            Assert.NotNull(succeeded);
+            var childPayload = succeeded!.ChainedInvokeSucceededDetails.Result?.Payload?.Trim('"');
+            Assert.Equal("tenant-aware-7", childPayload);
+        }
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/LongRetryChainTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/LongRetryChainTest.cs
new file mode 100644
index 000000000..94dbfc0a5
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/LongRetryChainTest.cs
@@ -0,0 +1,80 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class LongRetryChainTest
+{
+    private readonly ITestOutputHelper _output;
+    public LongRetryChainTest(ITestOutputHelper output) => _output = output;
+
+    /// <summary>
+    /// Long retry chain across many invocations: step fails 5 times before
+    /// succeeding on attempt 6. Validates that StepDetails.Attempt increments
+    /// monotonically across invocations (no off-by-one, no skipped attempts)
+    /// and that IStepContext.AttemptNumber on the user side matches the wire
+    /// value on each attempt.
+    /// </summary>
+    [Fact]
+    public async Task FailsFiveTimesThenSucceeds_AttemptCounterIsMonotonic()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("LongRetryChainFunction"),
+            "longretry", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        // Total retry delay budget: 1+2+3+4+5 = 15s. Allow generous headroom.
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 6
+                 && (h.Events?.Any(e => e.StepSucceededDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        // Six attempts total: five failures + one success.
+        Assert.Equal(6, events.Count(e => e.EventType == EventType.StepStarted));
+        Assert.Equal(5, events.Count(e => e.StepFailedDetails != null && e.Name == "long_retry_step"));
+        var succeeded = events.SingleOrDefault(e => e.StepSucceededDetails != null && e.Name == "long_retry_step");
+        Assert.NotNull(succeeded);
+
+        // The user-facing AttemptNumber on the final (winning) attempt was 6 —
+        // proves IStepContext.AttemptNumber tracks the wire attempt counter
+        // across invocations, not just within a single invocation.
+        Assert.Equal("\"ok on attempt 6\"", succeeded!.StepSucceededDetails.Result?.Payload);
+
+        // Each failure carries a unique per-attempt message — confirms the user-side
+        // counter incremented exactly once per invocation, no duplicates or skips.
+        var failureMessages = events
+            .Where(e => e.StepFailedDetails != null && e.Name == "long_retry_step")
+            .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty)
+            .ToList();
+        Assert.Equal(5, failureMessages.Count);
+        for (int i = 1; i <= 5; i++)
+        {
+            Assert.Contains(failureMessages, m => m.Contains($"attempt {i}"));
+        }
+
+        // The chain was executed across multiple invocations (proves the
+        // service actually re-invoked us between retries instead of holding
+        // a single Lambda alive through all six attempts).
+        var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList();
+        Assert.True(
+            invocations.Count >= 5,
+            $"Expected at least 5 InvocationCompleted events (one per retry boundary), got {invocations.Count}");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/LongerWaitTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/LongerWaitTest.cs
new file mode 100644
index 000000000..cb66e3e04
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/LongerWaitTest.cs
@@ -0,0 +1,68 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class LongerWaitTest
+{
+    private readonly ITestOutputHelper _output;
+    public LongerWaitTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task LongerWait_ExpiresAndCompletes()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("LongerWaitFunction"),
+            "longwait", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "long-wait-test"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(90));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2
+                 && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2
+                 && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false),
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        Assert.Equal(2, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // Steps before and after the wait both ran, with the post-wait step seeing
+        // the pre-wait step's value via replay.
+        var stepResults = events
+            .Where(e => e.StepSucceededDetails != null)
+            .Select(e => (Name: e.Name, Payload: e.StepSucceededDetails.Result?.Payload?.Trim('"')))
+            .ToList();
+        Assert.Equal(2, stepResults.Count);
+        Assert.Equal("before_wait", stepResults[0].Name);
+        Assert.Equal("started-long-wait-test", stepResults[0].Payload);
+        Assert.Equal("after_wait", stepResults[1].Name);
+        Assert.Equal("after_wait-started-long-wait-test", stepResults[1].Payload);
+
+        // The wait was checkpointed for the configured 15-second duration.
+        var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "long_wait");
+        Assert.NotNull(waitStarted);
+        Assert.Equal(15, waitStarted!.WaitStartedDetails.Duration);
+
+        // The wait spanned at least two invocations: one to schedule it and at
+        // least one to resume after the timer fires.
+        var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList();
+        Assert.True(
+            invocations.Count >= 2,
+            $"Expected at least 2 InvocationCompleted events (suspend + resume), got {invocations.Count}");
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MultipleStepsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MultipleStepsTest.cs
new file mode 100644
index 000000000..73fdbf0e3
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MultipleStepsTest.cs
@@ -0,0 +1,62 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class MultipleStepsTest
+{
+    private readonly ITestOutputHelper _output;
+    public MultipleStepsTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task MultipleSteps_AllCheckpointed()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("MultipleStepsFunction"),
+            "multi", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "chain"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        // History is eventually consistent — the execution can be SUCCEEDED before
+        // all events are indexed. Wait until we see all 5 step-succeeded events.
+        var history = await deployment.WaitForHistoryAsync(
+            arn!,
+            h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 5
+                 && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 5,
+            TimeSpan.FromSeconds(60));
+        var events = history.Events ?? new List<Event>();
+
+        Assert.Equal(5, events.Count(e => e.EventType == EventType.StepStarted));
+
+        // Each step ran exactly once (no replay-induced duplicates) in declaration order,
+        // and each step's output chained from the previous one.
+        var stepResults = events
+            .Where(e => e.StepSucceededDetails != null)
+            .Select(e => $"{e.Name}={e.StepSucceededDetails.Result?.Payload?.Trim('"')}")
+            .ToList();
+        Assert.Equal(
+            new[]
+            {
+                "step_1=a-chain",
+                "step_2=a-chain-b",
+                "step_3=a-chain-b-c",
+                "step_4=a-chain-b-c-d",
+                "step_5=a-chain-b-c-d-e",
+            },
+            stepResults);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayDeterminismTest.cs
new file mode 100644
index 000000000..053e2b299
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ReplayDeterminismTest.cs
@@ -0,0 +1,73 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Linq;
+using System.Text;
+using Amazon.Lambda.Model;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Amazon.Lambda.DurableExecution.IntegrationTests;
+
+public class ReplayDeterminismTest
+{
+    private readonly ITestOutputHelper _output;
+    public ReplayDeterminismTest(ITestOutputHelper output) => _output = output;
+
+    [Fact]
+    public async Task ReplayDeterminism_SameGuidAcrossInvocations()
+    {
+        await using var deployment = await DurableFunctionDeployment.CreateAsync(
+            DurableFunctionDeployment.FindTestFunctionDir("ReplayDeterminismFunction"),
+            "replay", _output);
+
+        var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "replay-test"}""");
+        var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
+        _output.WriteLine($"Response: {responsePayload}");
+
+        var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
+        Assert.NotNull(arn);
+
+        var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
+        Assert.Equal("SUCCEEDED", status, ignoreCase: true);
+
+        // History is eventually consistent — wait until both
step-succeeded events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2 + && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + Assert.Equal(2, events.Count(e => e.EventType == EventType.StepStarted)); + + // Each step succeeded exactly once — generate_id was NOT re-executed on replay + // (a duplicate would show up as two succeeded events for the same name). + var stepSucceededEvents = events.Where(e => e.StepSucceededDetails != null).ToList(); + Assert.Equal(2, stepSucceededEvents.Count); + Assert.Single(stepSucceededEvents.Where(e => e.Name == "generate_id")); + Assert.Single(stepSucceededEvents.Where(e => e.Name == "echo_id")); + + var generateEvent = stepSucceededEvents.First(e => e.Name == "generate_id"); + var echoEvent = stepSucceededEvents.First(e => e.Name == "echo_id"); + + var generatedGuid = generateEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); + var echoedResult = echoEvent.StepSucceededDetails.Result?.Payload?.Trim('"'); + Assert.NotNull(generatedGuid); + Assert.NotNull(echoedResult); + Assert.True(Guid.TryParse(generatedGuid, out _), + $"generate_id should produce a valid GUID, got: {generatedGuid}"); + + // The echoed value matches the cached GUID — proves replay returned the + // checkpointed value rather than running generate_id again. + Assert.Equal($"echo:{generatedGuid}", echoedResult); + + // The boundary wait actually caused a suspend/resume cycle. 
+ var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "boundary_wait"); + Assert.NotNull(waitStarted); + var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected at least 2 InvocationCompleted events (proves replay actually happened), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryExhaustionTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryExhaustionTest.cs new file mode 100644 index 000000000..982f72e98 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryExhaustionTest.cs @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class RetryExhaustionTest +{ + private readonly ITestOutputHelper _output; + public RetryExhaustionTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end retry exhaustion: step always throws, maxAttempts=3. + /// Validates that the SDK records exactly three StepStarted/StepFailed pairs, + /// the final attempt produces a FAIL checkpoint (not RETRY), and the workflow + /// terminates FAILED with the original exception surfaced through the + /// execution-level error. 
+ /// + [Fact] + public async Task AlwaysFailsStep_ExhaustsRetries_TerminatesFailed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("RetryExhaustionFunction"), + "rexhaust", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload synchronously; locate the execution by name. + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 2s + 4s of retry delays + 3x execution overhead. Generous headroom for scheduling. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + // Execution-level error is the original exception from the final attempt. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Contains("attempt 3", execution.Error.ErrorMessage); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 3 + && (h.Events?.Count(e => e.StepFailedDetails != null) ?? 0) >= 3, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Three attempts ran in total — no extra (off-by-one) and no truncation. + Assert.Equal(3, events.Count(e => e.EventType == EventType.StepStarted)); + + // Three failures recorded; no successes. + Assert.Equal(3, events.Count(e => e.StepFailedDetails != null && e.Name == "always_fails_step")); + Assert.Empty(events.Where(e => e.StepSucceededDetails != null)); + + // Each recorded failure carries the right per-attempt message. 
+ var failures = events + .Where(e => e.StepFailedDetails != null && e.Name == "always_fails_step") + .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty) + .ToList(); + Assert.Contains(failures, m => m.Contains("attempt 1")); + Assert.Contains(failures, m => m.Contains("attempt 2")); + Assert.Contains(failures, m => m.Contains("attempt 3")); + + // Service honored the retry delays. No-jitter exponential backoff at 2s/4s + // means the gap between the first and last StepStarted is >= 6s. + var startedTimestamps = events + .Where(e => e.EventType == EventType.StepStarted && e.EventTimestamp.HasValue) + .OrderBy(e => e.EventTimestamp!.Value) + .Select(e => e.EventTimestamp!.Value) + .ToList(); + var totalGap = startedTimestamps[^1] - startedTimestamps[0]; + _output.WriteLine($"Time between first and last attempt: {totalGap.TotalSeconds:F1}s"); + Assert.True(totalGap >= TimeSpan.FromSeconds(6), + $"Service did not honor retry delays: {totalGap.TotalSeconds:F1}s gap (expected >= 6s)"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryTest.cs new file mode 100644 index 000000000..1dcf48249 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/RetryTest.cs @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class RetryTest +{ + private readonly ITestOutputHelper _output; + public RetryTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end retry: step throws on attempts 1 and 2, succeeds on attempt 3. 
+ /// Validates that the service honors the RETRY checkpoint, schedules the + /// requested delay, and re-invokes the Lambda — none of which the unit + /// tests can prove (they fake state transitions in-memory). + /// + [Fact] + public async Task FlakyStep_RetriesAndSucceedsOnThirdAttempt() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("RetryFunction"), + "retry", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Initial invoke returns when the SDK suspends after the first failure. + // The execution continues asynchronously via service-driven re-invokes. + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Total expected wall time: 2s + 4s of retry delay + execution overhead. + // Allow generous headroom for service scheduling latency. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 3 + && (h.Events?.Any(e => e.StepSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Three attempts ran (attempts 1, 2, 3). + Assert.Equal(3, events.Count(e => e.EventType == EventType.StepStarted)); + + // Two failed attempts recorded retry metadata; the final attempt succeeded. 
+ Assert.Equal(2, events.Count(e => e.StepFailedDetails != null && e.Name == "flaky_step")); + var succeeded = events.SingleOrDefault(e => e.StepSucceededDetails != null && e.Name == "flaky_step"); + Assert.NotNull(succeeded); + Assert.Equal("\"ok on attempt 3\"", succeeded!.StepSucceededDetails.Result?.Payload); + + // The two recorded failure messages reflect the per-attempt exception. + var failures = events + .Where(e => e.StepFailedDetails != null && e.Name == "flaky_step") + .Select(e => e.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty) + .ToList(); + Assert.Contains(failures, m => m.Contains("attempt 1")); + Assert.Contains(failures, m => m.Contains("attempt 2")); + + // Timing check: the service must have actually waited between attempts. + // With initialDelay=2s, backoffRate=2.0, no jitter: delays are 2s and 4s. + // The gap between the first and last StepStarted should be >= 6s. + var startedTimestamps = events + .Where(e => e.EventType == EventType.StepStarted && e.EventTimestamp.HasValue) + .OrderBy(e => e.EventTimestamp!.Value) + .Select(e => e.EventTimestamp!.Value) + .ToList(); + var totalGap = startedTimestamps[^1] - startedTimestamps[0]; + _output.WriteLine($"Time between first and last attempt: {totalGap.TotalSeconds:F1}s"); + Assert.True(totalGap >= TimeSpan.FromSeconds(6), + $"Service did not honor retry delays: {totalGap.TotalSeconds:F1}s gap (expected >= 6s)"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepFailsTest.cs new file mode 100644 index 000000000..7e3a546fb --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepFailsTest.cs @@ -0,0 +1,57 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class StepFailsTest +{ + private readonly ITestOutputHelper _output; + public StepFailsTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task StepFails_PropagatesAsFailedStatus() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("StepFailsFunction"), + "stepfail", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload to the Invoke caller. Locate the execution + // by name and verify the service marked it FAILED. + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Contains("intentional failure", execution.Error.ErrorMessage); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.StepStarted) ?? false) + && (h.Events?.Any(e => e.StepFailedDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + Assert.Equal(1, events.Count(e => e.EventType == EventType.StepStarted)); + + // The failing step recorded a StepFailed event with the exception message. 
+ var stepFailed = events.FirstOrDefault(e => e.StepFailedDetails != null && e.Name == "fail_step"); + Assert.NotNull(stepFailed); + Assert.Contains("intentional failure", stepFailed!.StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty); + + // No step ever succeeded — the workflow body was unreachable past the throw. + Assert.Empty(events.Where(e => e.StepSucceededDetails != null)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepWaitStepTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepWaitStepTest.cs new file mode 100644 index 000000000..55a34e895 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/StepWaitStepTest.cs @@ -0,0 +1,64 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class StepWaitStepTest +{ + private readonly ITestOutputHelper _output; + public StepWaitStepTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task StepWaitStep_CompletesViaService() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("StepWaitStepFunction"), + "stepwait", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "integ-test-123"}"""); + Assert.Equal(200, invokeResponse.StatusCode); + + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: 
true); + + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.StepStarted) ?? 0) >= 2 + && (h.Events?.Count(e => e.StepSucceededDetails != null) ?? 0) >= 2 + && (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + Assert.Equal(2, events.Count(e => e.EventType == EventType.StepStarted)); + + // Both steps ran in order and produced the expected chained outputs. + var stepResults = events + .Where(e => e.StepSucceededDetails != null) + .Select(e => (Name: e.Name, Payload: e.StepSucceededDetails.Result?.Payload?.Trim('"'))) + .ToList(); + Assert.Equal(2, stepResults.Count); + Assert.Equal("validate", stepResults[0].Name); + Assert.Equal("validated-integ-test-123", stepResults[0].Payload); + Assert.Equal("process", stepResults[1].Name); + Assert.Equal("processed-validated-integ-test-123", stepResults[1].Payload); + + // The wait was actually scheduled with the expected duration. 
+ var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "short_wait"); + Assert.NotNull(waitStarted); + Assert.Equal(3, waitStarted!.WaitStartedDetails.Duration); + var waitSucceeded = events.FirstOrDefault(e => e.WaitSucceededDetails != null && e.Name == "short_wait"); + Assert.NotNull(waitSucceeded); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/ApproverFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/ApproverFunction.csproj new file mode 100644 index 000000000..92fe96678 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/ApproverFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Function.cs new file mode 100644 index 000000000..4991290d4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ApproverFunction/Function.cs @@ -0,0 +1,54 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace ApproverFunction; + +/// +/// Plain Lambda that acts as the "external system" in the WaitForCallback +/// integration test. Receives a callback ID + payload bits, builds the result +/// JSON, and resolves the durable execution by calling +/// SendDurableExecutionCallbackSuccess. Modeled after the real-world pattern +/// where an out-of-band service signals workflow completion. +/// +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public async Task Handler(ApproverInput input, ILambdaContext context) + { + if (string.IsNullOrEmpty(input.CallbackId)) + throw new ArgumentException("CallbackId is required"); + + var resultJson = $$"""{"Status":"approved","ApprovedBy":"{{input.OrderId}}"}"""; + await LambdaClient.SendDurableExecutionCallbackSuccessAsync( + new SendDurableExecutionCallbackSuccessRequest + { + CallbackId = input.CallbackId, + Result = new MemoryStream(Encoding.UTF8.GetBytes(resultJson)) + }); + return null; + } +} + +public class ApproverInput +{ + public string? CallbackId { get; set; } + public string? 
OrderId { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/AtMostOnceCrashFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/AtMostOnceCrashFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/AtMostOnceCrashFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Function.cs new file mode 100644 index 000000000..443d05b8a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/AtMostOnceCrashFunction/Function.cs @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +/// +/// Exercises the AtMostOncePerRetry crash-recovery path end-to-end. +/// +/// On attempt 1 the step kills the Lambda process AFTER the START checkpoint +/// has been flushed but BEFORE any SUCCEED checkpoint can be written. The +/// service re-invokes us; replay sees STARTED with no terminal record, so the +/// SDK routes through the retry strategy with a synthesized +/// StepInterruptedException. Attempt 2 succeeds normally. +/// +/// The per-attempt counter is read from the step context +/// (ctx.AttemptNumber) — the durable service preserves it across re-invokes so +/// we can drive deterministic crash behavior on attempt 1 only. +/// +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + if (ctx.AttemptNumber == 1) + { + // Hard process exit AFTER the SDK has flushed the START + // checkpoint (sync flush is part of the AtMostOncePerRetry + // contract). The service will see a STARTED record with no + // terminal counterpart on the next invocation.
+ Environment.Exit(137); + } + return $"recovered on attempt {ctx.AttemptNumber}"; + }, + name: "crash_then_recover", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(5), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/CallbackFailedFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/CallbackFailedFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/CallbackFailedFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Function.cs 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Function.cs new file mode 100644 index 000000000..721302ed3 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackFailedFunction/Function.cs @@ -0,0 +1,59 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Hand the service-allocated callback ID to the paired RejecterFunction + // (Event invocation — fire-and-forget). The rejecter calls + // SendDurableExecutionCallbackFailure out-of-band, which surfaces in + // GetResultAsync as CallbackFailedException — uncaught here, so the + // workflow ends FAILED with that exception type recorded. + var externalFunctionName = System.Environment.GetEnvironmentVariable("EXTERNAL_FUNCTION_NAME") + ?? throw new InvalidOperationException("EXTERNAL_FUNCTION_NAME env var not set"); + + var cb = await context.CreateCallbackAsync(name: "approve"); + + // Wrap the hand-off in a step so replays don't re-invoke the rejecter. 
+ await context.StepAsync(async _ => + { + var payload = $$"""{"callbackId":"{{cb.CallbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = externalFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, name: "submit"); + + return await cb.GetResultAsync(); + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/CallbackTimeoutFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/CallbackTimeoutFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/CallbackTimeoutFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Function.cs new file mode 100644 index 000000000..58fe3c75e --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CallbackTimeoutFunction/Function.cs @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The test deliberately never delivers the callback. The service should + // fire the timeout, mark the callback TIMED_OUT, and the SDK should + // surface CallbackTimeoutException to the workflow. + var cb = await context.CreateCallbackAsync( + name: "approve", + config: new CallbackConfig { Timeout = TimeSpan.FromSeconds(10) }); + var result = await cb.GetResultAsync(); + return result; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? 
ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/ChildContextFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/ChildContextFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/ChildContextFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Function.cs new file mode 100644 index 000000000..ae3134f24 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFailsFunction/Function.cs @@ -0,0 +1,48 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Throw inside a child context to validate the CONTEXT FAIL path: the + // service must record a ContextFailed event with the error details and + // mark the workflow FAILED. + await context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return $"prepared-{input.OrderId}"; }, + name: "prepare"); + + throw new InvalidOperationException("intentional child context failure for integration test"); + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); + + return new TestResult { Status = "should_not_reach" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/ChildContextFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/ChildContextFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/ChildContextFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Function.cs new file mode 100644 index 000000000..507f1df0f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextFunction/Function.cs @@ -0,0 +1,54 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Run a child context that itself does step + wait + step. The child's + // return value is checkpointed at the parent level as a CONTEXT + // SUCCEED record, so on replay we'd see it returned from cache. + var phaseResult = await context.RunInChildContextAsync( + async (childCtx) => + { + var validated = await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return $"validated-{input.OrderId}"; }, + name: "validate"); + + await childCtx.WaitAsync(TimeSpan.FromSeconds(2), name: "short_wait"); + + var processed = await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return $"processed-{validated}"; }, + name: "process"); + + return processed; + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); + + return new TestResult { Status = "completed", Data = phaseResult }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/ChildContextRetryFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/ChildContextRetryFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/ChildContextRetryFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Function.cs new file mode 100644 index 000000000..521a7fa50 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ChildContextRetryFailsFunction/Function.cs @@ -0,0 +1,61 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // A retry-then-exhaust step inside a child context: every retry + // checkpoint should be parented under the child, and the child should + // close as ContextFailed when retries are exhausted — proving the + // child is a single retry/error boundary. + await context.RunInChildContextAsync( + async (childCtx) => + { + return await childCtx.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + throw new InvalidOperationException( + $"always-fails on attempt {ctx.AttemptNumber} for {input.OrderId}"); + }, + name: "always_fails", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + }, + name: "phase", + config: new ChildContextConfig { SubType = "OrderProcessing" }); + + return new TestResult { Status = "should_not_reach" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/CreateCallbackHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/CreateCallbackHappyPathFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/CreateCallbackHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Function.cs new file mode 100644 index 000000000..e9712e6ea --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/CreateCallbackHappyPathFunction/Function.cs @@ -0,0 +1,61 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Hand the service-allocated callback ID to the paired ApproverFunction + // (Event invocation — fire-and-forget). The approver runs in its own Lambda + // and resolves the callback out-of-band by calling + // SendDurableExecutionCallbackSuccess. This mirrors WaitForCallbackHappyPath's + // topology so the test process never has to play "external system" — the + // synchronous Invoke from the test would otherwise deadlock against the + // suspended workflow. + var externalFunctionName = System.Environment.GetEnvironmentVariable("EXTERNAL_FUNCTION_NAME") + ?? throw new InvalidOperationException("EXTERNAL_FUNCTION_NAME env var not set"); + + var cb = await context.CreateCallbackAsync(name: "approve"); + + // Wrap the hand-off in a step so replays don't re-invoke the approver. 
+ await context.StepAsync(async _ => + { + var payload = $$"""{"callbackId":"{{cb.CallbackId}}","orderId":"integ-test"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = externalFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, name: "submit"); + + return await cb.GetResultAsync(); + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Function.cs new file mode 100644 index 000000000..240565384 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/Function.cs @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(int input, IDurableContext context) + { + var formatted = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"tenant-aware-{input}"; }, + name: "tenant_step"); + return formatted; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/InvokeChildTenantFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/InvokeChildTenantFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeChildTenantFunction/InvokeChildTenantFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Dockerfile @@ 
-0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Function.cs new file mode 100644 index 000000000..7e96ff0c8 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/Function.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(int input, IDurableContext context) + { + // Throw inside a step so the workflow records a step-failed event AND + // surfaces a FAILED execution status. The parent's InvokeAsync sees a + // FAILED chained invocation and raises InvokeFailedException with the + // step's error type (System.InvalidOperationException) attached. 
+ await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional child failure"); + }, + name: "fail_step"); + + return "unreachable"; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/InvokeFailureChildFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/InvokeFailureChildFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureChildFunction/InvokeFailureChildFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Function.cs new file mode 100644 index 000000000..40bfa3079 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/Function.cs @@ -0,0 +1,56 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + try + { + await context.InvokeAsync( + downstreamArn, + payload: 1, + name: "call_failing_child"); + + // Should not reach — the child throws and the parent surfaces + // InvokeFailedException on the resume. + return new TestResult { Status = "unexpected_success", Data = null }; + } + catch (InvokeFailedException ex) + { + // The parent catches and converts the exception into a normal result — + // the workflow itself succeeds, even though the chained invoke failed. + return new TestResult + { + Status = "completed", + Data = $"parent-saw-{ex.ErrorType ?? "unknown"}" + }; + } + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/InvokeFailureParentFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/InvokeFailureParentFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeFailureParentFunction/InvokeFailureParentFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Function.cs new file mode 100644 index 000000000..898021cdd --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/Function.cs @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(int input, IDurableContext context) + { + var prefixed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"got-{input}"; }, + name: "format"); + return prefixed; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/InvokeHappyPathChildFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/InvokeHappyPathChildFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathChildFunction/InvokeHappyPathChildFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Function.cs new file mode 100644 index 000000000..4a2e93f8c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/Function.cs @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Parent receives the downstream function ARN via env var so the test + // harness can wire arbitrary downstream functions without rebuilding + // the parent image. + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? 
throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + var result = await context.InvokeAsync( + downstreamArn, + payload: 42, + name: "call_child"); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/InvokeHappyPathParentFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/InvokeHappyPathParentFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeHappyPathParentFunction/InvokeHappyPathParentFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Function.cs new file mode 100644 index 000000000..5115101e1 --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/Function.cs @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(string input, IDurableContext context) + { + var echoed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"echoed:{input}"; }, + name: "child_echo"); + return echoed; + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/InvokeReplayDeterminismChildFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/InvokeReplayDeterminismChildFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismChildFunction/InvokeReplayDeterminismChildFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Dockerfile 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Function.cs new file mode 100644 index 000000000..b00be9c95 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/Function.cs @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? 
throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + // Step 1 generates a fresh GUID. On replay this MUST return the + // checkpointed value — proves the SDK's deterministic operation IDs + // line up with the service's view of the state. + var generatedId = await context.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "before_invoke"); + + // The chained invoke forces a suspend/resume cycle. After the resume, + // step 1 must replay (returning the cached GUID) and the invoke must + // not be re-fired (cached result is returned immediately). + var invokeResult = await context.InvokeAsync( + downstreamArn, + payload: generatedId, + name: "echo_invoke"); + + var afterInvoke = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"final:{invokeResult}"; }, + name: "after_invoke"); + + return new TestResult { Status = "completed", Data = afterInvoke }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/InvokeReplayDeterminismParentFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/InvokeReplayDeterminismParentFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeReplayDeterminismParentFunction/InvokeReplayDeterminismParentFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Function.cs new file mode 100644 index 000000000..a11eba6d2 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/Function.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var downstreamArn = System.Environment.GetEnvironmentVariable("DOWNSTREAM_FUNCTION_ARN") + ?? throw new InvalidOperationException("DOWNSTREAM_FUNCTION_ARN env var is not set."); + + var result = await context.InvokeAsync( + downstreamArn, + payload: 7, + name: "call_with_tenant", + config: new InvokeConfig { TenantId = "test-tenant" }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/InvokeWithTenantIdFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/InvokeWithTenantIdFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/InvokeWithTenantIdFunction/InvokeWithTenantIdFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Function.cs new file mode 100644 index 000000000..7d3c0f0e1 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/Function.cs @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +/// +/// Five-failure retry chain: the step throws on attempts 1-5 and succeeds on +/// attempt 6. The result payload echoes ctx.AttemptNumber on each attempt so +/// the integration test can verify the SDK's user-facing attempt counter +/// matches the wire-format StepDetails.Attempt value across multiple +/// invocations. +/// +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + if (ctx.AttemptNumber < 6) + throw new InvalidOperationException($"flake on attempt {ctx.AttemptNumber}"); + return $"ok on attempt {ctx.AttemptNumber}"; + }, + name: "long_retry_step", + config: new StepConfig + { + // Short delays so the test wall time stays manageable: 1s, 2s, 3s, 4s, 5s. + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 6, + initialDelay: TimeSpan.FromSeconds(1), + maxDelay: TimeSpan.FromSeconds(5), + backoffRate: 1.5, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/LongRetryChainFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/LongRetryChainFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongRetryChainFunction/LongRetryChainFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Function.cs new file mode 100644 index 000000000..401066c0e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/Function.cs @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var step1 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"started-{input.OrderId}"; }, + name: "before_wait"); + + await context.WaitAsync(TimeSpan.FromSeconds(15), name: "long_wait"); + + var step2 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"after_wait-{step1}"; }, + name: "after_wait"); + + return new TestResult { Status = "completed", Data = step2 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/LongerWaitFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/LongerWaitFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/LongerWaitFunction/LongerWaitFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Function.cs new file mode 100644 index 000000000..cdf5992b6 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/Function.cs @@ -0,0 +1,53 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var step1 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"a-{input.OrderId}"; }, + name: "step_1"); + + var step2 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step1}-b"; }, + name: "step_2"); + + var step3 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step2}-c"; }, + name: "step_3"); + + var step4 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step3}-d"; }, + name: "step_4"); + + var step5 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"{step4}-e"; }, + name: "step_5"); + + return new TestResult { Status = "completed", Data = step5 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/MultipleStepsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/MultipleStepsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MultipleStepsFunction/MultipleStepsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Function.cs new file mode 100644 index 000000000..a450855a7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/Function.cs @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace RejecterFunction; + +/// +/// Plain Lambda that acts as the "external system" in the CallbackFailed +/// integration test. 
Receives a callback ID and resolves the durable execution +/// as failed by calling SendDurableExecutionCallbackFailure. Modeled after +/// ApproverFunction (its happy-path counterpart). +/// +public class Function +{ + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public async Task Handler(RejecterInput input, ILambdaContext context) + { + if (string.IsNullOrEmpty(input.CallbackId)) + throw new ArgumentException("CallbackId is required"); + + await LambdaClient.SendDurableExecutionCallbackFailureAsync( + new SendDurableExecutionCallbackFailureRequest + { + CallbackId = input.CallbackId, + Error = new ErrorObject + { + ErrorType = "ApprovalRejected", + ErrorMessage = "external system rejected the request", + } + }); + return null; + } +} + +public class RejecterInput +{ + public string? CallbackId { get; set; } + public string? 
OrderId { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/RejecterFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/RejecterFunction.csproj new file mode 100644 index 000000000..92fe96678 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RejecterFunction/RejecterFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..22f919900 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/Function.cs @@ -0,0 +1,46 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Step 1 generates a fresh GUID. On replay, this MUST return the cached value. + var generatedId = await context.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate_id"); + + // Force a suspend/resume cycle to trigger replay + await context.WaitAsync(TimeSpan.FromSeconds(3), name: "boundary_wait"); + + // Step 2 echoes the GUID. After replay, it should see the SAME GUID from step 1. + var echoed = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"echo:{generatedId}"; }, + name: "echo_id"); + + return new TestResult { Status = "completed", Data = echoed }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/ReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/ReplayDeterminismFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ReplayDeterminismFunction/ReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Function.cs new file mode 100644 index 000000000..3e78ffd9d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/Function.cs @@ -0,0 +1,50 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + throw new InvalidOperationException($"always-fails attempt {ctx.AttemptNumber}"); + }, + name: "always_fails_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/RetryExhaustionFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/RetryExhaustionFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryExhaustionFunction/RetryExhaustionFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Function.cs new file mode 100644 index 000000000..800dc075f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/Function.cs @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var result = await context.StepAsync( + async (ctx) => + { + await Task.CompletedTask; + if (ctx.AttemptNumber < 3) + throw new InvalidOperationException($"flake on attempt {ctx.AttemptNumber}"); + return $"ok on attempt {ctx.AttemptNumber}"; + }, + name: "flaky_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(2), + maxDelay: TimeSpan.FromSeconds(10), + backoffRate: 2.0, + jitter: JitterStrategy.None) + }); + + return new TestResult { Status = "completed", Data = result }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/RetryFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/RetryFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/RetryFunction/RetryFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Function.cs new file mode 100644 index 000000000..de0246a50 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/Function.cs @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + await context.StepAsync( + async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional failure for integration test"); + }, + name: "fail_step"); + + return new TestResult { Status = "should_not_reach" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/StepFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/StepFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepFailsFunction/StepFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Function.cs new file mode 100644 index 000000000..97f7edd51 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/Function.cs @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var step1 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"validated-{input.OrderId}"; }, + name: "validate"); + + await context.WaitAsync(TimeSpan.FromSeconds(3), name: "short_wait"); + + var step2 = await context.StepAsync( + async (_) => { await Task.CompletedTask; return $"processed-{step1}"; }, + name: "process"); + + return new TestResult { Status = "completed", Data = step2 }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/StepWaitStepFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/StepWaitStepFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/StepWaitStepFunction/StepWaitStepFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Function.cs new file mode 100644 index 000000000..129344d25 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/Function.cs @@ -0,0 +1,61 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Text; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.Model; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + // Reuse a single Lambda client across submitter invocations. + private static readonly IAmazonLambda LambdaClient = new AmazonLambdaClient(); + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The submitter is called once with a freshly-allocated callback ID. + // It hands that ID off to the paired ApproverFunction (Event invocation — + // fire-and-forget, modelling a real external system). The submitter + // returns immediately, the SDK suspends, and the approver eventually + // calls SendDurableExecutionCallbackSuccess to resolve the workflow + // out-of-band. + var externalFunctionName = System.Environment.GetEnvironmentVariable("EXTERNAL_FUNCTION_NAME") + ?? 
throw new InvalidOperationException("EXTERNAL_FUNCTION_NAME env var not set"); + + var result = await context.WaitForCallbackAsync( + submitter: async (callbackId, cbCtx) => + { + var payload = $$"""{"callbackId":"{{callbackId}}","orderId":"{{input.OrderId}}"}"""; + await LambdaClient.InvokeAsync(new InvokeRequest + { + FunctionName = externalFunctionName, + InvocationType = InvocationType.Event, + Payload = payload + }); + }, + name: "approve"); + + return result; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/WaitForCallbackHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/WaitForCallbackHappyPathFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackHappyPathFunction/WaitForCallbackHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git 
a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Function.cs new file mode 100644 index 000000000..19b60d567 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/Function.cs @@ -0,0 +1,46 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // The submitter throws on every attempt. With RetryStrategy.None the + // SDK should fail terminally on the first attempt and surface the + // failure as CallbackSubmitterException. The workflow does not catch + // it, so the durable execution surfaces FAILED with that exception. 
+ var result = await context.WaitForCallbackAsync( + submitter: async (callbackId, cbCtx) => + { + await Task.CompletedTask; + throw new InvalidOperationException("submitter intentional failure"); + }, + name: "approve", + config: new WaitForCallbackConfig { RetryStrategy = RetryStrategy.None }); + + return result; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class MyResult { public string? Status { get; set; } public string? ApprovedBy { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/WaitForCallbackSubmitterFailsFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/WaitForCallbackSubmitterFailsFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitForCallbackSubmitterFailsFunction/WaitForCallbackSubmitterFailsFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Function.cs new file mode 100644 index 000000000..8bfd7b7cd --- 
/dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/Function.cs @@ -0,0 +1,34 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + await context.WaitAsync(TimeSpan.FromSeconds(5), name: "only_wait"); + return new TestResult { Status = "completed", Data = "wait_only" }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/WaitOnlyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/WaitOnlyFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/WaitOnlyFunction/WaitOnlyFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackHappyPathTest.cs new file mode 100644 index 000000000..3d6ad3d86 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackHappyPathTest.cs @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForCallbackHappyPathTest +{ + private readonly ITestOutputHelper _output; + public WaitForCallbackHappyPathTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end happy path for WaitForCallbackAsync using a real + /// two-Lambda flow: the workflow's submitter Event-invokes a paired + /// ApproverFunction, which calls SendDurableExecutionCallbackSuccess + /// out-of-band. The workflow suspends after the submitter step completes, + /// the service re-invokes the workflow once the approver resolves the + /// callback, and WaitForCallbackAsync returns the deserialized result. 
+ /// + [Fact] + public async Task WaitForCallback_SubmitterDeliversResult_WorkflowCompletes() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForCallbackHappyPathFunction"), + "wfcb-happy", _output, + externalFunctionDir: DurableFunctionDeployment.FindTestFunctionDir("ApproverFunction")); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "approver-1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The execution returns the payload the submitter delivered. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Result); + Assert.Contains("approved", execution.Result); + Assert.Contains("approver-1", execution.Result); + + // History records the canonical WaitForCallback lifecycle: + // submitter step Started + Succeeded, callback Started + Succeeded, + // and a containing context (CONTEXT operation) wrapping the pair. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.CallbackStarted) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.CallbackSucceeded) ?? false) + && (h.Events?.Any(e => e.EventType == EventType.StepSucceeded) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + Assert.Single(events.Where(e => e.EventType == EventType.CallbackStarted)); + Assert.Single(events.Where(e => e.EventType == EventType.CallbackSucceeded)); + + // The submitter step must show up in history. NOTE(review): this only + // checks that at least one StepStarted/StepSucceeded event exists; it + // neither filters by step name nor pins the count, so a submitter re-run + // on replay (the "callback already resolved" branch) would still pass. + var submitterSteps = events + .Where(e => e.EventType == EventType.StepSucceeded + || e.EventType == EventType.StepStarted) + .ToList(); + Assert.NotEmpty(submitterSteps); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackSubmitterFailsTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackSubmitterFailsTest.cs new file mode 100644 index 000000000..e172a4ab0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitForCallbackSubmitterFailsTest.cs @@ -0,0 +1,69 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitForCallbackSubmitterFailsTest +{ + private readonly ITestOutputHelper _output; + public WaitForCallbackSubmitterFailsTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end submitter-failure path for WaitForCallbackAsync: + /// the submitter throws on attempt 1 with <see cref="RetryStrategy.None"/>; + /// the SDK fails the composite operation terminally and surfaces + /// <see cref="CallbackSubmitterException"/>. The workflow surfaces FAILED. 
+ /// + [Fact] + public async Task WaitForCallback_SubmitterThrows_SurfacesAsCallbackSubmitterException() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitForCallbackSubmitterFailsFunction"), + "wfcb-fail", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "x"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Initial response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("FAILED", status, ignoreCase: true); + + // The workflow surfaces CallbackSubmitterException — the SDK's wrapper + // type around the failed submitter step. Verify both the recorded + // ErrorType and that the original "submitter intentional failure" + // message survives in the error chain. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + Assert.Equal(typeof(CallbackSubmitterException).FullName, execution.Error.ErrorType); + // ErrorObject.FromException records the outer exception's Message; that + // message should reference the submitter failure context. Be lenient + // about exact wording since the SDK may prepend / wrap the inner. + Assert.False(string.IsNullOrEmpty(execution.Error.ErrorMessage)); + + // History records the submitter step failed exactly once — RetryStrategy.None + // means no retries — and no callback was ever started since the submitter + // never delivered the ID. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => h.Events?.Any(e => e.StepFailedDetails != null) ?? false, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + var stepFailures = events.Where(e => e.StepFailedDetails != null).ToList(); + Assert.Single(stepFailures); + var failureMessage = stepFailures[0].StepFailedDetails.Error?.Payload?.ErrorMessage ?? string.Empty; + Assert.Contains("submitter intentional failure", failureMessage); + + // No SUCCEEDED step events — the submitter never succeeded. + Assert.Empty(events.Where(e => e.StepSucceededDetails != null)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitOnlyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitOnlyTest.cs new file mode 100644 index 000000000..a8ab9b22b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/WaitOnlyTest.cs @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class WaitOnlyTest +{ + private readonly ITestOutputHelper _output; + public WaitOnlyTest(ITestOutputHelper output) => _output = output; + + [Fact] + public async Task WaitOnly_NoSteps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("WaitOnlyFunction"), + "waitonly", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "wait-only"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + var history = await deployment.WaitForHistoryAsync( + 
arn!, + h => (h.Events?.Any(e => e.WaitSucceededDetails != null) ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // The wait was checkpointed and ran for the configured duration. + var waitStarted = events.FirstOrDefault(e => e.WaitStartedDetails != null && e.Name == "only_wait"); + Assert.NotNull(waitStarted); + Assert.Equal(5, waitStarted!.WaitStartedDetails.Duration); + + var waitSucceeded = events.FirstOrDefault(e => e.WaitSucceededDetails != null && e.Name == "only_wait"); + Assert.NotNull(waitSucceeded); + + // No step events: this workflow body contains only a wait. + Assert.Empty(events.Where(e => e.StepStartedDetails != null)); + + // The wait genuinely caused a suspend/resume, not an in-process delay: + // expect at least 2 invocations recorded (initial + resume after timer fires). + var invocations = events.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected at least 2 InvocationCompleted events (initial + post-wait resume), got {invocations.Count}"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json new file mode 100644 index 000000000..b6de9b357 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://xunit.net/schema/current/xunit.runner.schema.json", + "parallelizeTestCollections": false, + "parallelizeAssembly": false, + "maxParallelThreads": 1 +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj new file mode 100644 index 000000000..6f9abfe62 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj @@ -0,0 +1,35 @@ + + + 
+ + + $(DefaultPackageTargets) + Amazon.Lambda.DurableExecution.Tests + Amazon.Lambda.DurableExecution.Tests + true + ..\..\..\buildtools\public.snk + true + enable + enable + $(NoWarn);CS1591 + true + + + + + + + + + + + + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs new file mode 100644 index 000000000..c70dc75fb --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs @@ -0,0 +1,486 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class CallbackOperationTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? 
initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + /// <summary> + /// Wires a recorder so that the next CALLBACK START flush stamps the given + /// callback ID into <paramref name="state"/> — modeling the durable-execution + /// service's NewExecutionState response that allocates the ID. + /// </summary> + private static void WireServiceCallbackIdAllocation( + RecordingBatcher recorder, ExecutionState state, string callbackId) + { + recorder.OnFlush = ops => + { + foreach (var op in ops) + { + if (op.Type == OperationTypes.Callback && op.Action == "START") + { + state.AddOperations(new[] + { + new Operation + { + Id = op.Id, + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = op.Name, + CallbackDetails = new CallbackDetails { CallbackId = callbackId } + } + }); + } + } + }; + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_FlushesStartAndReturnsCallbackId() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-abc-123"); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + Assert.Equal("cb-abc-123", callback.CallbackId); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + + // CreateCallbackAsync sync-flushes a single START checkpoint. 
+ var single = Assert.Single(recorder.Flushed); + Assert.Equal(OperationTypes.Callback, single.Type); + Assert.Equal("START", single.Action); + Assert.Equal(OperationSubTypes.Callback, single.SubType); + Assert.Equal("approval", single.Name); + Assert.Equal(IdAt(1), single.Id); + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_NoConfig_DoesNotEmitCallbackOptions() + { + var (context, recorder, _, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + await context.CreateCallbackAsync(name: "no_options"); + + await recorder.Batcher.DrainAsync(); + + var single = Assert.Single(recorder.Flushed); + Assert.Null(single.CallbackOptions); + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_WithConfig_EmitsCallbackOptions() + { + var (context, recorder, _, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + await context.CreateCallbackAsync( + name: "with_options", + config: new CallbackConfig + { + Timeout = TimeSpan.FromHours(1), + HeartbeatTimeout = TimeSpan.FromMinutes(5) + }); + + await recorder.Batcher.DrainAsync(); + + var single = Assert.Single(recorder.Flushed); + Assert.NotNull(single.CallbackOptions); + Assert.Equal(3600, single.CallbackOptions.TimeoutSeconds); + Assert.Equal(300, single.CallbackOptions.HeartbeatTimeoutSeconds); + } + + [Fact] + public async Task CreateCallbackAsync_FreshExecution_OnlyTimeout_EmitsOnlyTimeout() + { + var (context, recorder, _, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + await context.CreateCallbackAsync( + config: new CallbackConfig { Timeout = TimeSpan.FromSeconds(45) }); + + await recorder.Batcher.DrainAsync(); + + var single = Assert.Single(recorder.Flushed); + Assert.NotNull(single.CallbackOptions); + Assert.Equal(45, single.CallbackOptions.TimeoutSeconds); + // HeartbeatTimeout was not set → property remains at its default + // (the AWS SDK Marshaller will not serialize 
the field). + Assert.True( + single.CallbackOptions.HeartbeatTimeoutSeconds == null + || single.CallbackOptions.HeartbeatTimeoutSeconds == 0); + } + + [Fact] + public async Task CreateCallbackAsync_ServiceMissingCallbackId_ThrowsNonDeterministic() + { + // Service doesn't stamp a CallbackId — RecordingBatcher's OnFlush left unset. + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "broken")); + Assert.Contains("CallbackId", ex.Message); + } + + [Fact] + public async Task GetResultAsync_FreshExecution_SuspendsExecution() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + // GetResultAsync should signal termination and return a never-completing task. + var resultTask = callback.GetResultAsync(); + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(resultTask.IsCompleted); + } + + [Fact] + public async Task ReplayStarted_DoesNotReFlushStart_AndSuspendsOnGetResult() + { + // STARTED on replay = service has stamped CallbackId but no terminal yet. + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = "cb-replay-1" } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + Assert.Equal("cb-replay-1", callback.CallbackId); + Assert.False(tm.IsTerminated); + + var resultTask = callback.GetResultAsync(); + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(resultTask.IsCompleted); + + // No new checkpoints — replay path doesn't re-flush START. 
+ await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ReplaySucceeded_GetResultDeserializes_NoSuspension() + { + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Succeeded, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-done-1", + Result = "\"approved\"" + } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + var result = await callback.GetResultAsync(); + + Assert.Equal("cb-done-1", callback.CallbackId); + Assert.Equal("approved", result); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ReplaySucceeded_NullResultReturnsDefault() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Succeeded, + Name = "no_payload", + CallbackDetails = new CallbackDetails { CallbackId = "cb-1" } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "no_payload"); + var result = await callback.GetResultAsync(); + Assert.Null(result); + } + + [Fact] + public async Task ReplayFailed_GetResultThrowsCallbackFailedException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Failed, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-fail-1", + Error = new ErrorObject + { + ErrorType = "ExternalSystemError", + ErrorMessage = "rejected by reviewer", + ErrorData = "{\"reviewer\":\"jane\"}" + } + } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + var ex = 
await Assert.ThrowsAsync(() => callback.GetResultAsync()); + Assert.IsAssignableFrom(ex); + Assert.Equal("rejected by reviewer", ex.Message); + Assert.Equal("cb-fail-1", ex.CallbackId); + Assert.Equal("ExternalSystemError", ex.ErrorType); + Assert.Equal("{\"reviewer\":\"jane\"}", ex.ErrorData); + } + + [Fact] + public async Task ReplayTimedOut_GetResultThrowsCallbackTimeoutException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.TimedOut, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-to-1", + Error = new ErrorObject + { + ErrorMessage = "callback timed out after 24h" + } + } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + + var ex = await Assert.ThrowsAsync(() => callback.GetResultAsync()); + Assert.IsAssignableFrom(ex); + Assert.Equal("callback timed out after 24h", ex.Message); + Assert.Equal("cb-to-1", ex.CallbackId); + } + + [Fact] + public async Task ReplayTimedOut_NoErrorDetails_DefaultMessage() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.TimedOut, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = "cb-1" } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + var ex = await Assert.ThrowsAsync(() => callback.GetResultAsync()); + Assert.Equal("Callback timed out", ex.Message); + } + + [Fact] + public async Task ReplayUnknownStatus_ThrowsNonDeterministic() + { + // Replay must throw on unexpected statuses (CANCELLED, garbage, etc.) + // rather than silently degrading to a suspend. Mirrors WaitOperation + // and ChildContextOperation's `default:` arms. 
+ var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = "CANCELLED", + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = "cb-1" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "approval")); + Assert.Contains("unexpected status", ex.Message); + Assert.Contains("CANCELLED", ex.Message); + } + + [Fact] + public async Task ReplayMissingCallbackId_ThrowsNonDeterministic() + { + // Replay path expects the CallbackId to be present. If it's absent, surface + // a clear non-deterministic error rather than letting users see a NRE later. + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "broken", + CallbackDetails = new CallbackDetails { CallbackId = null } + } + } + }); + + await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "broken")); + } + + [Fact] + public async Task ReplayDeterministic_CallbackIdStableAcrossReplays() + { + // Round-trip: STARTED checkpoint with CallbackId X must yield the same X + // on replay so external systems' references remain valid. + const string id = "stable-cb-id-12345"; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = id } + } + } + }); + + var callback = await context.CreateCallbackAsync(name: "approval"); + Assert.Equal(id, callback.CallbackId); + } + + [Fact] + public async Task ReplayTypeMismatch_ThrowsNonDeterministic() + { + // What was a CALLBACK on a previous invocation is now arriving as something + // else — code drift detection. 
ExecutionState.ValidateReplayConsistency + // is the gate. + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "approval", + StepDetails = new StepDetails { Result = "\"ok\"" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "approval")); + } + + [Fact] + public async Task CreateCallbackAsync_CallbackIdAccessBeforeStart_Throws() + { + // Direct construction of the CallbackOperation without going through + // ExecuteAsync — guard against bugs that try to read CallbackId early. + var op = new CallbackOperation( + "op-id", "name", parentId: null, null, new DefaultLambdaJsonSerializer(), + new ExecutionState(), new TerminationManager(), "arn", batcher: null); + + Assert.Throws(() => _ = ((ICallback)op).CallbackId); + await Task.CompletedTask; + } + + [Fact] + public async Task CreateCallbackAsync_NoSerializer_Throws() + { + // No ILambdaSerializer registered on the LambdaContext — surface a clear + // error instead of letting users see a NRE later. + var state = new ExecutionState(); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = new TestLambdaContext(); // no Serializer set + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var ex = await Assert.ThrowsAsync(() => + context.CreateCallbackAsync(name: "no-serializer")); + Assert.Contains("ILambdaSerializer", ex.Message); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs new file mode 100644 index 000000000..effeb5804 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs @@ -0,0 +1,216 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class CheckpointBatcherTests +{ + private static SdkOperationUpdate Update(string id) => new() + { + Id = id, + Type = "STEP", + Action = "SUCCEED" + }; + + [Fact] + public async Task EnqueueAsync_AwaitsUntilBatchFlushes() + { + var flushedTokens = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + flushedTokens.Add(token); + return Task.FromResult("token-1"); + }); + + await batcher.EnqueueAsync(Update("0-step")); + + Assert.Equal(new string?[] { "token-0" }, flushedTokens); + Assert.Equal("token-1", batcher.CheckpointToken); + + await batcher.DrainAsync(); + } + + [Fact] + public async Task MultipleEnqueueAsync_BatchedWithinWindow() + { + var batches = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + batches.Add(ops.Count); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig { FlushInterval = TimeSpan.FromMilliseconds(50) }); + + // Fire several enqueues concurrently and await all — they should + // coalesce into a single batch since FlushInterval > 0. 
+ var tasks = Enumerable.Range(0, 5) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + Assert.Single(batches); + Assert.Equal(5, batches[0]); + } + + [Fact] + public async Task EnqueueAsync_OverflowOps_SplitsBatches() + { + var batches = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + batches.Add(ops.Count); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig + { + MaxBatchOperations = 3, + FlushInterval = TimeSpan.FromMilliseconds(100) + }); + + var tasks = Enumerable.Range(0, 7) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + // 7 items, max 3 per batch → 3, 3, 1 (or some permutation summing to 7 + // with no batch over 3). + Assert.Equal(7, batches.Sum()); + Assert.All(batches, count => Assert.True(count <= 3)); + Assert.True(batches.Count >= 3); + } + + [Fact] + public async Task FlushAsync_Throws_PropagatesToAllAwaiters() + { + var failure = new InvalidOperationException("service unavailable"); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromException(failure), + new CheckpointBatcherConfig { FlushInterval = TimeSpan.FromMilliseconds(50) }); + + var tasks = Enumerable.Range(0, 3) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + // Each awaiter should see the same exception. + foreach (var t in tasks) + { + var ex = await Assert.ThrowsAsync(() => t); + Assert.Equal("service unavailable", ex.Message); + } + } + + [Fact] + public async Task EnqueueAsync_AfterTerminalError_FailsFast() + { + var failure = new InvalidOperationException("kaboom"); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromException(failure)); + + // First enqueue trips the terminal error. 
+ await Assert.ThrowsAsync(() => batcher.EnqueueAsync(Update("0-step"))); + + // Subsequent enqueue should fail fast with the same exception. + var second = await Assert.ThrowsAsync(() => batcher.EnqueueAsync(Update("1-step"))); + Assert.Equal("kaboom", second.Message); + } + + [Fact] + public async Task DrainAsync_FlushesRemainingItems() + { + var totalFlushed = 0; + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + Interlocked.Add(ref totalFlushed, ops.Count); + return Task.FromResult(token); + }); + + // Fire enqueues without awaiting them individually. + var tasks = Enumerable.Range(0, 4) + .Select(i => batcher.EnqueueAsync(Update($"{i}-step"))) + .ToArray(); + + await batcher.DrainAsync(); + await Task.WhenAll(tasks); + + Assert.Equal(4, totalFlushed); + } + + [Fact] + public async Task DrainAsync_AfterTerminalError_Throws() + { + var failure = new InvalidOperationException("nope"); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromException(failure)); + + // Trip the terminal error. + await Assert.ThrowsAsync(() => batcher.EnqueueAsync(Update("0-step"))); + + // Drain should rethrow. 
+ await Assert.ThrowsAsync(() => batcher.DrainAsync()); + } + + [Fact] + public async Task EnqueueAsync_AfterDispose_Throws() + { + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => Task.FromResult(token)); + + await batcher.DisposeAsync(); + + await Assert.ThrowsAnyAsync(() => batcher.EnqueueAsync(Update("0-step"))); + } + + [Fact] + public async Task CheckpointToken_UpdatesAfterEachFlush() + { + var counter = 0; + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + var next = $"token-{Interlocked.Increment(ref counter)}"; + return Task.FromResult(next); + }); + + await batcher.EnqueueAsync(Update("0-step")); + Assert.Equal("token-1", batcher.CheckpointToken); + + await batcher.EnqueueAsync(Update("1-step")); + Assert.Equal("token-2", batcher.CheckpointToken); + + await batcher.DrainAsync(); + } + + [Fact] + public async Task ConcurrentEnqueueAsync_AllComplete() + { + var totalFlushed = 0; + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + Interlocked.Add(ref totalFlushed, ops.Count); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig { FlushInterval = TimeSpan.FromMilliseconds(20) }); + + var tasks = Enumerable.Range(0, 100) + .Select(i => Task.Run(() => batcher.EnqueueAsync(Update($"{i}-step")))) + .ToArray(); + + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + Assert.Equal(100, totalFlushed); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs new file mode 100644 index 000000000..3aa182248 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs @@ -0,0 +1,525 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ChildContextOperationTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// <summary>The hashed ID of the n-th child operation under <paramref name="parentOpId"/>.</summary> + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + [Fact] + public async Task RunInChildContextAsync_FreshExecution_RunsFuncAndCheckpoints() + { + var (context, recorder, tm, _) = CreateContext(); + + var executed = false; + var result = await context.RunInChildContextAsync( + async (childCtx) => + { + executed = true; + return await childCtx.StepAsync(async (_) => { await Task.CompletedTask; return "inner"; }, name: "inner_step"); + }, + name: "phase"); + + Assert.True(executed); + Assert.Equal("inner", result); + Assert.False(tm.IsTerminated); + + // CONTEXT START → STEP START (fire-and-forget, but flushed before drain) + // → STEP SUCCEED → CONTEXT SUCCEED + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] + { + "CONTEXT:START", + "STEP:START", + "STEP:SUCCEED", + "CONTEXT:SUCCEED" + }, actions); + + var contextSucceed = recorder.Flushed.Single(o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + Assert.Equal(IdAt(1), contextSucceed.Id); + Assert.Equal("phase", contextSucceed.Name); + Assert.Equal("\"inner\"", contextSucceed.Payload); + } + + [Fact] + public async Task RunInChildContextAsync_FreshExecution_ChildOperationIdsDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.StepAsync(async (_) => { await Task.CompletedTask; return "a"; }, name: "first"); + await childCtx.StepAsync(async (_) => { await Task.CompletedTask; return "b"; }, name: "second"); + return 0; + }, + name: "phase"); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstChildOpId = ChildIdAt(parentOpId, 
1); + var secondChildOpId = ChildIdAt(parentOpId, 2); + + var stepStarts = recorder.Flushed.Where(o => o.Type == "STEP" && o.Action == "START").ToArray(); + Assert.Equal(2, stepStarts.Length); + Assert.Equal(firstChildOpId, stepStarts[0].Id); + Assert.Equal(secondChildOpId, stepStarts[1].Id); + } + + [Fact] + public async Task RunInChildContextAsync_ReplaySucceeded_ReturnsCachedAndDoesNotRun() + { + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "phase", + ContextDetails = new ContextDetails { Result = "\"cached\"" } + } + } + }); + + var executed = false; + var result = await context.RunInChildContextAsync( + async (childCtx) => + { + executed = true; + await Task.CompletedTask; + return "fresh"; + }, + name: "phase"); + + Assert.False(executed); + Assert.Equal("cached", result); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayFailed_ThrowsChildContextException() + { + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "phase", + SubType = "WaitForCallback", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "child went wrong", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new[] { "at A.B()", "at C.D()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "should not run"; }, + name: "phase")); + + Assert.Equal("child went wrong", ex.Message); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + 
Assert.Equal("WaitForCallback", ex.SubType); + Assert.NotNull(ex.OriginalStackTrace); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayFailed_AppliesErrorMapping() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "phase", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "boom" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "phase", + config: new ChildContextConfig + { + // Mapper sees the ChildContextException and remaps to a + // domain-specific exception, preserving the original via + // InnerException. + ErrorMapping = e => new InvalidOperationException("mapped", e) + })); + + Assert.Equal("mapped", ex.Message); + Assert.IsType(ex.InnerException); + } + + [Fact] + public async Task RunInChildContextAsync_FuncThrows_CheckpointsFailAndThrows() + { + var (context, recorder, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("inner boom"); }, + name: "phase")); + + Assert.Equal("inner boom", ex.Message); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + // Fresh-path failures populate OriginalStackTrace alongside ErrorType so + // ErrorMapping callbacks see the same shape on both fresh and replay paths. 
+ Assert.NotNull(ex.OriginalStackTrace); + Assert.NotEmpty(ex.OriginalStackTrace!); + + await recorder.Batcher.DrainAsync(); + var contextActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT") + .Select(o => o.Action.ToString()) + .ToArray(); + Assert.Equal(new[] { "START", "FAIL" }, contextActions); + } + + [Fact] + public async Task RunInChildContextAsync_InnerNonDeterminism_BubblesUpWithoutCheckpointingFail() + { + // A child context whose inner step's checkpoint type doesn't match the + // user code (replay mismatch) must NOT be wrapped/checkpointed as + // CONTEXT FAIL — that would freeze the corruption into history. + var parentOpId = IdAt(1); + var innerOpId = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "phase" + }, + new() + { + Id = innerOpId, + Type = OperationTypes.Wait, // wrong type — code calls StepAsync + Status = OperationStatuses.Succeeded, + Name = "inner_step" + } + } + }); + + await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (childCtx) => + { + return await childCtx.StepAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "inner_step"); + }, + name: "phase")); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "FAIL"); + } + + [Fact] + public async Task RunInChildContextAsync_FuncThrows_AppliesErrorMapping() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; throw new TimeoutException("inner timeout"); }, + name: "phase", + config: new ChildContextConfig + { + ErrorMapping = e => new InvalidOperationException("mapped", e) + })); + + Assert.Equal("mapped", ex.Message); + Assert.IsType(ex.InnerException); + } + + 
[Fact] + public async Task RunInChildContextAsync_ChildSuspendsOnWait_TerminatesWithWaitScheduled() + { + var (context, recorder, tm, _) = CreateContext(); + + // Suspending child: the inner Wait flushes WAIT START sync, then + // returns a never-completing Task via TerminationManager.SuspendAndAwait. + // The outer ChildContextOperation awaits that and never reaches + // CONTEXT SUCCEED. DurableExecutionHandler.RunAsync's WhenAny race + // wins on the termination signal; the test below short-circuits via + // the same TerminationManager.IsTerminated check. + var task = context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.WaitAsync(TimeSpan.FromSeconds(5), name: "wait_inside"); + return "should not return"; + }, + name: "phase"); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // CONTEXT START + WAIT START have flushed; no SUCCEED/FAIL since the + // child is suspended. + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Contains("CONTEXT:START", actions); + Assert.Contains("WAIT:START", actions); + Assert.DoesNotContain("CONTEXT:SUCCEED", actions); + Assert.DoesNotContain("CONTEXT:FAIL", actions); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayStarted_ReExecutesFuncWithInnerCacheReplay() + { + var parentOpId = IdAt(1); + var innerStepOpId = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "phase" + }, + new() + { + Id = innerStepOpId, + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "inner_step", + StepDetails = new StepDetails { Result = "\"cached_inner\"" } + } + } + }); + + var innerExecuted = false; + var result = await context.RunInChildContextAsync( + async (childCtx) => + { + return await childCtx.StepAsync( + 
async (_) => { innerExecuted = true; await Task.CompletedTask; return "fresh_inner"; }, + name: "inner_step"); + }, + name: "phase"); + + // The user func re-runs (replay propagation), but its inner step + // replays the cached value without invoking the inner code. + Assert.False(innerExecuted); + Assert.Equal("cached_inner", result); + + await recorder.Batcher.DrainAsync(); + + // Critical: do NOT re-checkpoint CONTEXT START on replay. The original + // STARTED checkpoint is still authoritative. + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "START"); + + // The CONTEXT SUCCEED happens only this time, since the user func + // returned successfully. + Assert.Contains(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + } + + [Fact] + public async Task RunInChildContextAsync_VoidOverload_RunsAndCheckpoints() + { + var (context, recorder, _, _) = CreateContext(); + + var executed = false; + await context.RunInChildContextAsync( + async (childCtx) => + { + await childCtx.StepAsync( + async (_) => { executed = true; await Task.CompletedTask; }, + name: "inner_void"); + }, + name: "phase"); + + Assert.True(executed); + + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}").ToArray(); + Assert.Equal(new[] + { + "CONTEXT:START", + "STEP:START", + "STEP:SUCCEED", + "CONTEXT:SUCCEED" + }, actions); + + // Void overload returns a null object, which the registered + // ILambdaSerializer serializes as the literal "null" payload. 
+ var contextSucceed = recorder.Flushed.Single(o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + Assert.Equal("null", contextSucceed.Payload); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, // wrong type — should be CONTEXT + Status = OperationStatuses.Succeeded, + Name = "phase", + StepDetails = new StepDetails { Result = "\"x\"" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "phase")); + + Assert.Contains("expected type 'CONTEXT'", ex.Message); + Assert.Contains("found 'STEP'", ex.Message); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayNameMismatch_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "old_name", + ContextDetails = new ContextDetails { Result = "\"x\"" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "new_name")); + + Assert.Contains("expected name 'new_name'", ex.Message); + Assert.Contains("found 'old_name'", ex.Message); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayUnknownStatus_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = "BOGUS", + Name = "phase" + } + } + }); + + await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "x"; }, + name: "phase")); + } + + 
[Fact] + public async Task RunInChildContextAsync_SubTypeAndName_PropagateToCheckpoint() + { + var (context, recorder, _, _) = CreateContext(); + + await context.RunInChildContextAsync( + async (_) => { await Task.CompletedTask; return "ok"; }, + name: "phase", + config: new ChildContextConfig { SubType = "WaitForCallback" }); + + await recorder.Batcher.DrainAsync(); + + var contextOps = recorder.Flushed.Where(o => o.Type == "CONTEXT").ToArray(); + Assert.Equal(2, contextOps.Length); + foreach (var op in contextOps) + { + Assert.Equal("WaitForCallback", op.SubType); + Assert.Equal("phase", op.Name); + } + } + +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ConfigTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ConfigTests.cs new file mode 100644 index 000000000..95417b953 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ConfigTests.cs @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ConfigTests +{ + [Fact] + public void InvokeConfig_Defaults() + { + var config = new InvokeConfig(); + Assert.Null(config.TenantId); + } + + [Fact] + public void InvokeConfig_RoundTripsProperties() + { + var config = new InvokeConfig + { + TenantId = "tenant-42" + }; + + Assert.Equal("tenant-42", config.TenantId); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs new file mode 100644 index 000000000..7798830b4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs @@ -0,0 +1,933 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class DurableContextTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static DurableContext CreateContext( + InitialExecutionState? initialState = null, + TerminationManager? terminationManager = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = terminationManager ?? 
new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + + return new DurableContext(state, tm, idGen, "arn:aws:lambda:us-east-1:123:durable-execution:test", lambdaContext); + } + + #region StepAsync Tests + + [Fact] + public async Task StepAsync_NewExecution_RunsFunction() + { + var context = CreateContext(); + var executed = false; + + var result = await context.StepAsync(async (_) => + { + executed = true; + await Task.CompletedTask; + return 42; + }, name: "my_step"); + + Assert.True(executed); + Assert.Equal(42, result); + } + + [Fact] + public async Task StepAsync_Replay_ReturnsCachedResult() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"cached_value\"" } + } + } + }); + + var executed = false; + var result = await context.StepAsync(async (_) => + { + executed = true; + await Task.CompletedTask; + return "fresh_value"; + }, name: "cached_step"); + + Assert.False(executed); + Assert.Equal("cached_value", result); + } + + [Fact] + public async Task StepAsync_ReplayFailed_ThrowsStepException() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Failed, + StepDetails = new StepDetails + { + Error = new ErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "timed out", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new[] { "at A.B()", "at C.D()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.StepAsync(async (_) => { await Task.CompletedTask; return "x"; }, name: "bad_step")); + + Assert.Equal("System.TimeoutException", ex.ErrorType); + Assert.Equal("timed out", ex.Message); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + 
Assert.NotNull(ex.OriginalStackTrace); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + } + + [Fact] + public async Task StepAsync_Throws_FailsWithStepException() + { + var context = CreateContext(); + var attempts = 0; + + await Assert.ThrowsAsync(() => + context.StepAsync(async (_) => + { + attempts++; + await Task.CompletedTask; + throw new InvalidOperationException("boom"); + }, name: "fail_step")); + + // No retry support yet — the step runs once. + Assert.Equal(1, attempts); + } + + [Fact] + public async Task StepAsync_WithStepContext_ReceivesMetadata() + { + var context = CreateContext(); + string? receivedOpId = null; + int receivedAttempt = 0; + Microsoft.Extensions.Logging.ILogger? receivedLogger = null; + + await context.StepAsync(async (step) => + { + receivedOpId = step.OperationId; + receivedAttempt = step.AttemptNumber; + receivedLogger = step.Logger; + await Task.CompletedTask; + return "done"; + }, name: "meta_step"); + + Assert.Equal(IdAt(1), receivedOpId); + Assert.Equal(1, receivedAttempt); + Assert.NotNull(receivedLogger); + } + + [Fact] + public async Task StepAsync_VoidOverload_Works() + { + var context = CreateContext(); + var executed = false; + + await context.StepAsync(async (_) => + { + executed = true; + await Task.CompletedTask; + }, name: "void_step"); + + Assert.True(executed); + } + + [Fact] + public async Task StepAsync_MultipleSteps_DeterministicIds() + { + var context = CreateContext(); + + var r1 = await context.StepAsync(async (_) => { await Task.CompletedTask; return "a"; }, name: "first"); + var r2 = await context.StepAsync(async (_) => { await Task.CompletedTask; return "b"; }, name: "second"); + var r3 = await context.StepAsync(async (_) => { await Task.CompletedTask; return "c"; }); + + Assert.Equal("a", r1); + Assert.Equal("b", r2); + Assert.Equal("c", r3); + } + + [Fact] + public async Task StepAsync_ComplexType_SerializesCorrectly() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new 
List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "{\"Name\":\"Alice\",\"Age\":30}" } + } + } + }); + + var result = await context.StepAsync( + async (_) => { await Task.CompletedTask; return new TestPerson { Name = "Bob", Age = 25 }; }, + name: "fetch"); + + Assert.Equal("Alice", result.Name); + Assert.Equal(30, result.Age); + } + + [Fact] + public async Task StepAsync_NoSerializerOnContext_ThrowsInvalidOperation() + { + // The serializer comes from ILambdaContext.Serializer — without one, + // we can't checkpoint anything. The error message points users at the + // bootstrap registration point. + var state = new ExecutionState(); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = new TestLambdaContext(); // no Serializer set + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(() => + context.StepAsync(async (_) => { await Task.CompletedTask; return "x"; }, name: "no_serializer")); + + Assert.Contains("ILambdaSerializer", ex.Message); + } + + [Fact] + public void Logger_Defaults_ToNullLogger() + { + var context = CreateContext(); + Assert.NotNull(context.Logger); + } + + [Fact] + public void ExecutionContext_ExposesArn() + { + var context = CreateContext(); + Assert.Equal("arn:aws:lambda:us-east-1:123:durable-execution:test", context.ExecutionContext.DurableExecutionArn); + } + + [Fact] + public void LambdaContext_IsExposed() + { + var context = CreateContext(); + Assert.NotNull(context.LambdaContext); + } + + [Fact] + public async Task StepAsync_Replay_NullResult_ReturnsDefault() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = null } + } + } + }); + + var result = 
await context.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, + name: "no_result"); + + Assert.Null(result); + } + + [Fact] + public async Task StepAsync_CancelledToken_ThrowsOperationCanceled() + { + var context = CreateContext(); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await Assert.ThrowsAnyAsync(() => + context.StepAsync( + async (_) => + { + cts.Token.ThrowIfCancellationRequested(); + await Task.CompletedTask; + return "unreachable"; + }, + name: "cancelled_step", + cancellationToken: cts.Token)); + } + + #endregion + + #region WaitAsync Tests + + [Fact] + public async Task WaitAsync_SubSecond_ThrowsArgumentOutOfRange() + { + var context = CreateContext(); + + await Assert.ThrowsAsync(() => + context.WaitAsync(TimeSpan.FromMilliseconds(500))); + } + + [Fact] + public async Task WaitAsync_AboveOneYear_ThrowsArgumentOutOfRange() + { + var context = CreateContext(); + + await Assert.ThrowsAsync(() => + context.WaitAsync(TimeSpan.FromSeconds(31_622_401))); + } + + [Fact] + public async Task WaitAsync_NewExecution_SignalsTermination() + { + var tm = new TerminationManager(); + var context = CreateContext(terminationManager: tm); + + // WaitAsync should signal termination and return a never-completing task + var waitTask = context.WaitAsync(TimeSpan.FromSeconds(30), name: "my_wait"); + + // Give it a moment to execute + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(waitTask.IsCompleted); + } + + [Fact] + public async Task WaitAsync_Elapsed_ContinuesImmediately() + { + var pastExpirationMs = DateTimeOffset.UtcNow.AddSeconds(-10).ToUnixTimeMilliseconds(); + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = pastExpirationMs } + } + } + }); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: 
"cooldown"); + // If we got here, the wait was correctly skipped + } + + [Fact] + public async Task WaitAsync_StartedButNotExpired_ResuspendsWithoutNewCheckpoint() + { + var futureExpirationMs = DateTimeOffset.UtcNow.AddSeconds(300).ToUnixTimeMilliseconds(); + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = futureExpirationMs } + } + } + }); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var waitTask = context.WaitAsync(TimeSpan.FromSeconds(30), name: "pending_wait"); + + await Task.Delay(10); + + Assert.True(tm.IsTerminated); + Assert.False(waitTask.IsCompleted); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task WaitAsync_AlreadySucceeded_ContinuesImmediately() + { + var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Succeeded + } + } + }); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "done_wait"); + // Completed without blocking + } + + [Fact] + public async Task WaitAsync_UnknownStatus_ThrowsNonDeterministicException() + { + // Unrecognized status on a replayed wait checkpoint must surface as + // NonDeterministicExecutionException — silently re-emitting WAIT START + // would either fail at the service or duplicate work. 
+ var context = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = "TOTALLY_BOGUS_STATUS" + } + } + }); + + await Assert.ThrowsAsync(() => + context.WaitAsync(TimeSpan.FromSeconds(30), name: "mystery_wait")); + } + + #endregion + + #region End-to-end: Step + Wait + Step + + [Fact] + public async Task EndToEnd_StepWaitStep_FirstInvocation_SuspendsOnWait() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + await context.StepAsync(async (_) => { await Task.CompletedTask; return "fetched"; }, name: "fetch"); + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + var final = await context.StepAsync(async (_) => { await Task.CompletedTask; return "processed"; }, name: "process"); + return final; + }); + + Assert.Equal(InvocationStatus.Pending, result.Status); + } + + [Fact] + public async Task EndToEnd_StepWaitStep_SecondInvocation_Completes() + { + var pastExpirationMs = DateTimeOffset.UtcNow.AddSeconds(-5).ToUnixTimeMilliseconds(); + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"fetched\"" } + }, + new() + { + Id = IdAt(2), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = pastExpirationMs } + } + } + }); + + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new 
DurableContext(state, tm, idGen, "arn:test", lambdaContext); + var processExecuted = false; + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + var fetched = await context.StepAsync(async (_) => { await Task.CompletedTask; return "fresh_fetch"; }, name: "fetch"); + Assert.Equal("fetched", fetched); // cached from replay + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + // wait is elapsed, continues + + var final = await context.StepAsync(async (_) => + { + processExecuted = true; + await Task.CompletedTask; + return "processed"; + }, name: "process"); + return final; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal("processed", result.Result); + Assert.True(processExecuted); + } + + #endregion + + #region Non-Determinism Detection Tests + + [Fact] + public async Task StepAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Wait, + Status = OperationStatuses.Succeeded + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(async () => + await context.StepAsync( + async (_) => { await Task.CompletedTask; return "should not run"; }, + name: "my_op")); + + Assert.Contains("expected type 'STEP'", ex.Message); + Assert.Contains("found 'WAIT'", ex.Message); + } + + [Fact] + public async Task WaitAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails 
{ Result = "\"hello\"" } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(async () => + await context.WaitAsync(TimeSpan.FromSeconds(10), name: "my_op")); + + Assert.Contains("expected type 'WAIT'", ex.Message); + Assert.Contains("found 'STEP'", ex.Message); + } + + [Fact] + public async Task StepAsync_ReplayNameMismatch_ThrowsNonDeterministicException() + { + // Simulate a scenario where the operation was stored with a different name + // than what the current code passes (e.g., service returned stale data). + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "old_name", + StepDetails = new StepDetails { Result = "\"old_result\"" } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var ex = await Assert.ThrowsAsync(async () => + await context.StepAsync( + async (_) => { await Task.CompletedTask; return "new"; }, + name: "my_step")); + + Assert.Contains("expected name 'my_step'", ex.Message); + Assert.Contains("found 'old_name'", ex.Message); + } + + [Fact] + public async Task StepAsync_NoReplay_SkipsValidation() + { + var context = CreateContext(); + + var result = await context.StepAsync( + async (_) => { await Task.CompletedTask; return "ok"; }, + name: "anything"); + + Assert.Equal("ok", result); + } + + #endregion + + private class TestPerson + { + public string? 
Name { get; set; } + public int Age { get; set; } + } + + #region StepAsync Retry Tests + + [Fact] + public async Task StepAsync_FailsWithRetryStrategy_CheckpointsRetryAndSuspends() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var stepTask = context.StepAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("transient"); }, + name: "flaky_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential( + maxAttempts: 3, + initialDelay: TimeSpan.FromSeconds(5), + jitter: JitterStrategy.None) + }); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(stepTask.IsCompleted); + + // Fresh attempt 1 emits a fire-and-forget START (telemetry under + // AtLeastOncePerRetry), then a RETRY when the user code throws and + // the retry strategy decides to retry. 
+ var checkpoints = recorder.Flushed; + Assert.Equal(2, checkpoints.Count); + Assert.Equal("START", checkpoints[0].Action); + Assert.Equal("RETRY", checkpoints[1].Action); + Assert.Equal(IdAt(1), checkpoints[1].Id); + Assert.Equal(5, checkpoints[1].StepOptions.NextAttemptDelaySeconds); + } + + [Fact] + public async Task StepAsync_FailsNoRetryStrategy_CheckpointsFail() + { + var context = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.StepAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("permanent"); }, + name: "fail_step")); + + Assert.Equal("permanent", ex.Message); + } + + [Fact] + public async Task StepAsync_RetryExhausted_CheckpointsFail() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Pending, + StepDetails = new StepDetails + { + Attempt = 2, + NextAttemptTimestamp = DateTimeOffset.UtcNow.AddSeconds(-10).ToUnixTimeMilliseconds() + } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + // Attempt 3 (last one) — should fail after this + var ex = await Assert.ThrowsAsync(() => + context.StepAsync( + async (_) => { await Task.CompletedTask; throw new InvalidOperationException("still failing"); }, + name: "exhaust_step", + config: new StepConfig + { + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3, jitter: JitterStrategy.None) + })); + + Assert.Equal("still failing", ex.Message); + + // Fresh attempt 3 emits a fire-and-forget START (telemetry under + // AtLeastOncePerRetry), then a FAIL after the retry strategy gives up. 
+ var checkpoints = recorder.Flushed; + Assert.Equal(2, checkpoints.Count); + Assert.Equal("START", checkpoints[0].Action); + Assert.Equal("FAIL", checkpoints[1].Action); + } + + [Fact] + public async Task StepAsync_PendingWithFutureTimestamp_Suspends() + { + var futureMs = DateTimeOffset.UtcNow.AddSeconds(300).ToUnixTimeMilliseconds(); + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Pending, + StepDetails = new StepDetails + { + Attempt = 1, + NextAttemptTimestamp = futureMs + } + } + } + }); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var stepTask = context.StepAsync( + async (_) => { await Task.CompletedTask; return "should not run"; }, + name: "pending_step", + config: new StepConfig { RetryStrategy = RetryStrategy.Default }); + + await Task.Delay(50); + + Assert.True(tm.IsTerminated); + Assert.False(stepTask.IsCompleted); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task StepAsync_PendingWithPastTimestamp_ReExecutes() + { + var pastMs = DateTimeOffset.UtcNow.AddSeconds(-10).ToUnixTimeMilliseconds(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Pending, + StepDetails = new StepDetails + { + Attempt = 1, + NextAttemptTimestamp = pastMs + } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var result = await context.StepAsync( + async 
(ctx) => + { + await Task.CompletedTask; + Assert.Equal(2, ctx.AttemptNumber); + return "retry success"; + }, + name: "retry_step", + config: new StepConfig { RetryStrategy = RetryStrategy.Default }); + + Assert.Equal("retry success", result); + } + + [Fact] + public async Task StepAsync_ReadyReplay_AdvancesAttemptAndExecutes() + { + // READY = service has post-PENDING re-invoked us; the retry timer + // already fired so no timestamp check is needed. Just advance the + // attempt counter and run. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Ready, + StepDetails = new StepDetails { Attempt = 2 } + } + } + }); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + + var executed = false; + var result = await context.StepAsync( + async (ctx) => + { + executed = true; + Assert.Equal(3, ctx.AttemptNumber); + await Task.CompletedTask; + return "ok"; + }, + name: "ready_step", + config: new StepConfig { RetryStrategy = RetryStrategy.Default }); + + Assert.True(executed); + Assert.Equal("ok", result); + Assert.False(tm.IsTerminated); + Assert.False(state.IsReplaying); + } + + [Fact] + public async Task StepAsync_AtMostOnce_FlushesStartBeforeExecution() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + IReadOnlyList? 
flushedAtFuncEntry = null; + + var result = await context.StepAsync( + async (_) => + { + flushedAtFuncEntry = recorder.Flushed.Select(o => o.Action.ToString()).ToArray(); + await Task.CompletedTask; + return "done"; + }, + name: "amo_step", + config: new StepConfig { Semantics = StepSemantics.AtMostOncePerRetry }); + + Assert.Equal("done", result); + + // START must be flushed before user func runs (AtMostOnce invariant). + Assert.NotNull(flushedAtFuncEntry); + Assert.Equal(new[] { "START" }, flushedAtFuncEntry); + + // After step returns, SUCCEED has also been flushed. + var actions = recorder.Flushed.Select(o => o.Action.ToString()).ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, actions); + } + + [Fact] + public async Task StepAsync_AtMostOnce_StartedReplay_TriggersRetryHandler() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Started + } + } + }); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + var executed = false; + var stepTask = context.StepAsync( + async (_) => { executed = true; await Task.CompletedTask; return "should not run"; }, + name: "amo_replay", + config: new StepConfig + { + Semantics = StepSemantics.AtMostOncePerRetry, + RetryStrategy = RetryStrategy.Exponential(maxAttempts: 3, jitter: JitterStrategy.None) + }); + + await Task.Delay(50); + + Assert.False(executed); + Assert.True(tm.IsTerminated); + Assert.False(stepTask.IsCompleted); + + var checkpoints = recorder.Flushed; + Assert.Single(checkpoints); + Assert.Equal("RETRY", checkpoints[0].Action); + } + + #endregion +} diff --git 
a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableExecutionHandlerTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableExecutionHandlerTests.cs new file mode 100644 index 000000000..76062a682 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableExecutionHandlerTests.cs @@ -0,0 +1,149 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +// Exercises DurableExecutionHandler.RunAsync: every test builds a fresh ExecutionState (LoadFromCheckpoint(null)) and TerminationManager, runs a delegate through RunAsync, and asserts on the returned Status/Result/Message/Exception. +public class DurableExecutionHandlerTests +{ + // Delegate returns normally: expects Succeeded, the returned string in Result, and a null Exception. + [Fact] + public async Task RunAsync_UserCodeCompletes_ReturnsSucceeded() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + await Task.Delay(1); + return "hello"; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal("hello", result.Result); + Assert.Null(result.Exception); + } + + // Delegate throws: expects Failed, the exception's message in Message, and the exception itself in Exception. + [Fact] + public async Task RunAsync_UserCodeThrows_ReturnsFailed() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + await Task.Delay(1); + throw new InvalidOperationException("something broke"); + }); + + Assert.Equal(InvocationStatus.Failed, result.Status); + Assert.Equal("something broke", result.Message); + // NOTE(review): type argument restored - it was missing in this copy; InvalidOperationException is what the delegate throws above. Confirm against the original file. + Assert.IsType<InvalidOperationException>(result.Exception); + } + + // Delegate calls Terminate (no exception) and then never completes: expects Pending with the termination message and a null Exception. + [Fact] + public async Task RunAsync_TerminationWins_ReturnsPending() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + // Simulate: user code hits a wait, signals 
termination, then blocks forever + termination.Terminate(TerminationReason.WaitScheduled, "waiting 30s"); + await new TaskCompletionSource().Task; // blocks forever + return "unreachable"; + }); + + Assert.Equal(InvocationStatus.Pending, result.Status); + Assert.Equal("waiting 30s", result.Message); + Assert.Null(result.Exception); + } + + // Delegate calls Terminate with an exception and then never completes: expects Failed with an exception attached. + [Fact] + public async Task RunAsync_TerminationWithException_ReturnsFailed() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + termination.Terminate( + TerminationReason.CheckpointFailed, + "checkpoint error", + new InvalidOperationException("service unavailable")); + await new TaskCompletionSource().Task; + return "unreachable"; + }); + + Assert.Equal(InvocationStatus.Failed, result.Status); + // NOTE(review): type argument restored - it was missing in this copy; InvalidOperationException is the exception handed to Terminate above. Confirm against the original file. + Assert.IsType<InvalidOperationException>(result.Exception); + } + + // Delegate returns without ever calling Terminate: expects Succeeded with the returned value. + [Fact] + public async Task RunAsync_FastUserCode_BeatsTermination() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + // User code completes before termination is called + return 42; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal(42, result.Result); + } + + // Same success path with an int result, to cover value-type results. + [Fact] + public async Task RunAsync_IntResult_WorksWithValueTypes() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var termination = new TerminationManager(); + + var result = await DurableExecutionHandler.RunAsync( + state, + termination, + async () => + { + await Task.CompletedTask; + return 100; + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal(100, result.Result); + } +} +diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableFunctionTests.cs 
b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableFunctionTests.cs new file mode 100644 index 000000000..8078b0242 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableFunctionTests.cs @@ -0,0 +1,783 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Net; +using System.Text.Json; +using Amazon.Lambda; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Amazon.Runtime; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class DurableFunctionTests +{ + /// Reproduces the Id that emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private readonly IAmazonLambda _mockClient = new MockLambdaClient(); + + [Fact] + public async Task WrapAsync_FreshExecution_StepThenWait_ReturnsPending() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:order-123", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-123\"}" } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + MyWorkflow, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Pending, output.Status); + } + + [Fact] + public async Task WrapAsync_ReplayWithElapsedWait_ReturnsSucceeded() + { + var pastExpirationMs = DateTimeOffset.UtcNow.AddSeconds(-5).ToUnixTimeMilliseconds(); + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:order-123", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-123\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "{\"IsValid\":true}" } + }, + new() + { + Id = IdAt(2), + Type = OperationTypes.Wait, + Status = OperationStatuses.Pending, + WaitDetails = new WaitDetails { ScheduledEndTimestamp = pastExpirationMs } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + MyWorkflow, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + 
Assert.NotNull(output.Result); + var result = JsonSerializer.Deserialize(output.Result!); + Assert.Equal("approved", result!.Status); + } + + [Fact] + public async Task WrapAsync_WorkflowThrows_ReturnsFailed() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:fail-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"bad-order\"}" } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => throw new InvalidOperationException("workflow error"), + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Failed, output.Status); + Assert.NotNull(output.Error); + Assert.Equal("workflow error", output.Error!.ErrorMessage); + Assert.Contains("InvalidOperationException", output.Error.ErrorType!); + } + + [Fact] + public async Task WrapAsync_VoidWorkflow_ReturnSucceeded() + { + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:void-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + } + } + }; + + var executed = false; + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => + { + await ctx.StepAsync(async (_) => { await Task.CompletedTask; executed = true; }, name: "do_work"); + }, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + Assert.True(executed); + } + + [Fact] + public async Task WrapAsync_CheckpointsAreSentToService() + { + var mockClient = 
new MockLambdaClient(); + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:checkpoint-test", + CheckpointToken = "initial-token", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + MyWorkflow, + input, + CreateLambdaContext(), + mockClient); + + Assert.Equal(InvocationStatus.Pending, output.Status); + + // Each StepAsync emits a fire-and-forget START before user code runs + // (telemetry under AtLeastOncePerRetry). With FlushInterval = 0 the + // worker may flush the START on its own before SUCCEED arrives, so the + // exact batching of START vs SUCCEED is timing-dependent. Assert on + // the flat sequence of updates instead. + var allUpdates = mockClient.CheckpointCalls + .SelectMany(c => c.Updates) + .ToList(); + + // Expect: step START, step SUCCEED, wait START (in that order). + Assert.Equal(3, allUpdates.Count); + + Assert.Equal("STEP", allUpdates[0].Type); + Assert.Equal("START", allUpdates[0].Action); + Assert.Equal("validate", allUpdates[0].Name); + + Assert.Equal("STEP", allUpdates[1].Type); + Assert.Equal("SUCCEED", allUpdates[1].Action); + Assert.Equal("validate", allUpdates[1].Name); + Assert.NotNull(allUpdates[1].Payload); + + Assert.Equal("WAIT", allUpdates[2].Type); + Assert.Equal("START", allUpdates[2].Action); + Assert.Equal("delay", allUpdates[2].Name); + Assert.NotNull(allUpdates[2].WaitOptions); + Assert.Equal(30, allUpdates[2].WaitOptions.WaitSeconds); + + // The first call sends the initial checkpoint token. 
+ Assert.Equal("arn:aws:lambda:us-east-1:123:durable-execution:checkpoint-test", mockClient.CheckpointCalls[0].DurableExecutionArn); + Assert.Equal("initial-token", mockClient.CheckpointCalls[0].CheckpointToken); + } + + [Fact] + public async Task WrapAsync_UserPayload_BindsCamelCaseToPascalCaseProperty() + { + // The wire payload uses camelCase ("orderId"), the user POCO uses PascalCase (OrderId). + // ExtractUserPayload must do case-insensitive binding so workflows can read input.OrderId. + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:case-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"abc-123\"}" } + } + } + } + }; + + string? observedOrderId = null; + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => + { + observedOrderId = evt.OrderId; + await Task.CompletedTask; + return new OrderResult { Status = "ok", OrderId = evt.OrderId }; + }, + input, + CreateLambdaContext(), + _mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + Assert.Equal("abc-123", observedOrderId); + } + + [Fact] + public async Task WrapAsync_NoExecutionOp_ThrowsMalformedEnvelope() + { + // No EXECUTION operation in the envelope — ExtractUserPayload must throw a typed + // DurableExecutionException so the malformed envelope surfaces as a clear error + // instead of leaking default!/null into user code as a NullReferenceException. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:no-exec", + InitialExecutionState = new InitialExecutionState + { + Operations = new List() + } + }; + + var ex = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + async (evt, ctx) => + { + await Task.CompletedTask; + return new OrderResult { Status = "ok" }; + }, + input, + CreateLambdaContext(), + _mockClient)); + + Assert.Contains("malformed", ex.Message, StringComparison.OrdinalIgnoreCase); + Assert.Contains("EXECUTION", ex.Message); + } + + [Fact] + public async Task WrapAsync_PaginatedInitialState_HydratesAllPages() + { + // The service can return execution state across multiple pages — the first + // page comes inline on the invocation envelope (InitialExecutionState) and + // subsequent pages must be fetched via GetDurableExecutionState. Verify the + // pagination loop in WrapAsyncCore (DurableFunction.cs:160-167) walks every + // page so the workflow sees the full operation history on replay. + var arn = "arn:aws:lambda:us-east-1:123:durable-execution:paginated"; + + // Page 0 (in InitialExecutionState): EXECUTION op + step1 SUCCEEDED. + // Page 1 (fetched with marker "marker-1"): step2 SUCCEEDED, points to marker-2. + // Page 2 (fetched with marker "marker-2"): step3 SUCCEEDED, no NextMarker — loop exits. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = arn, + CheckpointToken = "ckpt-0", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"page-0-result\"" } + } + }, + NextMarker = "marker-1" + } + }; + + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = req => req.Marker switch + { + "marker-1" => new Amazon.Lambda.Model.GetDurableExecutionStateResponse + { + Operations = new List + { + new() + { + Id = IdAt(2), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new Amazon.Lambda.Model.StepDetails { Result = "\"page-1-result\"" } + } + }, + NextMarker = "marker-2" + }, + "marker-2" => new Amazon.Lambda.Model.GetDurableExecutionStateResponse + { + Operations = new List + { + new() + { + Id = IdAt(3), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new Amazon.Lambda.Model.StepDetails { Result = "\"page-2-result\"" } + } + } + // NextMarker omitted -> loop terminates. + }, + _ => throw new InvalidOperationException($"Unexpected marker: {req.Marker}") + } + }; + + var observed = new List(); + var output = await DurableFunction.WrapAsync( + async (evt, ctx) => + { + // All three steps must replay the cached results from the paginated state + // without re-executing — if the loop missed a page, the corresponding step + // would run fresh and append a different value to `observed`. 
+ observed.Add(await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, name: "step1")); + observed.Add(await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, name: "step2")); + observed.Add(await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return "fresh"; }, name: "step3")); + return new OrderResult { Status = "ok", OrderId = evt.OrderId }; + }, + input, + CreateLambdaContext(), + mockClient); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + + // Two GetDurableExecutionState calls — one per fetched page (page 0 was inline). + Assert.Equal(2, mockClient.GetExecutionStateCalls.Count); + Assert.Equal("marker-1", mockClient.GetExecutionStateCalls[0].Marker); + Assert.Equal(arn, mockClient.GetExecutionStateCalls[0].DurableExecutionArn); + Assert.Equal("ckpt-0", mockClient.GetExecutionStateCalls[0].CheckpointToken); + Assert.Equal("marker-2", mockClient.GetExecutionStateCalls[1].Marker); + + // The workflow saw replayed results from ALL three pages — none re-executed. + Assert.Equal(new[] { "page-0-result", "page-1-result", "page-2-result" }, observed); + + // No checkpoints were written: every step replayed from cache. + Assert.Empty(mockClient.CheckpointCalls); + } + + [Fact] + public async Task WrapAsync_NullInitialExecutionState_ThrowsMalformedEnvelope() + { + // No initial execution state at all — same malformed-envelope branch in ExtractUserPayload. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:null-state" + }; + + var ex = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + async (evt, ctx) => + { + await Task.CompletedTask; + return new OrderResult { Status = "ok" }; + }, + input, + CreateLambdaContext(), + _mockClient)); + + Assert.Contains("malformed", ex.Message, StringComparison.OrdinalIgnoreCase); + } + + // ────────────────────────────────────────────────────────────────────── + // IsTerminalCheckpointError classification (mirrors CheckpointError in + // aws-durable-execution-sdk-python): + // 4xx (except 429) → terminal (Failed envelope) + // 429 / 5xx / no status → transient (escapes to host for Lambda retry) + // Carve-out: InvalidParameterValueException "Invalid Checkpoint Token" → transient + // + // Driven through CheckpointDurableExecution: a workflow that succeeds a single Step + // forces the batcher to flush, which is wrapped by the try/catch in WrapAsyncCore. 
+ // ────────────────────────────────────────────────────────────────────── + + public static IEnumerable TerminalCheckpointErrorCases() => new[] + { + new object[] { MakeServiceException("ResourceNotFoundException", HttpStatusCode.NotFound, "ARN not found") }, + new object[] { MakeServiceException("AccessDeniedException", HttpStatusCode.Forbidden, "denied") }, + new object[] { MakeServiceException("KMSAccessDeniedException", HttpStatusCode.BadRequest, "kms denied") }, + new object[] { MakeServiceException("ValidationException", HttpStatusCode.BadRequest, "bad input") }, + new object[] { MakeServiceException("InvalidParameterValueException", HttpStatusCode.BadRequest, "Some other parameter") }, + }; + + [Theory] + [MemberData(nameof(TerminalCheckpointErrorCases))] + public async Task WrapAsync_CheckpointThrowsTerminal_ReturnsFailed(AmazonServiceException ex) + { + // LambdaDurableServiceClient now wraps SDK exceptions in DurableExecutionException + // so user logs carry context (which call, which ARN). The outer message includes + // the inner SDK message; the classifier matches on the wrapper's InnerException. 
+ var input = MakeCheckpointInput(); + var mockClient = new MockLambdaClient { CheckpointThrows = ex }; + + var output = await DurableFunction.WrapAsync( + SingleStepWorkflow, input, CreateLambdaContext(), mockClient); + + Assert.Equal(InvocationStatus.Failed, output.Status); + Assert.NotNull(output.Error); + Assert.Contains(ex.Message, output.Error!.ErrorMessage); + Assert.Contains("Failed to checkpoint", output.Error.ErrorMessage); + } + + public static IEnumerable TransientCheckpointErrorCases() => new[] + { + // 5xx + new object[] { MakeServiceException("InternalServerError", HttpStatusCode.InternalServerError, "boom") }, + new object[] { MakeServiceException("ServiceUnavailable", HttpStatusCode.ServiceUnavailable, "down") }, + // 429 + new object[] { MakeServiceException("TooManyRequestsException", (HttpStatusCode)429, "throttled") }, + // No status (network / SDK-internal). HttpStatusCode default (0) → classifier treats < 400 as transient. + new object[] { MakeServiceException("RequestTimeout", 0, "timeout") }, + // Carve-out: stale checkpoint token is transient. + new object[] { MakeServiceException("InvalidParameterValueException", HttpStatusCode.BadRequest, "Invalid Checkpoint Token: stale") }, + }; + + [Theory] + [MemberData(nameof(TransientCheckpointErrorCases))] + public async Task WrapAsync_CheckpointThrowsTransient_PropagatesToHost(AmazonServiceException ex) + { + // Transient SDK errors escape the IsTerminalCheckpointError catch and propagate + // to the host as DurableExecutionException wrapping the original SDK exception + // — Lambda's normal retry semantics fire on the wrapper. The original SDK + // exception is preserved as InnerException so callers can still introspect + // the original status code / error code. 
+ var input = MakeCheckpointInput(); + var mockClient = new MockLambdaClient { CheckpointThrows = ex }; + + var thrown = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + SingleStepWorkflow, input, CreateLambdaContext(), mockClient)); + + Assert.Same(ex, thrown.InnerException); + } + + [Fact] + public async Task WrapAsync_HydrationThrows_AlwaysPropagatesToHost() + { + // State hydration is OUTSIDE the IsTerminalCheckpointError try/catch — every + // GetExecutionStateAsync failure escapes for Lambda retry. Use a 4xx that + // *would* be terminal if it came from a checkpoint flush to prove the path + // isn't classified. + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:hydrate-fail", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + }, + NextMarker = "page-1" // force the hydration loop to run + } + }; + var ex = MakeServiceException("ResourceNotFoundException", HttpStatusCode.NotFound, "ARN gone"); + var mockClient = new MockLambdaClient { GetExecutionStateThrows = ex }; + + // Hydration errors are wrapped in DurableExecutionException by + // LambdaDurableServiceClient.GetExecutionStateAsync but are NOT caught by the + // IsTerminalCheckpointError filter, so they escape to the host. 
+ var thrown = await Assert.ThrowsAsync(() => + DurableFunction.WrapAsync( + MyWorkflow, input, CreateLambdaContext(), mockClient)); + + Assert.Same(ex, thrown.InnerException); + Assert.Contains("Failed to fetch execution state", thrown.Message); + } + + private static AmazonServiceException MakeServiceException(string code, HttpStatusCode status, string message) + { + return new AmazonServiceException(message, innerException: null, ErrorType.Unknown, code, requestId: "req-1", statusCode: status); + } + + private static DurableExecutionInvocationInput MakeCheckpointInput() => new() + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:checkpoint-fail", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"orderId\":\"order-1\"}" } + } + } + } + }; + + private static async Task SingleStepWorkflow(OrderEvent input, IDurableContext context) + { + // A single successful step forces a checkpoint flush, which the mock client fails. + await context.StepAsync(async (_) => { await Task.CompletedTask; return "ok"; }, name: "s1"); + return new OrderResult { Status = "done" }; + } + + [Fact] + public async Task WrapAsync_CreateCallbackThenWait_AllocatesCallbackIdAndSuspends() + { + // End-to-end through the real LambdaDurableServiceClient: the mock + // client returns NewExecutionState carrying a CallbackId on the + // CALLBACK START checkpoint response, and the SDK plumbs it through.
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:cb-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"OrderId\":\"o-1\"}" } + } + } + } + }; + + var capturedCallbackId = (string?)null; + var mockClient = new MockLambdaClient + { + CheckpointHandler = req => + { + // Echo back any CALLBACK START as a STARTED op with a service-allocated id. + var newOps = new List(); + foreach (var u in req.Updates) + { + if (u.Type == OperationTypes.Callback && u.Action == "START") + { + newOps.Add(new Amazon.Lambda.Model.Operation + { + Id = u.Id, + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = u.Name, + CallbackDetails = new Amazon.Lambda.Model.CallbackDetails + { + CallbackId = "servicealloccbid" + } + }); + } + } + return new Amazon.Lambda.Model.CheckpointDurableExecutionResponse + { + NewExecutionState = newOps.Count == 0 + ? null + : new Amazon.Lambda.Model.CheckpointUpdatedExecutionState { Operations = newOps } + }; + } + }; + + var output = await DurableFunction.WrapAsync( + async (e, ctx) => + { + var cb = await ctx.CreateCallbackAsync(name: "approval"); + capturedCallbackId = cb.CallbackId; + var status = await cb.GetResultAsync(); + return new OrderResult { Status = status, OrderId = e.OrderId }; + }, + input, + CreateLambdaContext(), + mockClient); + + Assert.Equal(InvocationStatus.Pending, output.Status); + Assert.Equal("servicealloccbid", capturedCallbackId); + } + + [Fact] + public async Task WrapAsync_ReplayCallbackSucceeded_ReturnsResultAfterSuspend() + { + // Second invocation: the callback's checkpoint is now SUCCEEDED; + // the workflow returns the deserialized result. 
+ var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:aws:lambda:us-east-1:123:durable-execution:cb-test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"OrderId\":\"o-1\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Succeeded, + Name = "approval", + CallbackDetails = new CallbackDetails + { + CallbackId = "servicealloccbid", + Result = "\"approved\"" + } + } + } + } + }; + + var output = await DurableFunction.WrapAsync( + async (e, ctx) => + { + var cb = await ctx.CreateCallbackAsync(name: "approval"); + var status = await cb.GetResultAsync(); + return new OrderResult { Status = status, OrderId = e.OrderId }; + }, + input, + CreateLambdaContext(), + new MockLambdaClient()); + + Assert.Equal(InvocationStatus.Succeeded, output.Status); + Assert.NotNull(output.Result); + var result = JsonSerializer.Deserialize(output.Result!); + Assert.Equal("approved", result!.Status); + } + + [Fact] + public async Task WrapAsync_ReplayDeterminism_CallbackIdStableAcrossInvocations() + { + // First invocation allocates a callback ID via the mock; in a real run + // that ID would be persisted in the service's checkpoint state and + // returned to the second invocation via InitialExecutionState. Verify + // the same ID survives that round-trip (we model "round-trip" by + // replaying with a STARTED checkpoint that carries the same ID). 
+ const string id = "stablecbidreplay"; + var input = new DurableExecutionInvocationInput + { + DurableExecutionArn = "arn:test", + InitialExecutionState = new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "exec-0", + Type = OperationTypes.Execution, + Status = OperationStatuses.Started, + ExecutionDetails = new ExecutionDetails { InputPayload = "{\"OrderId\":\"o-1\"}" } + }, + new() + { + Id = IdAt(1), + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = "approval", + CallbackDetails = new CallbackDetails { CallbackId = id } + } + } + } + }; + + string? observed = null; + var output = await DurableFunction.WrapAsync( + async (e, ctx) => + { + var cb = await ctx.CreateCallbackAsync(name: "approval"); + observed = cb.CallbackId; + var status = await cb.GetResultAsync(); + return new OrderResult { Status = status, OrderId = e.OrderId }; + }, + input, + CreateLambdaContext(), + new MockLambdaClient()); + + Assert.Equal(InvocationStatus.Pending, output.Status); + Assert.Equal(id, observed); + } + + private static async Task MyWorkflow(OrderEvent input, IDurableContext context) + { + var validation = await context.StepAsync( + async (_) => { await Task.CompletedTask; return new ValidationResult { IsValid = true }; }, + name: "validate"); + + await context.WaitAsync(TimeSpan.FromSeconds(30), name: "delay"); + + return new OrderResult { Status = "approved", OrderId = input.OrderId }; + } + + private class OrderEvent + { + public string? OrderId { get; set; } + } + + private class OrderResult + { + public string? Status { get; set; } + public string? 
OrderId { get; set; } + } + + private class ValidationResult + { + public bool IsValid { get; set; } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/EnumsTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/EnumsTests.cs new file mode 100644 index 000000000..36b9b3a70 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/EnumsTests.cs @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class EnumsTests +{ + [Fact] + public void InvocationStatus_HasExpectedValues() + { + Assert.Equal(0, (int)InvocationStatus.Succeeded); + Assert.Equal(1, (int)InvocationStatus.Failed); + Assert.Equal(2, (int)InvocationStatus.Pending); + } + + [Fact] + public void OperationTypes_HasExpectedConstants() + { + Assert.Equal("STEP", OperationTypes.Step); + Assert.Equal("WAIT", OperationTypes.Wait); + Assert.Equal("CALLBACK", OperationTypes.Callback); + Assert.Equal("CHAINED_INVOKE", OperationTypes.ChainedInvoke); + Assert.Equal("CONTEXT", OperationTypes.Context); + Assert.Equal("EXECUTION", OperationTypes.Execution); + } + + [Fact] + public void OperationStatuses_HasExpectedConstants() + { + Assert.Equal("STARTED", OperationStatuses.Started); + Assert.Equal("SUCCEEDED", OperationStatuses.Succeeded); + Assert.Equal("FAILED", OperationStatuses.Failed); + Assert.Equal("PENDING", OperationStatuses.Pending); + Assert.Equal("CANCELLED", OperationStatuses.Cancelled); + Assert.Equal("READY", OperationStatuses.Ready); + Assert.Equal("STOPPED", OperationStatuses.Stopped); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExceptionsTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExceptionsTests.cs new file mode 100644 index 000000000..f89a72cb1 --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExceptionsTests.cs @@ -0,0 +1,267 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ExceptionsTests +{ + [Fact] + public void DurableExecutionException_IsBaseException() + { + var ex = new DurableExecutionException("test error"); + Assert.IsAssignableFrom(ex); + Assert.Equal("test error", ex.Message); + } + + [Fact] + public void DurableExecutionException_WrapsInnerException() + { + var inner = new InvalidOperationException("inner"); + var ex = new DurableExecutionException("outer", inner); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void DurableExecutionException_ParameterlessCtor() + { + var ex = new DurableExecutionException(); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public void StepException_ParameterlessCtor() + { + var ex = new StepException(); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public void StepException_MessageOnlyCtor() + { + var ex = new StepException("step blew up"); + Assert.Equal("step blew up", ex.Message); + } + + [Fact] + public void StepException_WithInnerException() + { + var inner = new InvalidOperationException("inner"); + var ex = new StepException("wrapped", inner); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void StepException_HasErrorProperties() + { + var ex = new StepException("step failed") + { + ErrorType = "System.TimeoutException", + ErrorData = "operation timed out", + OriginalStackTrace = new[] { "at Foo.Bar()", "at Baz.Qux()" } + }; + + Assert.IsAssignableFrom(ex); + Assert.Equal("System.TimeoutException", ex.ErrorType); + Assert.Equal("operation timed out", ex.ErrorData); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + } + + [Fact] + public void CallbackException_BaseClassCtors() + { + var empty = new CallbackException(); + 
Assert.IsAssignableFrom(empty); + + var withMsg = new CallbackException("cb error"); + Assert.Equal("cb error", withMsg.Message); + + var inner = new InvalidOperationException("inner"); + var wrapping = new CallbackException("outer", inner); + Assert.Same(inner, wrapping.InnerException); + } + + [Fact] + public void CallbackException_InitProperties() + { + var ex = new CallbackException("rejected") + { + CallbackId = "cb-1", + ErrorType = "ExternalSystemError", + ErrorData = "{\"reviewer\":\"jane\"}", + OriginalStackTrace = new[] { "at A.B()" } + }; + + Assert.Equal("cb-1", ex.CallbackId); + Assert.Equal("ExternalSystemError", ex.ErrorType); + Assert.Equal("{\"reviewer\":\"jane\"}", ex.ErrorData); + Assert.Single(ex.OriginalStackTrace!); + } + + [Fact] + public void CallbackFailedException_IsCallbackException() + { + var ex = new CallbackFailedException("rejected") { CallbackId = "cb-1" }; + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("rejected", ex.Message); + Assert.Equal("cb-1", ex.CallbackId); + } + + [Fact] + public void CallbackFailedException_AllCtors() + { + Assert.NotNull(new CallbackFailedException()); + Assert.Equal("m", new CallbackFailedException("m").Message); + var inner = new Exception("inner"); + Assert.Same(inner, new CallbackFailedException("m", inner).InnerException); + } + + [Fact] + public void CallbackTimeoutException_IsCallbackException() + { + var ex = new CallbackTimeoutException("timed out") { CallbackId = "cb-1" }; + Assert.IsAssignableFrom(ex); + Assert.Equal("timed out", ex.Message); + } + + [Fact] + public void CallbackTimeoutException_AllCtors() + { + Assert.NotNull(new CallbackTimeoutException()); + Assert.Equal("m", new CallbackTimeoutException("m").Message); + var inner = new Exception("inner"); + Assert.Same(inner, new CallbackTimeoutException("m", inner).InnerException); + } + + [Fact] + public void CallbackSubmitterException_IsCallbackException() + { + var inner = new StepException("submitter 
failed"); + var ex = new CallbackSubmitterException("submitter failed", inner); + Assert.IsAssignableFrom(ex); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void CallbackSubmitterException_AllCtors() + { + Assert.NotNull(new CallbackSubmitterException()); + Assert.Equal("m", new CallbackSubmitterException("m").Message); + } + + #region InvokeException tree + + [Fact] + public void InvokeException_IsDurableExecutionException() + { + var ex = new InvokeException("invoke failed"); + Assert.IsAssignableFrom(ex); + Assert.Equal("invoke failed", ex.Message); + } + + [Fact] + public void InvokeException_ParameterlessCtor() + { + var ex = new InvokeException(); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public void InvokeException_WrapsInnerException() + { + var inner = new InvalidOperationException("inner"); + var ex = new InvokeException("outer", inner); + Assert.Same(inner, ex.InnerException); + } + + [Fact] + public void InvokeException_HasInvokeProperties() + { + var ex = new InvokeException("boom") + { + FunctionName = "arn:aws:lambda:us-east-1:123:function:fn:prod", + ErrorType = "System.TimeoutException", + ErrorData = "{\"detail\":\"x\"}", + OriginalStackTrace = new[] { "at A.B()" } + }; + + Assert.Equal("arn:aws:lambda:us-east-1:123:function:fn:prod", ex.FunctionName); + Assert.Equal("System.TimeoutException", ex.ErrorType); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + Assert.Single(ex.OriginalStackTrace!); + } + + [Fact] + public void InvokeFailedException_IsInvokeException() + { + var ex = new InvokeFailedException("boom") { FunctionName = "fn:prod" }; + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("boom", ex.Message); + Assert.Equal("fn:prod", ex.FunctionName); + } + + [Fact] + public void InvokeFailedException_AllCtorOverloads() + { + var inner = new InvalidOperationException("inner"); + Assert.IsAssignableFrom(new InvokeFailedException()); + Assert.Equal("m", new 
InvokeFailedException("m").Message); + Assert.Same(inner, new InvokeFailedException("m", inner).InnerException); + } + + [Fact] + public void InvokeTimedOutException_IsInvokeException() + { + var ex = new InvokeTimedOutException("timed out"); + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("timed out", ex.Message); + } + + [Fact] + public void InvokeTimedOutException_AllCtorOverloads() + { + var inner = new TimeoutException("inner"); + Assert.IsAssignableFrom(new InvokeTimedOutException()); + Assert.Equal("m", new InvokeTimedOutException("m").Message); + Assert.Same(inner, new InvokeTimedOutException("m", inner).InnerException); + } + + [Fact] + public void InvokeStoppedException_IsInvokeException() + { + var ex = new InvokeStoppedException("stopped"); + Assert.IsAssignableFrom(ex); + Assert.IsAssignableFrom(ex); + Assert.Equal("stopped", ex.Message); + } + + [Fact] + public void InvokeStoppedException_AllCtorOverloads() + { + var inner = new InvalidOperationException("inner"); + Assert.IsAssignableFrom(new InvokeStoppedException()); + Assert.Equal("m", new InvokeStoppedException("m").Message); + Assert.Same(inner, new InvokeStoppedException("m", inner).InnerException); + } + + [Fact] + public void InvokeException_SubclassesCaughtByBase() + { + // Verifies the documented pattern-matching contract: catch + // (InvokeException) catches all three subclasses. 
+ Exception failed = new InvokeFailedException("fail"); + Exception timedOut = new InvokeTimedOutException("timeout"); + Exception stopped = new InvokeStoppedException("stop"); + + Assert.True(failed is InvokeException); + Assert.True(timedOut is InvokeException); + Assert.True(stopped is InvokeException); + } + + #endregion +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExecutionStateTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExecutionStateTests.cs new file mode 100644 index 000000000..a55c9912e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ExecutionStateTests.cs @@ -0,0 +1,260 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ExecutionStateTests +{ + private const string ExecutionInputId = "exec-input"; + + private static Operation ExecutionInputOp(string id = ExecutionInputId) => new() + { + Id = id, + Type = OperationTypes.Execution, + Status = OperationStatuses.Started + }; + + private static Operation StepOp(string id, string status, string? 
name = null) => new() + { + Id = id, + Type = OperationTypes.Step, + Status = status, + Name = name, + StepDetails = new StepDetails { Result = "true" } + }; + + [Fact] + public void LoadFromCheckpoint_NullState_NotReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + + Assert.False(state.IsReplaying); + Assert.Equal(0, state.CheckpointedOperationCount); + } + + [Fact] + public void LoadFromCheckpoint_EmptyOperations_NotReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState { Operations = new List() }); + + Assert.False(state.IsReplaying); + Assert.Equal(0, state.CheckpointedOperationCount); + } + + [Fact] + public void LoadFromCheckpoint_OnlyExecutionInputOp_NotReplaying() + { + // The service sends one EXECUTION-type op carrying the input payload + // even on the first invocation. That op is bookkeeping, not user + // history — it must not put us into replay mode. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List { ExecutionInputOp() } + }); + + Assert.False(state.IsReplaying); + Assert.Equal(1, state.CheckpointedOperationCount); + } + + [Fact] + public void LoadFromCheckpoint_WithReplayableOperations_IsReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0-fetch_user", OperationStatuses.Succeeded) + } + }); + + Assert.True(state.IsReplaying); + Assert.Equal(2, state.CheckpointedOperationCount); + } + + [Fact] + public void TrackReplay_FlipsOutOfReplay_OnceAllCompletedOpsVisited() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0", OperationStatuses.Succeeded), + StepOp("1", OperationStatuses.Succeeded), + } + }); + Assert.True(state.IsReplaying); + + state.TrackReplay("0"); + 
Assert.True(state.IsReplaying); // 1-of-2 completed ops visited + + state.TrackReplay("1"); + Assert.False(state.IsReplaying); // all completed ops visited → fresh + } + + [Fact] + public void TrackReplay_PendingOpDoesNotBlockTransition() + { + // A PENDING op (e.g. retry timer waiting) is not "completed" in the + // checkpoint sense — once the workflow has visited every terminally- + // completed op the SDK treats subsequent code as fresh. Terminal set + // is {SUCCEEDED, FAILED, CANCELLED, STOPPED, TIMED_OUT}. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0", OperationStatuses.Succeeded), + StepOp("1", OperationStatuses.Pending), + } + }); + Assert.True(state.IsReplaying); + + state.TrackReplay("0"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void TrackReplay_IsIdempotent() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + StepOp("0", OperationStatuses.Succeeded), + } + }); + + state.TrackReplay("0"); + Assert.False(state.IsReplaying); + + // Second call is a no-op.
+ state.TrackReplay("0"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void TrackReplay_NoOpWhenNotReplaying() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + Assert.False(state.IsReplaying); + + state.TrackReplay("anything"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void GetOperation_ReturnsCheckpointedRecord() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + StepOp("0-validate", OperationStatuses.Succeeded) + } + }); + + var op = state.GetOperation("0-validate"); + Assert.NotNull(op); + Assert.Equal(OperationStatuses.Succeeded, op!.Status); + } + + [Fact] + public void GetOperation_ReturnsNull_WhenNotFound() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + + var op = state.GetOperation("0-nonexistent"); + Assert.Null(op); + } + + [Fact] + public void HasOperation_ReturnsTrueForExisting() + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List { StepOp("0-step_a", OperationStatuses.Succeeded) } + }); + + Assert.True(state.HasOperation("0-step_a")); + Assert.False(state.HasOperation("1-step_b")); + } + + [Fact] + public void TrackReplay_TerminalSet_IncludesTimedOut() + { + // TIMED_OUT is a terminal state (matches Python/JS/Java reference SDKs). + // A timed-out chained-invoke that has been visited must allow the + // replay-mode flag to flip; otherwise IsReplaying would stay stuck on + // for the rest of the invocation and downstream replay-aware features + // (e.g., the future replay-aware logger) would mis-fire. 
+ var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + ExecutionInputOp(), + new() + { + Id = "0-invoke", + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.TimedOut + } + } + }); + Assert.True(state.IsReplaying); + + state.TrackReplay("0-invoke"); + Assert.False(state.IsReplaying); + } + + [Fact] + public void GetOperation_ReturnsLatestRecord_WhenIdAppearsMultipleTimes() + { + // Wire format: when the service replays an envelope it includes the + // most recent record per ID. We key by ID alone and rely on the service + // to provide the authoritative record. + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = "0-payment", + Type = OperationTypes.Step, + Status = OperationStatuses.Started + }, + new() + { + Id = "0-payment", + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + StepDetails = new StepDetails { Result = "\"paid\"" } + } + } + }); + + var op = state.GetOperation("0-payment"); + Assert.NotNull(op); + Assert.Equal(OperationStatuses.Succeeded, op!.Status); + Assert.Equal("\"paid\"", op.StepDetails?.Result); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs new file mode 100644 index 000000000..eb8b7a757 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs @@ -0,0 +1,577 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class InvokeOperationTests +{ + /// Reproduces the Id that OperationIdGenerator.HashOperationId yields for the n-th root-level operation (the hash of the position's string form). + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + private const string FunctionArn = "arn:aws:lambda:us-east-1:123456789012:function:downstream:prod"; + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental.
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + #region Argument validation + + [Fact] + public async Task InvokeAsync_NullFunctionName_ThrowsArgumentNullException() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(functionName: null!, payload: "x")); + } + + [Fact] + public async Task InvokeAsync_EmptyFunctionName_ThrowsArgumentException() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(functionName: "", payload: "x")); + } + + [Fact] + public async Task InvokeAsync_WhitespaceFunctionName_ThrowsArgumentException() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(functionName: " ", payload: "x")); + } + + [Fact] + public async Task InvokeAsync_PreservesUnqualifiedArn_AndPassesItThrough() + { + // The SDK does NOT regex-validate qualified ARNs. The service enforces + // that rule. We verify the value is propagated unmodified to the + // ChainedInvokeOptions.FunctionName so that service-side rejection + // surfaces with the user's exact input. 
+ var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync( + "arn:aws:lambda:us-east-1:123456789012:function:no-version", + payload: "x", + name: "noversion"); + + await Task.Delay(20); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + var start = recorder.Flushed.Single(o => o.Action == "START"); + Assert.Equal("arn:aws:lambda:us-east-1:123456789012:function:no-version", + start.ChainedInvokeOptions.FunctionName); + } + + #endregion + + #region Fresh execution + + [Fact] + public async Task InvokeAsync_FreshExecution_CheckpointsStartAndSuspends() + { + var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync( + FunctionArn, + new RequestPayload { Amount = 100, Currency = "USD" }, + name: "process_payment", + config: new InvokeConfig { TenantId = "tenant-A" }); + + // Service-side suspend mechanics: TerminationManager fires before the + // user task completes; the task itself never resolves on the fresh path. + await Task.Delay(20); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + await recorder.Batcher.DrainAsync(); + + var start = recorder.Flushed.Single(); + Assert.Equal("CHAINED_INVOKE", start.Type); + Assert.Equal("START", start.Action); + Assert.Equal("ChainedInvoke", start.SubType); + Assert.Equal("process_payment", start.Name); + Assert.Equal(IdAt(1), start.Id); + + // Payload is JSON-serialized via the registered ILambdaSerializer. + Assert.Contains("\"Amount\":100", start.Payload); + Assert.Contains("\"Currency\":\"USD\"", start.Payload); + + // ChainedInvokeOptions carries function name + tenant id. 
+ Assert.NotNull(start.ChainedInvokeOptions); + Assert.Equal(FunctionArn, start.ChainedInvokeOptions.FunctionName); + Assert.Equal("tenant-A", start.ChainedInvokeOptions.TenantId); + } + + [Fact] + public async Task InvokeAsync_FreshExecution_NoTenantId_OmitsTenantId() + { + var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync(FunctionArn, "payload", name: "no_tenant"); + + await Task.Delay(20); + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + await recorder.Batcher.DrainAsync(); + + var start = recorder.Flushed.Single(); + Assert.NotNull(start.ChainedInvokeOptions); + Assert.Equal(FunctionArn, start.ChainedInvokeOptions.FunctionName); + // null tenant means the SDK didn't set the field; the AWS SDK model's + // IsSet property is what callers actually inspect, but the easy + // deterministic assertion is that the property is null. + Assert.Null(start.ChainedInvokeOptions.TenantId); + } + + [Fact] + public async Task InvokeAsync_FreshExecution_StartIsSyncFlushed() + { + // Critical correctness invariant: START must be flushed BEFORE we + // suspend. A queued-but-unflushed START is "the service doesn't know + // about the chained invocation," so the parent suspends forever. + var (context, recorder, tm, _) = CreateContext(); + + var task = context.InvokeAsync(FunctionArn, "x", name: "sync_flush"); + await Task.Delay(20); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // No DrainAsync — the START must already be flushed at the moment + // suspension is signaled. This mirrors WaitOperation_NewExecution_SignalsTermination's + // contract: TerminationManager firing implies the matching START is durable. 
+ Assert.Single(recorder.Flushed); + Assert.Equal("START", recorder.Flushed[0].Action); + } + + [Fact] + public async Task InvokeAsync_TerminationReason_IsInvokePending() + { + var (context, _, tm, _) = CreateContext(); + + _ = context.InvokeAsync(FunctionArn, "x", name: "reason_check"); + var termination = await tm.TerminationTask; + + Assert.Equal(TerminationReason.InvokePending, termination.Reason); + } + + [Fact] + public async Task InvokeAsync_NoSerializerRegistered_ThrowsInvalidOperationException() + { + // If the user constructs a Lambda runtime without a serializer (or in + // tests, neglects to set TestLambdaContext.Serializer), InvokeAsync + // surfaces a helpful error rather than NREing inside InvokeOperation. + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = new TestLambdaContext(); // no serializer! + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "no_serializer")); + } + + #endregion + + #region Replay — terminal status mapping + + [Fact] + public async Task InvokeAsync_ReplaySucceeded_ReturnsCachedResultWithoutRescheduling() + { + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Succeeded, + Name = "cached", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Result = "{\"OrderId\":\"abc\",\"Total\":42}" + } + } + } + }); + + var result = await context.InvokeAsync( + FunctionArn, "x", name: "cached"); + + Assert.False(tm.IsTerminated); + Assert.Equal("abc", result.OrderId); + Assert.Equal(42, result.Total); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async 
Task InvokeAsync_ReplayFailed_ThrowsInvokeFailedException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Failed, + Name = "boom", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "downstream exploded", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new[] { "at A.B()", "at C.D()" } + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "boom")); + + Assert.Equal("downstream exploded", ex.Message); + Assert.Equal(FunctionArn, ex.FunctionName); + Assert.Equal("System.InvalidOperationException", ex.ErrorType); + Assert.Equal("{\"detail\":\"x\"}", ex.ErrorData); + Assert.NotNull(ex.OriginalStackTrace); + Assert.Equal(2, ex.OriginalStackTrace!.Count); + + // Subclass relationship — `catch (InvokeException)` catches all three. 
+ Assert.IsAssignableFrom(ex); + } + + [Fact] + public async Task InvokeAsync_ReplayTimedOut_ThrowsInvokeTimedOutException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.TimedOut, + Name = "slow", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Error = new ErrorObject + { + ErrorMessage = "execution timed out after 60s" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "slow")); + + Assert.Equal("execution timed out after 60s", ex.Message); + Assert.Equal(FunctionArn, ex.FunctionName); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public async Task InvokeAsync_ReplayStopped_ThrowsInvokeStoppedException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Stopped, + Name = "stopped" + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "stopped")); + + // No recorded ErrorMessage → fallback default. + Assert.Equal("Chained invoke was stopped.", ex.Message); + Assert.Equal(FunctionArn, ex.FunctionName); + Assert.IsAssignableFrom(ex); + } + + [Fact] + public async Task InvokeAsync_ReplayStarted_ResuspendsWithoutRecheckpoint() + { + // Service hasn't reached terminal yet. The original START is still + // authoritative; do not re-emit, just suspend. 
+ var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Started, + Name = "still_running" + } + } + }); + + var task = context.InvokeAsync(FunctionArn, "x", name: "still_running"); + await Task.Delay(20); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + + // Crucially: no checkpoint was emitted. Original START is authoritative. + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task InvokeAsync_ReplayPending_ResuspendsWithoutRecheckpoint() + { + var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Pending, + Name = "pending" + } + } + }); + + var task = context.InvokeAsync(FunctionArn, "x", name: "pending"); + await Task.Delay(20); + + Assert.True(tm.IsTerminated); + Assert.False(task.IsCompleted); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task InvokeAsync_ReplayUnknownStatus_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = "TOTALLY_BOGUS", + Name = "mystery" + } + } + }); + + await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: "mystery")); + } + + [Fact] + public async Task InvokeAsync_ReplayTypeMismatch_ThrowsNonDeterministicException() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, // wrong type + Status = OperationStatuses.Succeeded, + Name = "kept_consistent", + StepDetails = new StepDetails { Result = "\"x\"" } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.InvokeAsync(FunctionArn, "x", name: 
"kept_consistent")); + + Assert.Contains("expected type 'CHAINED_INVOKE'", ex.Message); + Assert.Contains("found 'STEP'", ex.Message); + } + + #endregion + + #region Serialization + + [Fact] + public async Task InvokeAsync_DeserializesResultViaRegisteredSerializer() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Succeeded, + Name = "json_result", + ChainedInvokeDetails = new ChainedInvokeDetails + { + Result = "{\"OrderId\":\"o-7\",\"Total\":1024}" + } + } + } + }); + + var result = await context.InvokeAsync( + FunctionArn, + new RequestPayload { Amount = 1, Currency = "USD" }, + name: "json_result"); + + Assert.Equal("o-7", result.OrderId); + Assert.Equal(1024, result.Total); + } + + #endregion + + #region End-to-end suspension / resume parity + + [Fact] + public async Task EndToEnd_StepInvokeStep_FirstInvocation_SuspendsOnInvoke() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(null); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var batcher = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, batcher.Batcher); + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + await context.StepAsync(async (_) => { await Task.CompletedTask; return "validated"; }, name: "validate"); + var paymentId = await context.InvokeAsync( + FunctionArn, "validated", name: "process_payment"); + return await context.StepAsync(async (_) => { await Task.CompletedTask; return paymentId + "-done"; }, name: "finalize"); + }); + + Assert.Equal(InvocationStatus.Pending, result.Status); + + await batcher.Batcher.DrainAsync(); + 
Assert.Contains(batcher.Flushed, o => o.Type == "CHAINED_INVOKE" && o.Action == "START"); + Assert.DoesNotContain(batcher.Flushed, o => o.Type == "STEP" && o.Name == "finalize"); + } + + [Fact] + public async Task EndToEnd_StepInvokeStep_SecondInvocation_ResumesAndCompletes() + { + var tm = new TerminationManager(); + var state = new ExecutionState(); + state.LoadFromCheckpoint(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "validate", + StepDetails = new StepDetails { Result = "\"validated\"" } + }, + new() + { + Id = IdAt(2), + Type = OperationTypes.ChainedInvoke, + Status = OperationStatuses.Succeeded, + Name = "process_payment", + ChainedInvokeDetails = new ChainedInvokeDetails { Result = "\"pmt-42\"" } + } + } + }); + + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext); + var finalizeRan = false; + + var result = await DurableExecutionHandler.RunAsync( + state, tm, + async () => + { + var validated = await context.StepAsync(async (_) => { await Task.CompletedTask; return "fresh-validated"; }, name: "validate"); + Assert.Equal("validated", validated); // cached + + var paymentId = await context.InvokeAsync( + FunctionArn, validated, name: "process_payment"); + Assert.Equal("pmt-42", paymentId); // cached + + return await context.StepAsync(async (_) => + { + finalizeRan = true; + await Task.CompletedTask; + return paymentId + "-done"; + }, name: "finalize"); + }); + + Assert.Equal(InvocationStatus.Succeeded, result.Status); + Assert.Equal("pmt-42-done", result.Result); + Assert.True(finalizeRan); + } + + #endregion + + #region Test-only types + + private class RequestPayload + { + public int Amount { get; 
set; } + public string? Currency { get; set; } + } + + private class ResponsePayload + { + public string? OrderId { get; set; } + public long Total { get; set; } + } + + #endregion +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs new file mode 100644 index 000000000..ab649f150 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs @@ -0,0 +1,407 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Services; +using Amazon.Lambda.Model; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class LambdaDurableServiceClientTests +{ + [Fact] + public async Task CheckpointAsync_EmptyOperations_NoApiCallReturnsToken() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + var token = await client.CheckpointAsync( + "arn:aws:lambda:us-east-1:123:durable-execution:e1", + "input-token", + Array.Empty()); + + Assert.Equal("input-token", token); + Assert.Empty(mockClient.CheckpointCalls); + } + + [Fact] + public async Task CheckpointAsync_NullCheckpointToken_SendsEmptyString() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + await client.CheckpointAsync( + "arn:aws:lambda:us-east-1:123:durable-execution:e1", + checkpointToken: null, + new[] + { + new OperationUpdate + { + Id = "0-step", + Type = "STEP", + Action = "SUCCEED", + SubType = "Step", + Name = "do_thing", + Payload = "\"ok\"" + } + }); + + var call = Assert.Single(mockClient.CheckpointCalls); + Assert.Equal("", call.CheckpointToken); + } + + [Fact] + public async Task CheckpointAsync_StepWithError_PropagatesError() + { + var mockClient 
= new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + await client.CheckpointAsync( + "arn:aws:lambda:us-east-1:123:durable-execution:e1", + "tok", + new[] + { + new OperationUpdate + { + Id = "0-bad", + Type = "STEP", + Action = "FAIL", + SubType = "Step", + Name = "bad", + Error = new SdkErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "timed out", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new List { "at A.B()", "at C.D()" } + } + } + }); + + var call = Assert.Single(mockClient.CheckpointCalls); + var update = Assert.Single(call.Updates); + Assert.Equal("STEP", update.Type); + Assert.Equal("FAIL", update.Action); + Assert.NotNull(update.Error); + Assert.Equal("System.TimeoutException", update.Error.ErrorType); + Assert.Equal("timed out", update.Error.ErrorMessage); + Assert.Equal("{\"detail\":\"x\"}", update.Error.ErrorData); + Assert.Equal(2, update.Error.StackTrace.Count); + } + + [Fact] + public async Task CheckpointAsync_WaitWithOptions_PropagatesWaitOptions() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + await client.CheckpointAsync( + "arn", + "tok", + new[] + { + new OperationUpdate + { + Id = "0-wait", + Type = "WAIT", + Action = "START", + SubType = "Wait", + Name = "delay", + WaitOptions = new WaitOptions { WaitSeconds = 45 } + } + }); + + var update = mockClient.CheckpointCalls[0].Updates[0]; + Assert.NotNull(update.WaitOptions); + Assert.Equal(45, update.WaitOptions.WaitSeconds); + } + + [Fact] + public async Task CheckpointAsync_ParentIdAndPayload_ArePropagated() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + await client.CheckpointAsync( + "arn", + "tok", + new[] + { + new OperationUpdate + { + Id = "child-1", + ParentId = "parent-0", + Type = "STEP", + Action = "SUCCEED", + SubType = "Step", + Payload = "{\"a\":1}" + } + }); + + var update = 
mockClient.CheckpointCalls[0].Updates[0]; + Assert.Equal("parent-0", update.ParentId); + Assert.Equal("{\"a\":1}", update.Payload); + } + + [Fact] + public async Task CheckpointAsync_MultipleUpdates_AllForwarded() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + await client.CheckpointAsync( + "arn", + "tok", + new[] + { + new OperationUpdate + { + Id = "0-step", + Type = "STEP", + Action = "SUCCEED", + SubType = "Step", + Name = "validate" + }, + new OperationUpdate + { + Id = "1-wait", + Type = "WAIT", + Action = "START", + SubType = "Wait", + Name = "delay", + WaitOptions = new WaitOptions { WaitSeconds = 30 } + } + }); + + var call = Assert.Single(mockClient.CheckpointCalls); + Assert.Equal(2, call.Updates.Count); + Assert.Equal("STEP", call.Updates[0].Type); + Assert.Equal("WAIT", call.Updates[1].Type); + } + + [Fact] + public async Task GetExecutionStateAsync_CopiesContextDetailsResultAndError() + { + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = _ => new GetDurableExecutionStateResponse + { + Operations = new List + { + new Amazon.Lambda.Model.Operation + { + Id = "ctx-1", + Type = "CONTEXT", + Status = "SUCCEEDED", + Name = "phase", + ContextDetails = new Amazon.Lambda.Model.ContextDetails + { + Result = "\"ok\"" + } + }, + new Amazon.Lambda.Model.Operation + { + Id = "ctx-2", + Type = "CONTEXT", + Status = "FAILED", + Name = "phase2", + ContextDetails = new Amazon.Lambda.Model.ContextDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "boom", + ErrorData = "{\"detail\":\"x\"}", + StackTrace = new List { "at A.B()", "at C.D()" } + } + } + } + } + } + }; + var client = new LambdaDurableServiceClient(mockClient); + + var (operations, _) = await client.GetExecutionStateAsync("arn", "tok", "marker"); + + Assert.Equal(2, operations.Count); + + Assert.NotNull(operations[0].ContextDetails); + Assert.Equal("\"ok\"", 
operations[0].ContextDetails!.Result); + Assert.Null(operations[0].ContextDetails!.Error); + + Assert.NotNull(operations[1].ContextDetails); + Assert.NotNull(operations[1].ContextDetails!.Error); + Assert.Equal("System.InvalidOperationException", operations[1].ContextDetails!.Error!.ErrorType); + Assert.Equal("boom", operations[1].ContextDetails!.Error!.ErrorMessage); + Assert.Equal("{\"detail\":\"x\"}", operations[1].ContextDetails!.Error!.ErrorData); + Assert.Equal(new[] { "at A.B()", "at C.D()" }, operations[1].ContextDetails!.Error!.StackTrace); + } + + [Fact] + public async Task GetExecutionStateAsync_CopiesStepDetailsErrorStackTraceAndErrorData() + { + // Round-trip safety: the SDK returns ErrorObject with all four fields, + // and Internal.Operation must preserve them so StepException can surface + // OriginalStackTrace / ErrorData on replay. + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = _ => new GetDurableExecutionStateResponse + { + Operations = new List + { + new Amazon.Lambda.Model.Operation + { + Id = "step-1", + Type = "STEP", + Status = "FAILED", + Name = "charge", + StepDetails = new Amazon.Lambda.Model.StepDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "timed out", + ErrorData = "{\"detail\":\"y\"}", + StackTrace = new List { "at E.F()", "at G.H()" } + } + } + } + } + } + }; + var client = new LambdaDurableServiceClient(mockClient); + + var (operations, _) = await client.GetExecutionStateAsync("arn", "tok", "marker"); + + var op = Assert.Single(operations); + Assert.NotNull(op.StepDetails); + Assert.NotNull(op.StepDetails!.Error); + Assert.Equal("System.TimeoutException", op.StepDetails!.Error!.ErrorType); + Assert.Equal("timed out", op.StepDetails!.Error!.ErrorMessage); + Assert.Equal("{\"detail\":\"y\"}", op.StepDetails!.Error!.ErrorData); + Assert.Equal(new[] { "at E.F()", "at G.H()" }, op.StepDetails!.Error!.StackTrace); + } + + [Fact] + public async Task 
GetExecutionStateAsync_MapFromSdkOperation_RoundTripsAllErrorFields() + { + // Pre-existing bug guard: MapFromSdkOperation used to drop ErrorData + // and StackTrace from the SDK error object, so the durable exception + // builders (StepException, ChildContextException, and the + // InvokeException tree) always saw nulls for those fields on + // real-service replay. This test pins down the fix for all three + // operation types that carry an error. + var stack = new List { "at Frame.One()", "at Frame.Two()" }; + + var mockClient = new MockLambdaClient + { + GetExecutionStateHandler = _ => new GetDurableExecutionStateResponse + { + Operations = new List + { + new Amazon.Lambda.Model.Operation + { + Id = "step-1", + Type = "STEP", + Status = "FAILED", + StepDetails = new Amazon.Lambda.Model.StepDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "step blew up", + ErrorData = "{\"detail\":\"step\"}", + StackTrace = stack + } + } + }, + new Amazon.Lambda.Model.Operation + { + Id = "ctx-1", + Type = "CONTEXT", + Status = "FAILED", + ContextDetails = new Amazon.Lambda.Model.ContextDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.ArgumentException", + ErrorMessage = "ctx blew up", + ErrorData = "{\"detail\":\"ctx\"}", + StackTrace = stack + } + } + }, + new Amazon.Lambda.Model.Operation + { + Id = "inv-1", + Type = "CHAINED_INVOKE", + Status = "FAILED", + ChainedInvokeDetails = new Amazon.Lambda.Model.ChainedInvokeDetails + { + Error = new SdkErrorObject + { + ErrorType = "System.TimeoutException", + ErrorMessage = "invoke blew up", + ErrorData = "{\"detail\":\"invoke\"}", + StackTrace = stack + } + } + } + } + } + }; + var client = new LambdaDurableServiceClient(mockClient); + + var (operations, _) = await client.GetExecutionStateAsync("arn", "tok", "marker"); + + Assert.Equal(3, operations.Count); + + // STEP — all four fields propagate. 
+ var stepError = operations[0].StepDetails!.Error!; + Assert.Equal("System.InvalidOperationException", stepError.ErrorType); + Assert.Equal("step blew up", stepError.ErrorMessage); + Assert.Equal("{\"detail\":\"step\"}", stepError.ErrorData); + Assert.NotNull(stepError.StackTrace); + Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, stepError.StackTrace!); + + // CHILD CONTEXT — all four fields propagate. + var ctxError = operations[1].ContextDetails!.Error!; + Assert.Equal("System.ArgumentException", ctxError.ErrorType); + Assert.Equal("ctx blew up", ctxError.ErrorMessage); + Assert.Equal("{\"detail\":\"ctx\"}", ctxError.ErrorData); + Assert.NotNull(ctxError.StackTrace); + Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, ctxError.StackTrace!); + + // CHAINED_INVOKE — all four fields propagate. + var invError = operations[2].ChainedInvokeDetails!.Error!; + Assert.Equal("System.TimeoutException", invError.ErrorType); + Assert.Equal("invoke blew up", invError.ErrorMessage); + Assert.Equal("{\"detail\":\"invoke\"}", invError.ErrorData); + Assert.NotNull(invError.StackTrace); + Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, invError.StackTrace!); + } + + [Fact] + public async Task CheckpointAsync_ReturnsNewToken() + { + var mockClient = new MockLambdaClient(); + var client = new LambdaDurableServiceClient(mockClient); + + var newToken = await client.CheckpointAsync( + "arn", + "old-token", + new[] + { + new OperationUpdate + { + Id = "0-x", + Type = "STEP", + Action = "SUCCEED" + } + }); + + // MockLambdaClient returns "token-1", "token-2", etc. + Assert.Equal("token-1", newToken); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MockLambdaClient.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MockLambdaClient.cs new file mode 100644 index 000000000..9739b2907 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MockLambdaClient.cs @@ -0,0 +1,85 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using Amazon.Lambda;
+using Amazon.Lambda.Model;
+using Amazon.Runtime;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+/// <summary>
+/// A mock that subclasses AmazonLambdaClient and overrides CheckpointDurableExecutionAsync and
+/// GetDurableExecutionStateAsync to avoid real API calls. Records every request to both for test assertions.
+/// </summary>
+internal class MockLambdaClient : AmazonLambdaClient
+{
+    public List<CheckpointDurableExecutionRequest> CheckpointCalls { get; } = new(); // every checkpoint request received, in call order
+    public List<GetDurableExecutionStateRequest> GetExecutionStateCalls { get; } = new(); // every state-fetch request received, in call order
+
+    /// <summary>
+    /// Optional handler for <see cref="GetDurableExecutionStateAsync"/> calls. Tests
+    /// that exercise the paginated-state path can set this to control the response
+    /// for each page.
+    /// </summary>
+    public Func<GetDurableExecutionStateRequest, GetDurableExecutionStateResponse>? GetExecutionStateHandler { get; set; }
+
+    private int _tokenCounter; // pre-incremented for each generated token, so the first one is "token-1"
+
+    public MockLambdaClient() : base("fake-access-key", "fake-secret-key", Amazon.RegionEndpoint.USEast1) { } // fake credentials: the overrides below never call into the base client
+
+    /// <summary>
+    /// Optional exception thrown by <see cref="CheckpointDurableExecutionAsync"/>. Tests
+    /// that exercise checkpoint-error classification can set this to inject a specific
+    /// SDK exception on the orchestration-path drain.
+    /// </summary>
+    public Exception? CheckpointThrows { get; set; }
+
+    /// <summary>
+    /// Optional exception thrown by <see cref="GetDurableExecutionStateAsync"/>. Tests
+    /// that exercise hydration-error classification can set this to inject a specific
+    /// SDK exception on the initial state-fetch path.
+    /// </summary>
+    public Exception? GetExecutionStateThrows { get; set; }
+
+    /// <summary>
+    /// Optional handler that produces a <see cref="CheckpointDurableExecutionResponse"/>
+    /// per request. Tests modeling the durable-execution service's
+    /// NewExecutionState response (e.g. stamping a CallbackId onto a
+    /// freshly-started CALLBACK op) wire this up. When null, a default
+    /// response is produced with only the auto-incremented checkpoint token.
+    /// </summary>
+    public Func<CheckpointDurableExecutionRequest, CheckpointDurableExecutionResponse>? CheckpointHandler { get; set; }
+
+    public override Task<CheckpointDurableExecutionResponse> CheckpointDurableExecutionAsync(
+        CheckpointDurableExecutionRequest request,
+        CancellationToken cancellationToken = default)
+    {
+        CheckpointCalls.Add(request); // recorded before any injected failure, so failing calls are still visible to tests
+        if (CheckpointThrows != null) throw CheckpointThrows; // thrown synchronously from the call, not surfaced as a faulted Task
+        if (CheckpointHandler != null)
+        {
+            var resp = CheckpointHandler(request);
+            // Auto-fill token if the test left it blank.
+            if (string.IsNullOrEmpty(resp.CheckpointToken))
+                resp.CheckpointToken = $"token-{++_tokenCounter}";
+            return Task.FromResult(resp);
+        }
+        return Task.FromResult(new CheckpointDurableExecutionResponse
+        {
+            CheckpointToken = $"token-{++_tokenCounter}"
+        });
+    }
+
+    public override Task<GetDurableExecutionStateResponse> GetDurableExecutionStateAsync(
+        GetDurableExecutionStateRequest request,
+        CancellationToken cancellationToken = default)
+    {
+        GetExecutionStateCalls.Add(request); // recorded before any injected failure, so failing calls are still visible to tests
+        if (GetExecutionStateThrows != null) throw GetExecutionStateThrows; // thrown synchronously from the call, not surfaced as a faulted Task
+        if (GetExecutionStateHandler != null)
+        {
+            return Task.FromResult(GetExecutionStateHandler(request));
+        }
+        return Task.FromResult(new GetDurableExecutionStateResponse());
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ModelsTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ModelsTests.cs
new file mode 100644
index 000000000..4c9aaeba4
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ModelsTests.cs
@@ -0,0 +1,295 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Text.Json;
+using Amazon.Lambda.DurableExecution;
+using Amazon.Lambda.DurableExecution.Internal;
+using Xunit;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+public class ModelsTests // Tests for the wire models: Operation, ErrorObject, and the invocation input/output envelopes.
+{
+    [Fact]
+    public void Operation_PropertiesAssignable()
+    {
+        var op = new Operation
+        {
+            Id = "op-1",
+            Type = OperationTypes.Step,
+            Status = OperationStatuses.Succeeded,
+            Name = "fetch_user",
+            StepDetails = new StepDetails { Result = "{\"name\":\"Alice\"}" }
+        };
+
+        Assert.Equal("op-1", op.Id);
+        Assert.Equal(OperationTypes.Step, op.Type);
+        Assert.Equal(OperationStatuses.Succeeded, op.Status);
+        Assert.Equal("fetch_user", op.Name);
+        Assert.Equal("{\"name\":\"Alice\"}", op.StepDetails?.Result);
+    }
+
+    [Fact]
+    public void Operation_WaitWithScheduledEndTimestamp()
+    {
+        var op = new Operation
+        {
+            Id = "op-2",
+            Type = OperationTypes.Wait,
+            Status = OperationStatuses.Pending,
+            Name = "cooldown",
+            WaitDetails = new WaitDetails
+            {
+                ScheduledEndTimestamp = 1767268830000L // 2026-01-01T12:00:30Z in ms
+            }
+        };
+
+        Assert.Equal(OperationTypes.Wait, op.Type);
+        Assert.Equal(1767268830000L, op.WaitDetails?.ScheduledEndTimestamp);
+    }
+
+    [Fact]
+    public void ErrorObject_FromException()
+    {
+        var ex = new InvalidOperationException("something went wrong");
+        var error = ErrorObject.FromException(ex);
+
+        Assert.Equal("System.InvalidOperationException", error.ErrorType);
+        Assert.Equal("something went wrong", error.ErrorMessage);
+    }
+
+    [Fact]
+    public void ErrorObject_FromException_UnwrapsStepException()
+    {
+        // A failing user step gets wrapped as StepException carrying the original
+        // ErrorType. Recording the wrapper's type would lose the user-facing
+        // exception identity across a chained-invoke boundary, so FromException
+        // pulls the original error fields through.
+        var ex = new StepException("intentional child failure")
+        {
+            ErrorType = "System.InvalidOperationException",
+            ErrorData = "{\"hint\":\"data\"}",
+            OriginalStackTrace = new[] { "at User.Workflow.Body()" }
+        };
+
+        var error = ErrorObject.FromException(ex);
+
+        Assert.Equal("System.InvalidOperationException", error.ErrorType);
+        Assert.Equal("intentional child failure", error.ErrorMessage);
+        Assert.Equal("{\"hint\":\"data\"}", error.ErrorData);
+        Assert.Equal(new[] { "at User.Workflow.Body()" }, error.StackTrace);
+    }
+
+    [Fact]
+    public void ErrorObject_FromException_UnwrapsChildContextException()
+    {
+        var ex = new ChildContextException("child failed")
+        {
+            ErrorType = "System.ArgumentException",
+            ErrorData = "{\"k\":\"v\"}",
+            OriginalStackTrace = new[] { "at Inner()" }
+        };
+
+        var error = ErrorObject.FromException(ex);
+
+        Assert.Equal("System.ArgumentException", error.ErrorType);
+        Assert.Equal("child failed", error.ErrorMessage);
+        Assert.Equal("{\"k\":\"v\"}", error.ErrorData);
+    }
+
+    [Fact]
+    public void ErrorObject_FromException_UnwrapsInvokeException()
+    {
+        var ex = new InvokeFailedException("downstream failed")
+        {
+            FunctionName = "arn:aws:lambda:...:function:downstream",
+            ErrorType = "System.TimeoutException",
+            ErrorData = "{\"region\":\"us-east-1\"}",
+            OriginalStackTrace = new[] { "at Downstream.Run()" }
+        };
+
+        var error = ErrorObject.FromException(ex);
+
+        Assert.Equal("System.TimeoutException", error.ErrorType);
+        Assert.Equal("downstream failed", error.ErrorMessage);
+        Assert.Equal("{\"region\":\"us-east-1\"}", error.ErrorData);
+    }
+
+    [Fact]
+    public void ErrorObject_FromException_UnwrapsCallbackException()
+    {
+        var ex = new CallbackFailedException("callback failed")
+        {
+            CallbackId = "cb-123",
+            ErrorType = "Acme.Errors.PaymentDeclined",
+            ErrorData = "{\"code\":42}",
+            OriginalStackTrace = new[] { "at External.Reject()" }
+        };
+
+        var error = ErrorObject.FromException(ex);
+
+        Assert.Equal("Acme.Errors.PaymentDeclined", error.ErrorType);
+        Assert.Equal("callback failed", error.ErrorMessage);
+        Assert.Equal("{\"code\":42}", error.ErrorData);
+    }
+
+    [Fact]
+    public void ErrorObject_FromException_UnwrapsStepException_WithNullErrorType()
+    {
+        // StepException without an explicit ErrorType (e.g., constructed by code
+        // that didn't set the init-only property) records null rather than
+        // falling back to the wrapper's type — the wrapper type is never useful.
+        var ex = new StepException("no type set");
+
+        var error = ErrorObject.FromException(ex);
+
+        Assert.Null(error.ErrorType);
+        Assert.Equal("no type set", error.ErrorMessage);
+    }
+
+    [Fact]
+    public void ErrorObject_RoundTripSerialization()
+    {
+        var error = new ErrorObject
+        {
+            ErrorType = "System.TimeoutException",
+            ErrorMessage = "timed out",
+            StackTrace = new[] { "at Foo.Bar()", "at Baz.Qux()" },
+            ErrorData = "{\"key\":\"value\"}"
+        };
+
+        var json = JsonSerializer.Serialize(error);
+        var deserialized = JsonSerializer.Deserialize<ErrorObject>(json)!; // Target type must be explicit; it cannot be inferred from a string argument.
+
+        Assert.Equal("System.TimeoutException", deserialized.ErrorType);
+        Assert.Equal("timed out", deserialized.ErrorMessage);
+        Assert.Equal(2, deserialized.StackTrace!.Count);
+        Assert.Equal("{\"key\":\"value\"}", deserialized.ErrorData);
+    }
+
+    [Fact]
+    public void DurableExecutionInvocationInput_Deserialization()
+    {
+        var json = """
+            {
+                "DurableExecutionArn": "arn:aws:lambda:us-east-1:123:durable-execution:abc",
+                "CheckpointToken": "token-1",
+                "InitialExecutionState": {
+                    "Operations": [
+                        {
+                            "Id": "exec-1",
+                            "Type": "EXECUTION",
+                            "Status": "STARTED",
+                            "ExecutionDetails": {
+                                "InputPayload": "{\"orderId\":\"order-123\",\"amount\":99.99}"
+                            }
+                        },
+                        {
+                            "Id": "op-1",
+                            "Type": "STEP",
+                            "Status": "SUCCEEDED",
+                            "Name": "validate",
+                            "StepDetails": {
+                                "Result": "true"
+                            }
+                        }
+                    ]
+                }
+            }
+            """;
+
+        var input = JsonSerializer.Deserialize<DurableExecutionInvocationInput>(json)!;
+
+        Assert.Equal("arn:aws:lambda:us-east-1:123:durable-execution:abc", input.DurableExecutionArn);
+        Assert.Equal("token-1", input.CheckpointToken);
+        Assert.NotNull(input.InitialExecutionState);
+        Assert.Equal(2, input.InitialExecutionState!.Operations!.Count);
+
+        var stepOp = input.InitialExecutionState.Operations![1];
+        Assert.Equal("op-1", stepOp.Id);
+        Assert.Equal(OperationTypes.Step, stepOp.Type);
+        Assert.Equal("true", stepOp.StepDetails?.Result);
+
+        // The EXECUTION operation carries the user payload in ExecutionDetails.InputPayload.
+        var execOp = input.InitialExecutionState.Operations[0];
+        Assert.Equal(OperationTypes.Execution, execOp.Type);
+        var payload = JsonSerializer.Deserialize<TestOrderEvent>(execOp.ExecutionDetails!.InputPayload!);
+        Assert.Equal("order-123", payload!.OrderId);
+        Assert.Equal(99.99m, payload.Amount);
+    }
+
+    [Fact]
+    public void DurableExecutionInvocationInput_NoExecutionOp_HasNullPayload()
+    {
+        var input = new DurableExecutionInvocationInput
+        {
+            DurableExecutionArn = "arn:test"
+        };
+
+        // No InitialExecutionState means no EXECUTION operation and thus no user payload
+        Assert.Null(input.InitialExecutionState);
+    }
+
+    [Fact]
+    public void DurableExecutionInvocationOutput_Succeeded()
+    {
+        var output = new DurableExecutionInvocationOutput
+        {
+            Status = InvocationStatus.Succeeded,
+            Result = "{\"status\":\"approved\"}"
+        };
+
+        var json = JsonSerializer.Serialize(output);
+        var deserialized = JsonSerializer.Deserialize<DurableExecutionInvocationOutput>(json)!;
+
+        Assert.Equal(InvocationStatus.Succeeded, deserialized.Status);
+        Assert.Equal("{\"status\":\"approved\"}", deserialized.Result);
+    }
+
+    [Fact]
+    public void DurableExecutionInvocationOutput_Failed()
+    {
+        var output = new DurableExecutionInvocationOutput
+        {
+            Status = InvocationStatus.Failed,
+            Error = new ErrorObject
+            {
+                ErrorMessage = "step failed",
+                ErrorType = "StepException"
+            }
+        };
+
+        var json = JsonSerializer.Serialize(output);
+        var deserialized = JsonSerializer.Deserialize<DurableExecutionInvocationOutput>(json)!;
+
+        Assert.Equal(InvocationStatus.Failed, deserialized.Status);
+        Assert.NotNull(deserialized.Error);
+        Assert.Equal("step failed", deserialized.Error!.ErrorMessage);
+        Assert.Equal("StepException", deserialized.Error.ErrorType);
+    }
+
+    [Fact]
+    public void DurableExecutionInvocationOutput_Pending()
+    {
+        var output = new DurableExecutionInvocationOutput
+        {
+            Status = InvocationStatus.Pending
+        };
+
+        var json = JsonSerializer.Serialize(output);
+        var deserialized = JsonSerializer.Deserialize<DurableExecutionInvocationOutput>(json)!;
+
+        Assert.Equal(InvocationStatus.Pending, deserialized.Status);
+        Assert.Null(deserialized.Result);
+        Assert.Null(deserialized.Error);
+    }
+
+    private class TestOrderEvent // Shape of the user payload embedded in ExecutionDetails.InputPayload for these tests.
+    {
+        [System.Text.Json.Serialization.JsonPropertyName("orderId")]
+        public string? OrderId { get; set; }
+
+        [System.Text.Json.Serialization.JsonPropertyName("amount")]
+        public decimal Amount { get; set; }
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/OperationIdGeneratorTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/OperationIdGeneratorTests.cs
new file mode 100644
index 000000000..2c4d4ce90
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/OperationIdGeneratorTests.cs
@@ -0,0 +1,126 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Security.Cryptography;
+using System.Text;
+using Amazon.Lambda.DurableExecution.Internal;
+using Xunit;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+public class OperationIdGeneratorTests // Checks OperationIdGenerator output against an independently computed SHA-256 hex digest.
+{
+    private static string Sha256Hex(string input) // Reference digest: lowercase hex SHA-256 of the UTF-8 bytes of input.
+    {
+        using var sha = SHA256.Create();
+        var bytes = sha.ComputeHash(Encoding.UTF8.GetBytes(input));
+        var sb = new StringBuilder(bytes.Length * 2);
+        foreach (var b in bytes) sb.Append(b.ToString("x2"));
+        return sb.ToString();
+    }
+
+    [Fact]
+    public void NextId_ProducesSha256OfPositionString_StartingAtOne()
+    {
+        var gen = new OperationIdGenerator();
+        Assert.Equal(Sha256Hex("1"), gen.NextId());
+        Assert.Equal(Sha256Hex("2"), gen.NextId());
+        Assert.Equal(Sha256Hex("3"), gen.NextId());
+    }
+
+    [Fact]
+    public void HashOperationId_IsStable()
+    {
+        Assert.Equal(Sha256Hex("hello"), OperationIdGenerator.HashOperationId("hello"));
+        Assert.Equal(Sha256Hex("1"), OperationIdGenerator.HashOperationId("1"));
+    }
+
+    [Fact]
+    public void ChildGenerator_PrefixesPositionWithParentHash()
+    {
+        var gen = new OperationIdGenerator();
+        var parentId = gen.NextId();
+        var child = gen.CreateChild(parentId);
+
+        Assert.Equal(Sha256Hex(parentId + "-1"), child.NextId());
+        Assert.Equal(Sha256Hex(parentId + "-2"), child.NextId());
+    }
+
+    [Fact]
+    public void ChildGenerator_ParentIdProperty()
+    {
+        var gen = new OperationIdGenerator();
+        Assert.Null(gen.ParentId);
+
+        var child = new OperationIdGenerator("op-5");
+        Assert.Equal("op-5", child.ParentId);
+    }
+
+    [Fact]
+    public void MultipleChildren_IndependentCounters()
+    {
+        var child1 = new OperationIdGenerator("parent-1");
+        var child2 = new OperationIdGenerator("parent-2");
+
+        Assert.Equal(Sha256Hex("parent-1-1"), child1.NextId());
+        Assert.Equal(Sha256Hex("parent-2-1"), child2.NextId());
+        Assert.Equal(Sha256Hex("parent-1-2"), child1.NextId());
+        Assert.Equal(Sha256Hex("parent-2-2"), child2.NextId());
+    }
+
+    [Fact]
+    public void Deterministic_SameSequenceOnReplay()
+    {
+        var gen1 = new OperationIdGenerator();
+        var ids1 = new[] { gen1.NextId(), gen1.NextId(), gen1.NextId() };
+
+        var gen2 = new OperationIdGenerator();
+        var ids2 = new[] { gen2.NextId(), gen2.NextId(), gen2.NextId() };
+
+        Assert.Equal(ids1, ids2);
+    }
+
+    [Fact]
+    public void Reset_RewindsCounter()
+    {
+        var gen = new OperationIdGenerator();
+        gen.NextId();
+        gen.NextId();
+        gen.Reset();
+        Assert.Equal(Sha256Hex("1"), gen.NextId());
+    }
+
+    [Fact]
+    public async Task NextId_ConcurrentCallers_ProduceUniqueIds()
+    {
+        // Without Interlocked.Increment, two threads racing on ++_counter can
+        // both observe the same pre-increment value and emit duplicate IDs,
+        // silently breaking replay determinism. Drive enough contention to
+        // catch a regression: many parallel callers, each making many calls.
+        const int threads = 16;
+        const int idsPerThread = 500;
+        const int total = threads * idsPerThread;
+
+        var gen = new OperationIdGenerator();
+        var allIds = new string[total];
+        using var start = new ManualResetEventSlim(false); // Start gate; disposed at method exit, after all workers are awaited.
+
+        var tasks = Enumerable.Range(0, threads).Select(t => Task.Run(() =>
+        {
+            start.Wait();
+            for (var i = 0; i < idsPerThread; i++)
+            {
+                allIds[t * idsPerThread + i] = gen.NextId();
+            }
+        })).ToArray();
+
+        start.Set();
+        await Task.WhenAll(tasks);
+
+        Assert.Equal(total, allIds.Distinct().Count());
+
+        // Counter advanced exactly `total` times — the next ID must be the hash of the decimal string for (total + 1).
+        Assert.Equal(Sha256Hex((total + 1).ToString(System.Globalization.CultureInfo.InvariantCulture)),
+            gen.NextId());
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RecordingBatcher.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RecordingBatcher.cs
new file mode 100644
index 000000000..992ebdb22
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RecordingBatcher.cs
@@ -0,0 +1,64 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using Amazon.Lambda.DurableExecution.Internal;
+using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+/// <summary>
+/// Test helper: a <see cref="CheckpointBatcher"/> that records every flushed
+/// update without making any network calls. Tests construct one of these in
+/// place of a real batcher to inspect what would have been sent to the service.
+/// </summary>
+internal sealed class RecordingBatcher
+{
+    private readonly List<SdkOperationUpdate> _flushed = new();
+    private readonly List<int> _flushBatchSizes = new();
+    private readonly object _lock = new();
+
+    public CheckpointBatcher Batcher { get; }
+
+    /// <summary>
+    /// Optional hook invoked synchronously after each batch flush, with that
+    /// batch's updates. Tests modeling the durable-execution service's
+    /// NewExecutionState response (e.g. stamping a CallbackId onto a
+    /// freshly-started CALLBACK op) wire this up to mutate the test's
+    /// <see cref="ExecutionState"/>.
+    /// </summary>
+    public Action<IReadOnlyList<SdkOperationUpdate>>? OnFlush { get; set; }
+
+    public RecordingBatcher(CheckpointBatcherConfig? config = null)
+    {
+        Batcher = new CheckpointBatcher("test-token", Flush, config);
+    }
+
+    /// <summary>
+    /// Cumulative list of every update that has been flushed, in order.
+    /// </summary>
+    public IReadOnlyList<SdkOperationUpdate> Flushed
+    {
+        get { lock (_lock) return _flushed.ToArray(); }
+    }
+
+    /// <summary>
+    /// One entry per batch flushed, recording the batch size. With
+    /// the batcher's flush interval = Zero (default),
+    /// every enqueue produces one batch.
+    /// </summary>
+    public IReadOnlyList<int> FlushBatchSizes // NOTE(review): the two cref targets in the summary above were lost in extraction; wording reconstructed, confirm.
+    {
+        get { lock (_lock) return _flushBatchSizes.ToArray(); }
+    }
+
+    private Task<string?> Flush(string? token, IReadOnlyList<SdkOperationUpdate> ops, CancellationToken ct)
+    {
+        lock (_lock)
+        {
+            _flushed.AddRange(ops);
+            _flushBatchSizes.Add(ops.Count);
+        }
+        OnFlush?.Invoke(ops);
+        return Task.FromResult(token);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RetryStrategyTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RetryStrategyTests.cs
new file mode 100644
index 000000000..f226ea079
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/RetryStrategyTests.cs
@@ -0,0 +1,205 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using Amazon.Lambda.DurableExecution;
+using Xunit;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+public class RetryStrategyTests // Covers the built-in RetryStrategy presets, Exponential(...) options, and FromDelegate.
+{
+    [Fact]
+    public void ExponentialDefault_RetriesUpToMaxAttempts()
+    {
+        var strategy = RetryStrategy.Default;
+
+        // Attempts 1-5 should retry (maxAttempts=6 means 6 total attempts)
+        for (int i = 1; i < 6; i++)
+        {
+            var decision = strategy.ShouldRetry(new InvalidOperationException("fail"), i);
+            Assert.True(decision.ShouldRetry);
+            Assert.True(decision.Delay >= TimeSpan.FromSeconds(1));
+        }
+
+        // Attempt 6 should not retry (exhausted)
+        var lastDecision = strategy.ShouldRetry(new InvalidOperationException("fail"), 6);
+        Assert.False(lastDecision.ShouldRetry);
+    }
+
+    [Fact]
+    public void None_NeverRetries()
+    {
+        var strategy = RetryStrategy.None;
+
+        var decision = strategy.ShouldRetry(new Exception("fail"), 1);
+        Assert.False(decision.ShouldRetry);
+    }
+
+    [Fact]
+    public void Transient_RetriesUpTo3Attempts()
+    {
+        var strategy = RetryStrategy.Transient;
+
+        Assert.True(strategy.ShouldRetry(new Exception("fail"), 1).ShouldRetry);
+        Assert.True(strategy.ShouldRetry(new Exception("fail"), 2).ShouldRetry);
+        Assert.False(strategy.ShouldRetry(new Exception("fail"), 3).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_DelayIncreases()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 5,
+            initialDelay: TimeSpan.FromSeconds(2),
+            maxDelay: TimeSpan.FromSeconds(120),
+            backoffRate: 2.0,
+            jitter: JitterStrategy.None);
+
+        var d1 = strategy.ShouldRetry(new Exception(), 1).Delay;
+        var d2 = strategy.ShouldRetry(new Exception(), 2).Delay;
+        var d3 = strategy.ShouldRetry(new Exception(), 3).Delay;
+
+        // With no jitter: 2s, 4s, 8s (ceiling to whole seconds)
+        Assert.Equal(TimeSpan.FromSeconds(2), d1);
+        Assert.Equal(TimeSpan.FromSeconds(4), d2);
+        Assert.Equal(TimeSpan.FromSeconds(8), d3);
+    }
+
+    [Fact]
+    public void Exponential_DelayCapsAtMax()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 10,
+            initialDelay: TimeSpan.FromSeconds(10),
+            maxDelay: TimeSpan.FromSeconds(30),
+            backoffRate: 3.0,
+            jitter: JitterStrategy.None);
+
+        // Attempt 3: 10 * 3^2 = 90, capped to 30
+        var decision = strategy.ShouldRetry(new Exception(), 3);
+        Assert.Equal(TimeSpan.FromSeconds(30), decision.Delay);
+    }
+
+    [Fact]
+    public void Exponential_FullJitter_BoundedByDelay()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 5,
+            initialDelay: TimeSpan.FromSeconds(10),
+            maxDelay: TimeSpan.FromSeconds(100),
+            backoffRate: 2.0,
+            jitter: JitterStrategy.Full);
+
+        // Run multiple times to check bounds
+        for (int i = 0; i < 50; i++)
+        {
+            var decision = strategy.ShouldRetry(new Exception(), 1);
+            Assert.True(decision.Delay >= TimeSpan.FromSeconds(1));
+            Assert.True(decision.Delay <= TimeSpan.FromSeconds(10));
+        }
+    }
+
+    [Fact]
+    public void Exponential_HalfJitter_BoundedBetween50And100Percent()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 5,
+            initialDelay: TimeSpan.FromSeconds(10),
+            maxDelay: TimeSpan.FromSeconds(100),
+            backoffRate: 2.0,
+            jitter: JitterStrategy.Half);
+
+        for (int i = 0; i < 50; i++)
+        {
+            var decision = strategy.ShouldRetry(new Exception(), 1);
+            Assert.True(decision.Delay >= TimeSpan.FromSeconds(5));
+            Assert.True(decision.Delay <= TimeSpan.FromSeconds(10));
+        }
+    }
+
+    [Fact]
+    public void Exponential_RetryableExceptions_FiltersCorrectly()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableExceptions: new[] { typeof(TimeoutException), typeof(HttpRequestException) });
+
+        Assert.True(strategy.ShouldRetry(new TimeoutException(), 1).ShouldRetry);
+        Assert.True(strategy.ShouldRetry(new HttpRequestException(), 1).ShouldRetry);
+        Assert.False(strategy.ShouldRetry(new InvalidOperationException(), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_RetryableExceptions_MatchesDerivedTypes()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableExceptions: new[] { typeof(IOException) });
+
+        Assert.True(strategy.ShouldRetry(new FileNotFoundException(), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_MessagePatterns_FiltersCorrectly()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableMessagePatterns: new[] { "timeout", "throttl", "5\\d{2}" });
+
+        Assert.True(strategy.ShouldRetry(new Exception("connection timeout"), 1).ShouldRetry);
+        Assert.True(strategy.ShouldRetry(new Exception("request throttled"), 1).ShouldRetry);
+        Assert.True(strategy.ShouldRetry(new Exception("HTTP 503"), 1).ShouldRetry);
+        Assert.False(strategy.ShouldRetry(new Exception("not found"), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_BothFilters_EitherMatches()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            retryableExceptions: new[] { typeof(TimeoutException) },
+            retryableMessagePatterns: new[] { "throttl" });
+
+        // Matches exception type
+        Assert.True(strategy.ShouldRetry(new TimeoutException("any message"), 1).ShouldRetry);
+        // Matches message pattern
+        Assert.True(strategy.ShouldRetry(new Exception("throttled"), 1).ShouldRetry);
+        // Matches neither
+        Assert.False(strategy.ShouldRetry(new InvalidOperationException("bad state"), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_NoFilters_RetriesAllExceptions()
+    {
+        var strategy = RetryStrategy.Exponential(maxAttempts: 3);
+
+        Assert.True(strategy.ShouldRetry(new Exception("anything"), 1).ShouldRetry);
+        Assert.True(strategy.ShouldRetry(new InvalidOperationException(), 1).ShouldRetry);
+        Assert.True(strategy.ShouldRetry(new OutOfMemoryException(), 1).ShouldRetry);
+    }
+
+    [Fact]
+    public void Exponential_MinimumDelayIsOneSecond()
+    {
+        var strategy = RetryStrategy.Exponential(
+            maxAttempts: 3,
+            initialDelay: TimeSpan.FromMilliseconds(100),
+            jitter: JitterStrategy.None);
+
+        var decision = strategy.ShouldRetry(new Exception(), 1);
+        Assert.True(decision.Delay >= TimeSpan.FromSeconds(1));
+    }
+
+    [Fact]
+    public void FromDelegate_UsesProvidedFunction()
+    {
+        var strategy = RetryStrategy.FromDelegate((ex, attempt) =>
+            attempt < 2 && ex is TimeoutException
+                ? RetryDecision.RetryAfter(TimeSpan.FromSeconds(5))
+                : RetryDecision.DoNotRetry());
+
+        Assert.True(strategy.ShouldRetry(new TimeoutException(), 1).ShouldRetry);
+        Assert.False(strategy.ShouldRetry(new TimeoutException(), 2).ShouldRetry);
+        Assert.False(strategy.ShouldRetry(new Exception(), 1).ShouldRetry);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationManagerTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationManagerTests.cs
new file mode 100644
index 000000000..3c163ccee
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationManagerTests.cs
@@ -0,0 +1,91 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using Amazon.Lambda.DurableExecution.Internal;
+using Xunit;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+public class TerminationManagerTests // First-call-wins behaviour of TerminationManager.Terminate and its TerminationTask.
+{
+    [Fact]
+    public async Task Terminate_ResolvesTerminationTask()
+    {
+        var termination = new TerminationManager();
+        Assert.False(termination.IsTerminated);
+
+        termination.Terminate(TerminationReason.WaitScheduled, "wait pending");
+
+        Assert.True(termination.IsTerminated);
+        var outcome = await termination.TerminationTask;
+        Assert.Equal(TerminationReason.WaitScheduled, outcome.Reason);
+        Assert.Equal("wait pending", outcome.Message);
+    }
+
+    [Fact]
+    public void Terminate_OnlyFirstCallWins()
+    {
+        var termination = new TerminationManager();
+
+        var firstAccepted = termination.Terminate(TerminationReason.WaitScheduled, "first");
+        var secondAccepted = termination.Terminate(TerminationReason.CallbackPending, "second");
+
+        Assert.True(firstAccepted);
+        Assert.False(secondAccepted);
+    }
+
+    [Fact]
+    public async Task Terminate_FirstReasonIsPreserved()
+    {
+        var termination = new TerminationManager();
+
+        termination.Terminate(TerminationReason.CallbackPending, "callback");
+        termination.Terminate(TerminationReason.WaitScheduled, "wait");
+
+        var outcome = await termination.TerminationTask;
+        Assert.Equal(TerminationReason.CallbackPending, outcome.Reason);
+        Assert.Equal("callback", outcome.Message);
+    }
+
+    [Fact]
+    public async Task Terminate_WithException()
+    {
+        var termination = new TerminationManager();
+        var failure = new Exception("checkpoint failed");
+
+        termination.Terminate(TerminationReason.CheckpointFailed, "error", failure);
+
+        var outcome = await termination.TerminationTask;
+        Assert.Equal(TerminationReason.CheckpointFailed, outcome.Reason);
+        Assert.Same(failure, outcome.Exception);
+    }
+
+    [Fact]
+    public async Task TerminationTask_WinsRaceAgainstNeverCompletingTask()
+    {
+        var termination = new TerminationManager();
+        var stalled = new TaskCompletionSource().Task;
+
+        termination.Terminate(TerminationReason.WaitScheduled);
+
+        var finishedFirst = await Task.WhenAny(stalled, termination.TerminationTask);
+        Assert.Same(termination.TerminationTask, finishedFirst);
+    }
+
+    [Fact]
+    public async Task ConcurrentTerminate_OnlyOneSucceeds()
+    {
+        var termination = new TerminationManager();
+        var accepted = new bool[10];
+
+        var callers = Enumerable.Range(0, 10).Select(slot => Task.Run(() =>
+        {
+            accepted[slot] = termination.Terminate(TerminationReason.WaitScheduled, $"caller-{slot}");
+        }));
+
+        await Task.WhenAll(callers);
+
+        Assert.Equal(1, accepted.Count(won => won));
+        Assert.True(termination.IsTerminated);
+    }
+}
diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/UpperSnakeCaseEnumConverterTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/UpperSnakeCaseEnumConverterTests.cs
new file mode 100644
index 000000000..7f7f92412
--- /dev/null
+++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/UpperSnakeCaseEnumConverterTests.cs
@@ -0,0 +1,88 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using Amazon.Lambda.DurableExecution;
+using Amazon.Lambda.DurableExecution.Internal;
+using Xunit;
+
+namespace Amazon.Lambda.DurableExecution.Tests;
+
+///
+/// Direct tests for UpperSnakeCaseEnumConverter via a sample enum, exercising
+/// every branch (Read with multi-word value, Read with single word, Read with
+/// null/unparsable, plus the Write path for outbound serialization).
+/// +public class UpperSnakeCaseEnumConverterTests +{ + public enum Sample + { + None, + FooBar, + BazQuxQuux + } + + public class Holder + { + [JsonConverter(typeof(UpperSnakeCaseEnumConverter))] + public Sample Value { get; set; } + } + + [Theory] + [InlineData("\"FOO_BAR\"", Sample.FooBar)] + [InlineData("\"BAZ_QUX_QUUX\"", Sample.BazQuxQuux)] + [InlineData("\"NONE\"", Sample.None)] + public void Read_UpperSnakeCase_ReturnsExpectedEnum(string json, Sample expected) + { + var holder = JsonSerializer.Deserialize($"{{\"Value\":{json}}}")!; + Assert.Equal(expected, holder.Value); + } + + [Fact] + public void Read_NullValue_ReturnsDefault() + { + var holder = JsonSerializer.Deserialize("{\"Value\":null}")!; + Assert.Equal(Sample.None, holder.Value); + } + + [Fact] + public void Read_CamelCase_ParsesCaseInsensitively() + { + // The converter first tries snake→pascal, then a raw case-insensitive parse. + // A camel-case input like "fooBar" hits the fallback path. + var holder = JsonSerializer.Deserialize("{\"Value\":\"fooBar\"}")!; + Assert.Equal(Sample.FooBar, holder.Value); + } + + [Fact] + public void Read_UnparsableValue_ThrowsJsonException() + { + // Unknown wire values must surface as JsonException rather than + // silently coercing to default(T) — otherwise an unrecognized + // service status would be indistinguishable from the zero value. 
+ Assert.Throws(() => + JsonSerializer.Deserialize("{\"Value\":\"NOT_A_REAL_VALUE\"}")); + } + + [Fact] + public void Write_PascalCase_EmitsUpperSnake() + { + var json = JsonSerializer.Serialize(new Holder { Value = Sample.FooBar }); + Assert.Contains("\"FOO_BAR\"", json); + } + + [Fact] + public void Write_MultiWord_EmitsUpperSnake() + { + var json = JsonSerializer.Serialize(new Holder { Value = Sample.BazQuxQuux }); + Assert.Contains("\"BAZ_QUX_QUUX\"", json); + } + + [Fact] + public void Write_SingleWord_EmitsUpperWithoutUnderscores() + { + var json = JsonSerializer.Serialize(new Holder { Value = Sample.None }); + Assert.Contains("\"NONE\"", json); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForCallbackTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForCallbackTests.cs new file mode 100644 index 000000000..430df41c5 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForCallbackTests.cs @@ -0,0 +1,543 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class WaitForCallbackTests +{ + /// Reproduces the Id that emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// The hashed ID of the n-th child operation under . + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static TestLambdaContext CreateLambdaContext() => +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ new() { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); + var lambdaContext = CreateLambdaContext(); + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + private static void WireServiceCallbackIdAllocation( + RecordingBatcher recorder, ExecutionState state, string callbackId) + { + recorder.OnFlush = ops => + { + foreach (var op in ops) + { + if (op.Type == OperationTypes.Callback && op.Action == "START") + { + state.AddOperations(new[] + { + new Operation + { + Id = op.Id, + Type = OperationTypes.Callback, + Status = OperationStatuses.Started, + Name = op.Name, + CallbackDetails = new CallbackDetails { CallbackId = callbackId } + } + }); + } + } + }; + } + + [Fact] + public async Task WaitForCallbackAsync_FreshExecution_RunsSubmitterAndSuspendsForCallback() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-wait-1"); + + string? receivedCallbackId = null; + var resultTask = context.WaitForCallbackAsync( + async (callbackId, ctx) => + { + receivedCallbackId = callbackId; + Assert.NotNull(ctx.Logger); + await Task.CompletedTask; + }, + name: "approval"); + + // Race the suspended user task against termination — same idiom as the + // production handler. Once Terminate() is called inside the inner + // GetResultAsync, this completes immediately. 
+ var winner = await Task.WhenAny(resultTask, tm.TerminationTask); + Assert.Same(tm.TerminationTask, winner); + + Assert.True(tm.IsTerminated); + Assert.False(resultTask.IsCompleted); + Assert.Equal("cb-wait-1", receivedCallbackId); + + await recorder.Batcher.DrainAsync(); + + var actions = recorder.Flushed.Select(o => $"{o.Type}:{o.Action}:{o.SubType}").ToArray(); + Assert.Equal(new[] + { + $"{OperationTypes.Context}:START:{OperationSubTypes.WaitForCallback}", + $"{OperationTypes.Callback}:START:{OperationSubTypes.Callback}", + $"{OperationTypes.Step}:START:{OperationSubTypes.Step}", + $"{OperationTypes.Step}:SUCCEED:{OperationSubTypes.Step}", + }, actions); + } + + [Fact] + public async Task WaitForCallbackAsync_FreshExecution_KebabSuffixedSubOpNames() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval"); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + var callbackStart = recorder.Flushed.Single(o => o.Type == OperationTypes.Callback); + var stepSucceed = recorder.Flushed.Single(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED"); + + Assert.Equal("approval-callback", callbackStart.Name); + Assert.Equal("approval-submitter", stepSucceed.Name); + + // Avoid unobserved-task warning. 
+ _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_FreshExecution_NullParentName_LeavesSubOpsNameless() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + var callbackStart = recorder.Flushed.Single(o => o.Type == OperationTypes.Callback); + var stepSucceed = recorder.Flushed.Single(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED"); + + Assert.Null(callbackStart.Name); + Assert.Null(stepSucceed.Name); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_ChildOperationIdsDeterministic() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval"); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT has IdAt(1); the inner callback is child #1, the inner + // submitter step is child #2 (under the same parent context op id). 
+ var parentOpId = IdAt(1); + var callbackChildId = ChildIdAt(parentOpId, 1); + var submitterChildId = ChildIdAt(parentOpId, 2); + + Assert.Equal(callbackChildId, + recorder.Flushed.Single(o => o.Type == OperationTypes.Callback).Id); + Assert.Equal(submitterChildId, + recorder.Flushed.Single(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED").Id); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_CallbackTimeoutInheritsFromConfig() + { + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var resultTask = context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval", + config: new WaitForCallbackConfig + { + Timeout = TimeSpan.FromHours(2), + HeartbeatTimeout = TimeSpan.FromMinutes(15), + }); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + var callbackStart = recorder.Flushed.Single(o => o.Type == OperationTypes.Callback); + Assert.NotNull(callbackStart.CallbackOptions); + Assert.Equal(7200, callbackStart.CallbackOptions.TimeoutSeconds); + Assert.Equal(900, callbackStart.CallbackOptions.HeartbeatTimeoutSeconds); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayWithCallbackSucceeded_ReturnsResult() + { + // Full replay: parent CONTEXT SUCCEEDED with the callback's deserialized + // payload as its checkpointed result. 
+ var (context, recorder, tm, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails { Result = "\"approved\"" } + } + } + }); + + var executed = false; + var result = await context.WaitForCallbackAsync( + async (_, _) => { executed = true; await Task.CompletedTask; }, + name: "approval"); + + Assert.False(executed); // Replay returns cached without re-running submitter. + Assert.Equal("approved", result); + Assert.False(tm.IsTerminated); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayCallbackTimedOut_ThrowsCallbackTimeoutException() + { + // Inside-out replay: parent CONTEXT is STARTED (still in flight), + // inner callback is TIMED_OUT, inner submitter step has SUCCEEDED. 
+ var parentId = IdAt(1); + var callbackChildId = ChildIdAt(parentId, 1); + var submitterChildId = ChildIdAt(parentId, 2); + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + }, + new() + { + Id = callbackChildId, + Type = OperationTypes.Callback, + Status = OperationStatuses.TimedOut, + Name = "approval-callback", + ParentId = parentId, + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-to-1", + Error = new ErrorObject { ErrorMessage = "callback timed out" } + } + }, + new() + { + Id = submitterChildId, + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "approval-submitter", + ParentId = parentId, + StepDetails = new StepDetails { Result = "null" } + }, + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.Equal("callback timed out", ex.Message); + Assert.Equal("cb-to-1", ex.CallbackId); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayCallbackFailed_ThrowsCallbackFailedException() + { + var parentId = IdAt(1); + var callbackChildId = ChildIdAt(parentId, 1); + var submitterChildId = ChildIdAt(parentId, 2); + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + }, + new() + { + Id = callbackChildId, + Type = OperationTypes.Callback, + Status = OperationStatuses.Failed, + Name = "approval-callback", + ParentId = parentId, + CallbackDetails = new CallbackDetails + { + CallbackId = "cb-fail-1", + Error = new ErrorObject + { + ErrorType = "ExternalSystemError", + ErrorMessage = 
"external rejected" + } + } + }, + new() + { + Id = submitterChildId, + Type = OperationTypes.Step, + Status = OperationStatuses.Succeeded, + Name = "approval-submitter", + ParentId = parentId, + StepDetails = new StepDetails { Result = "null" } + }, + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.Equal("external rejected", ex.Message); + Assert.Equal("cb-fail-1", ex.CallbackId); + Assert.Equal("ExternalSystemError", ex.ErrorType); + } + + [Fact] + public async Task WaitForCallbackAsync_SubmitterFails_ThrowsCallbackSubmitterException() + { + // Replay: parent CONTEXT is FAILED with a Step-error inside. + var parentId = IdAt(1); + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = typeof(StepException).FullName, + ErrorMessage = "submitter API failed", + ErrorData = "{\"code\":\"500\"}", + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.IsAssignableFrom(ex); + Assert.Equal("submitter API failed", ex.Message); + // On the replay path the live StepException was lost across invocations; + // we preserve the StepException type-name string and carry the + // ChildContextException as the InnerException for traceability. 
+ Assert.NotNull(ex.InnerException); + Assert.Equal(typeof(StepException).FullName, ex.ErrorType); + Assert.Equal("{\"code\":\"500\"}", ex.ErrorData); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayParentContextFailedWithCallbackTimeoutErrorType_PreservesSubclass() + { + // Subclass-fidelity guarantee: when the parent CONTEXT was checkpointed + // FAILED on a previous invocation with a CallbackTimeoutException + // ErrorType, replay must surface CallbackTimeoutException — not the + // more generic CallbackFailedException — so user catch blocks behave + // identically across live and replay paths. + var parentId = IdAt(1); + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = typeof(CallbackTimeoutException).FullName, + ErrorMessage = "callback timed out after 24h", + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + // Concrete-type check: not just `is CallbackException` — must be the + // CallbackTimeoutException subclass exactly. + Assert.Equal(typeof(CallbackTimeoutException), ex.GetType()); + Assert.Equal("callback timed out after 24h", ex.Message); + Assert.Equal(typeof(CallbackTimeoutException).FullName, ex.ErrorType); + } + + [Fact] + public async Task WaitForCallbackAsync_ReplayParentContextFailedWithCallbackFailedErrorType_RemapsToCallbackFailed() + { + // Companion case: a stored CallbackFailedException ErrorType remaps to + // CallbackFailedException (not the base or CallbackTimeoutException). 
+ var parentId = IdAt(1); + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + Name = "approval", + SubType = OperationSubTypes.WaitForCallback, + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = typeof(CallbackFailedException).FullName, + ErrorMessage = "external rejected", + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.WaitForCallbackAsync( + async (_, _) => await Task.CompletedTask, + name: "approval")); + + Assert.Equal(typeof(CallbackFailedException), ex.GetType()); + Assert.Equal("external rejected", ex.Message); + } + + [Fact] + public async Task WaitForCallbackAsync_RetryStrategyForwardedToSubmitterStep() + { + // Verifies the WaitForCallbackConfig.RetryStrategy gets passed into the + // submitter step's StepConfig (via the kebab "-submitter" inner step). + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + var seenAttempts = new List(); + var resultTask = context.WaitForCallbackAsync( + async (_, ctx) => + { + // The submitter receives an IWaitForCallbackContext (no AttemptNumber) + // — but this test doesn't need to verify retry mechanics, only + // that the StepConfig with a retry strategy is wired through. + seenAttempts.Add(seenAttempts.Count + 1); + await Task.CompletedTask; + }, + name: "approval", + config: new WaitForCallbackConfig + { + RetryStrategy = new CountingRetryStrategy() + }); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + // Submitter ran exactly once (no failures to retry); a single STEP SUCCEED + // is sufficient evidence that the strategy was wired without throwing. 
+ Assert.Single(recorder.Flushed.Where(o => o.Type == OperationTypes.Step && o.Action == "SUCCEED")); + + _ = resultTask; + } + + [Fact] + public async Task WaitForCallbackAsync_SubmitterContext_IsIWaitForCallbackContext_NotIStepContext() + { + // Verifies the submitter delegate receives our distinct + // IWaitForCallbackContext type (not IStepContext) — protects the + // architectural decision against accidental conflation. + var (context, recorder, tm, state) = CreateContext(); + WireServiceCallbackIdAllocation(recorder, state, "cb-1"); + + Type? observedContextType = null; + var resultTask = context.WaitForCallbackAsync( + async (_, ctx) => + { + observedContextType = ctx.GetType(); + await Task.CompletedTask; + }, + name: "approval"); + + await Task.WhenAny(resultTask, tm.TerminationTask); + await recorder.Batcher.DrainAsync(); + + Assert.NotNull(observedContextType); + Assert.True(typeof(IWaitForCallbackContext).IsAssignableFrom(observedContextType)); + Assert.False(typeof(IStepContext).IsAssignableFrom(observedContextType)); + + _ = resultTask; + } + + private sealed class CountingRetryStrategy : IRetryStrategy + { + public int Attempts; + public RetryDecision ShouldRetry(Exception exception, int attemptNumber) + { + Attempts = attemptNumber; + return RetryDecision.DoNotRetry(); + } + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.runsettings b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.runsettings new file mode 100644 index 000000000..6c38b1258 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.runsettings @@ -0,0 +1,15 @@ + + + + + + + cobertura + [Amazon.Lambda.DurableExecution]* + [Amazon.Lambda.DurableExecution.Tests]* + GeneratedCodeAttribute + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.sh b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.sh new file mode 100644 index 000000000..b953bd07e --- /dev/null +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/coverage.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Runs the Amazon.Lambda.DurableExecution test project with the
+# "XPlat Code Coverage" collector, then renders an HTML report and a text
+# summary under ./TestResults/report with the `reportgenerator` CLI.
+#
+# Requires: the dotnet SDK and `reportgenerator` on PATH.
+set -eu
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJ="$HERE/Amazon.Lambda.DurableExecution.Tests.csproj"
+OUT="$HERE/TestResults"
+
+# Fail fast: without this check a missing tool is only discovered after the
+# whole test run has already finished.
+if ! command -v reportgenerator >/dev/null 2>&1; then
+    echo "reportgenerator not found on PATH (install: dotnet tool install -g dotnet-reportgenerator-globaltool)" >&2
+    exit 1
+fi
+
+# Start from a clean results directory so a stale report is never picked up.
+rm -rf "$OUT"
+dotnet test "$PROJ" -c Release \
+    --collect:"XPlat Code Coverage" \
+    --settings "$HERE/coverage.runsettings" \
+    --results-directory "$OUT"
+
+# The report's exact location under $OUT is not fixed here, so locate it by
+# name and take the first match. pipefail is deliberately not enabled: `head`
+# exiting early must not turn this lookup into a failure, and an empty result
+# is handled explicitly below.
+REPORT_FILE=$(find "$OUT" -name "coverage.cobertura.xml" -type f | head -n 1)
+if [ -z "$REPORT_FILE" ]; then
+    echo "No coverage report found under $OUT" >&2
+    exit 1
+fi
+
+reportgenerator \
+    "-reports:$REPORT_FILE" \
+    "-targetdir:$OUT/report" \
+    "-reporttypes:Html;TextSummary"
+
+echo
+echo "==================== Coverage Summary ===================="
+cat "$OUT/report/Summary.txt"
+echo "=========================================================="
+echo "Full HTML report: $OUT/report/index.html"
diff --git a/README.md b/README.md index 405e952a5..afd2c11e3 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ For a history of releases view the [release change log](CHANGELOG.md) - [Amazon.Lambda.Annotations](#amazonlambdaannotations) - [Amazon.Lambda.AspNetCoreServer](#amazonlambdaaspnetcoreserver) - [Amazon.Lambda.TestUtilities](#amazonlambdatestutilities) + - [Amazon.Lambda.DurableExecution](#amazonlambdadurableexecution) - [Blueprints](#blueprints) - [Dotnet CLI Templates](#dotnet-cli-templates) - [Yeoman (Deprecated)](#yeoman-deprecated) @@ -113,6 +114,11 @@ For more information see the [README.md](Libraries/src/Amazon.Lambda.AspNetCoreS Package includes test implementation of the interfaces from Amazon.Lambda.Core and helper methods to help in locally testing. For more information see the [README.md](Libraries/src/Amazon.Lambda.TestUtilities/README.md) file for Amazon.Lambda.TestUtilities. 
+### Amazon.Lambda.DurableExecution + +The Durable Execution SDK lets you write multi-step Lambda workflows that automatically checkpoint progress and resume after failures. +For more information see the [README.md](Libraries/src/Amazon.Lambda.DurableExecution/README.md) file for Amazon.Lambda.DurableExecution. + ## Blueprints Blueprints in this repository are .NET Core Lambda functions that can used to get started. In Visual Studio the Blueprints are available when creating a new project and selecting the AWS Lambda Project. diff --git a/buildtools/build.proj b/buildtools/build.proj index 037c11f0a..0b80ec612 100644 --- a/buildtools/build.proj +++ b/buildtools/build.proj @@ -215,6 +215,7 @@ +