-
Notifications
You must be signed in to change notification settings - Fork 499
Adds retry support to the Amazon.Lambda.DurableExecution #2363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feature/durablefunction
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| namespace Amazon.Lambda.DurableExecution; | ||
|
|
||
| /// <summary> | ||
| /// Determines whether a failed step should be retried and with what delay. | ||
| /// </summary> | ||
| public interface IRetryStrategy | ||
| { | ||
| /// <summary> | ||
| /// Evaluates whether the given exception warrants a retry. | ||
| /// </summary> | ||
| /// <param name="exception">The exception that caused the step to fail.</param> | ||
| /// <param name="attemptNumber">The 1-based attempt number that just failed.</param> | ||
| /// <returns>A decision indicating whether to retry and the delay before the next attempt.</returns> | ||
| RetryDecision ShouldRetry(Exception exception, int attemptNumber); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// The outcome of a retry evaluation. | ||
| /// </summary> | ||
| public readonly struct RetryDecision | ||
| { | ||
| /// <summary>Whether the step should be retried.</summary> | ||
| public bool ShouldRetry { get; } | ||
|
|
||
| /// <summary>The delay before the next retry attempt.</summary> | ||
| public TimeSpan Delay { get; } | ||
|
|
||
| private RetryDecision(bool shouldRetry, TimeSpan delay) | ||
| { | ||
| ShouldRetry = shouldRetry; | ||
| Delay = delay; | ||
| } | ||
|
|
||
| /// <summary>Indicates the step should not be retried.</summary> | ||
| public static RetryDecision DoNotRetry() => new(false, TimeSpan.Zero); | ||
|
|
||
| /// <summary>Indicates the step should be retried after the specified delay.</summary> | ||
| public static RetryDecision RetryAfter(TimeSpan delay) => new(true, delay); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,6 @@ namespace Amazon.Lambda.DurableExecution.Internal; | |
| /// <see cref="CheckpointBatcher"/>; this type is the inbound side only. | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// Replay tracking mirrors the Python / Java / JavaScript reference SDKs: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just updating docs here and everywhere to remove stuff like "similar to python/js" |
||
| /// <list type="bullet"> | ||
| /// <item>At construction the workflow is "replaying" if and only if any user-replayable | ||
| /// op is present. The service always sends one <c>EXECUTION</c>-type op | ||
|
|
@@ -41,11 +40,15 @@ public void LoadFromCheckpoint(InitialExecutionState? initialState) | |
| AddOperations(initialState.Operations); | ||
| } | ||
|
|
||
| // Only user-replayable ops put us into replay mode. The service-side | ||
| // EXECUTION op (input payload bookkeeping) is always present and must | ||
| // not count — see Python execution.py:258 / Java ExecutionManager:81 / | ||
| // JS execution-context.ts:62 for the same rule. | ||
| (_isReplaying, _remainingReplayOps) = ScanReplayable(); | ||
| // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, | ||
| // CANCELLED, STOPPED) we need to re-derive before resuming live work. | ||
| // The service-side EXECUTION op (input payload bookkeeping) is always | ||
| // present and doesn't count. If the only ops are in-progress | ||
| // (READY/PENDING/STARTED), there's nothing to re-derive — the next | ||
| // user call IS the next thing to run — so IsReplaying starts false. | ||
| var (_, terminalCount) = ScanReplayable(); | ||
| _remainingReplayOps = terminalCount; | ||
| _isReplaying = terminalCount > 0; | ||
|
GarrettBeatty marked this conversation as resolved.
|
||
| } | ||
|
|
||
| public void AddOperations(IEnumerable<Operation> operations) | ||
|
|
@@ -91,8 +94,11 @@ public void TrackReplay(string operationId) | |
|
|
||
| public void ValidateReplayConsistency(string operationId, string expectedType, string? expectedName) | ||
| { | ||
| if (!_isReplaying) return; | ||
|
|
||
| // Independent of IsReplaying: as long as a checkpoint record exists | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this change catches this scenario
With the old |
||
| // for this id, its type/name must match what user code is asking for. | ||
| // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), | ||
| // IsReplaying is false but the records still exist and code drift can | ||
| // still produce a mismatch. | ||
| if (!_operations.TryGetValue(operationId, out var op)) return; | ||
|
|
||
| if (op.Type != null && op.Type != expectedType) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this max batch bytes thing is still a todo item in another pr