From 0bfa29587625e75a3f0fd2c6eae13d6ddad63141 Mon Sep 17 00:00:00 2001 From: Pablo Maldonado Date: Thu, 12 Mar 2026 11:40:05 +0000 Subject: [PATCH 1/3] feat(monitor-v2): add failure grace period to PM notifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Transient Polymarket API failures (timeouts, 5xx) currently trigger immediate Slack alerts and permanently mark the proposal as handled, preventing any retry on subsequent runs. This adds a two-layer resiliency mechanism: Layer 1 — Better in-process HTTP retries: - RETRY_ATTEMPTS default 1 → 3 - RETRY_DELAY_MS default 0 → 1000 (exponential backoff) Layer 2 — Cross-run failure grace period: - New Datastore kind "FailedProposals" tracks proposals that fail with firstFailureAt timestamp, failureCount, and lastError - First failure stores a record and warns, does NOT alert - Subsequent failures within the grace period update the record - Only after FAILURE_GRACE_PERIOD_SECONDS (default 630s ≈ 2 serverless runs + buffer) does the alert fire - Successful checks silently clear any prior failure record - Setting grace period to 0 preserves the original behavior No changes to the existing NotifiedProposals schema or notification format. 
--- .../MonitorProposalsOrderBook.ts | 51 ++++- .../src/monitor-polymarket/common.ts | 41 +++- packages/monitor-v2/test/PolymarketMonitor.ts | 214 ++++++++++++++++++ 3 files changed, 303 insertions(+), 3 deletions(-) diff --git a/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts b/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts index 35deca585d..04315e14ee 100644 --- a/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts +++ b/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts @@ -13,6 +13,7 @@ import { decodeMultipleValuesQuery, fetchOrderFilledEventsBounded, generateAIDeepLink, + getFailedProposal, getNotifiedProposals, getOrderFilledEvents, getPolymarketMarketInformation, @@ -28,7 +29,9 @@ import { ONE_SCALED, POLYGON_BLOCKS_PER_HOUR, PolymarketTradeInformation, + removeFailedProposal, shouldIgnoreThirdPartyProposal, + storeFailedProposal, storeNotifiedProposals, Logger, Market, @@ -246,8 +249,53 @@ export async function monitorTransactionsProposedOrderBook( const logErrorAndPersist = async (proposal: OptimisticPriceRequest, err: Error) => { const aiDeeplink = generateAIDeepLink(proposal.proposalHash, proposal.proposalLogIndex, params.aiResultsBaseUrl); - await logFailedMarketProposalVerification(logger, params.chainId, proposal, err as Error, aiDeeplink); + + // When grace period is 0, alert immediately (original behavior). 
+ if (params.failureGracePeriodSeconds <= 0) { + await logFailedMarketProposalVerification(logger, params.chainId, proposal, err, aiDeeplink); + await persistNotified(proposal, logger); + return; + } + + const proposalKey = getProposalKeyToStore(proposal); + const failureRecord = await getFailedProposal(proposalKey); + const now = Date.now(); + + if (!failureRecord) { + await storeFailedProposal(proposalKey, { + firstFailureAt: new Date(now).toISOString(), + failureCount: 1, + lastError: err.message, + }); + logger.warn({ + at: "PolymarketMonitor", + message: "Check failed, will retry before alerting", + proposalHash: proposal.proposalHash, + error: err.message, + }); + return; + } + + const elapsed = now - new Date(failureRecord.firstFailureAt).getTime(); + if (elapsed < params.failureGracePeriodSeconds * 1000) { + await storeFailedProposal(proposalKey, { + ...failureRecord, + failureCount: failureRecord.failureCount + 1, + lastError: err.message, + }); + logger.warn({ + at: "PolymarketMonitor", + message: `Check failed (attempt ${failureRecord.failureCount + 1}), still within grace period`, + proposalHash: proposal.proposalHash, + error: err.message, + }); + return; + } + + // Grace period exceeded — alert and persist as notified. 
+ await logFailedMarketProposalVerification(logger, params.chainId, proposal, err, aiDeeplink); await persistNotified(proposal, logger); + await removeFailedProposal(proposalKey); }; await Promise.all( @@ -386,6 +434,7 @@ export async function monitorTransactionsProposedOrderBook( tradeFilterFromTimestamp, }); if (alerted) await persistNotified(proposal, logger); + await removeFailedProposal(getProposalKeyToStore(proposal)); } catch (err) { await logErrorAndPersist(proposal, err as Error); } diff --git a/packages/monitor-v2/src/monitor-polymarket/common.ts b/packages/monitor-v2/src/monitor-polymarket/common.ts index dca45886b9..bc58855696 100644 --- a/packages/monitor-v2/src/monitor-polymarket/common.ts +++ b/packages/monitor-v2/src/monitor-polymarket/common.ts @@ -103,6 +103,7 @@ export interface MonitoringParams { paginatedEventQueryConcurrency: number; maxTradesPerToken: number; fillEventsChunkBlocks: number; + failureGracePeriodSeconds: number; } interface PolymarketMarketGraphql { question: string; @@ -851,6 +852,35 @@ export const getNotifiedProposals = async (): Promise<{ }, {}); }; +export interface FailedProposalRecord { + firstFailureAt: string; + failureCount: number; + lastError: string; +} + +export const getFailedProposal = async (proposalKey: string): Promise<FailedProposalRecord | null> => { + if (!datastore) return null; + const key = datastore.key(["FailedProposals", proposalKey]); + const [entity] = await datastore.get(key); + return entity ? 
(entity as FailedProposalRecord) : null; +}; + +export const storeFailedProposal = async (proposalKey: string, record: FailedProposalRecord): Promise<void> => { + if (!datastore) return; + const key = datastore.key(["FailedProposals", proposalKey]); + await datastore.save({ key, data: record }); +}; + +export const removeFailedProposal = async (proposalKey: string): Promise<void> => { + if (!datastore) return; + try { + const key = datastore.key(["FailedProposals", proposalKey]); + await datastore.delete(key); + } catch { + // Ignore — entity may not exist + } +}; + export const parseEnvList = (env: NodeJS.ProcessEnv, key: string, defaultValue: string[]): string[] => { const rawValue = env[key]; if (!rawValue) return defaultValue; @@ -895,8 +925,8 @@ export const initMonitoringParams = async ( const pollingDelay = env.POLLING_DELAY ? Number(env.POLLING_DELAY) : 60; const maxBlockLookBack = env.MAX_BLOCK_LOOK_BACK ? Number(env.MAX_BLOCK_LOOK_BACK) : 3499; - const retryAttempts = env.RETRY_ATTEMPTS ? Number(env.RETRY_ATTEMPTS) : 1; - const retryDelayMs = env.RETRY_DELAY_MS ? Number(env.RETRY_DELAY_MS) : 0; + const retryAttempts = env.RETRY_ATTEMPTS ? Number(env.RETRY_ATTEMPTS) : 3; + const retryDelayMs = env.RETRY_DELAY_MS ? Number(env.RETRY_DELAY_MS) : 1000; const unknownProposalNotificationInterval = env.UNKNOWN_PROPOSAL_NOTIFICATION_INTERVAL ? Number(env.UNKNOWN_PROPOSAL_NOTIFICATION_INTERVAL) @@ -936,6 +966,12 @@ export const initMonitoringParams = async ( const orderBookBatchSize = env.ORDER_BOOK_BATCH_SIZE ? Number(env.ORDER_BOOK_BATCH_SIZE) : 499; + // Grace period before alerting on failures. 0 = alert immediately (original behavior). + // Default 630s = 11 minutes = 2 consecutive 5-minute serverless runs + 30s buffer. + const failureGracePeriodSeconds = env.FAILURE_GRACE_PERIOD_SECONDS + ? Number(env.FAILURE_GRACE_PERIOD_SECONDS) + : 630; + // Rate limit and retry with exponential backoff and jitter to handle rate limiting and errors from the APIs. 
const httpClient = createHttpClient({ axios: { timeout: httpTimeout }, @@ -985,6 +1021,7 @@ export const initMonitoringParams = async ( paginatedEventQueryConcurrency, maxTradesPerToken, fillEventsChunkBlocks, + failureGracePeriodSeconds, }; }; diff --git a/packages/monitor-v2/test/PolymarketMonitor.ts b/packages/monitor-v2/test/PolymarketMonitor.ts index 828ad822f9..cbe3195013 100644 --- a/packages/monitor-v2/test/PolymarketMonitor.ts +++ b/packages/monitor-v2/test/PolymarketMonitor.ts @@ -115,6 +115,7 @@ describe("PolymarketNotifier", function () { paginatedEventQueryConcurrency: 5, maxTradesPerToken: 50, fillEventsChunkBlocks: 30, + failureGracePeriodSeconds: 0, // Disable grace period by default to preserve existing test behavior }; }; @@ -167,6 +168,11 @@ describe("PolymarketNotifier", function () { // Tests that need to override this should call fetchBoundedStub.restore() first fetchBoundedStub = sandbox.stub(commonModule, "fetchOrderFilledEventsBounded").resolves(new Map()); + // Stub failure grace period Datastore functions (no-ops by default) + sandbox.stub(commonModule, "getFailedProposal").resolves(null); + sandbox.stub(commonModule, "storeFailedProposal").resolves(); + sandbox.stub(commonModule, "removeFailedProposal").resolves(); + // Fund staker and stake tokens. 
const TEN_MILLION = ethers.utils.parseEther("10000000"); await (await votingToken.addMinter(await deployer.getAddress())).wait(); @@ -1393,4 +1399,212 @@ describe("PolymarketNotifier", function () { assert.deepEqual(result[1], [], "token2 returns empty array"); }); }); + + describe("failure grace period", function () { + const makeGracePeriodParams = async (gracePeriodSeconds: number) => { + const base = await createMonitoringParams(); + return { ...base, failureGracePeriodSeconds: gracePeriodSeconds }; + }; + + const makeMockProposal = async (): Promise => ({ + proposalHash: "0xgracetest", + requester: await deployer.getAddress(), + proposer: await deployer.getAddress(), + identifier: "0x5945535f4f525f4e4f5f51554552590000000000000000000000000000000000", + proposedPrice: ONE, + requestTimestamp: ethers.BigNumber.from(Date.now()), + proposalBlockNumber: 12345, + ancillaryData: ethers.utils.hexlify(ancillaryData), + requestHash: "0xrequesthash", + requestLogIndex: 0, + proposalTimestamp: ethers.BigNumber.from(Date.now()), + proposalExpirationTimestamp: ethers.BigNumber.from(Date.now() + 1000 * 60 * 60 * 24), + proposalLogIndex: 0, + }); + + it("First failure within grace period: warns but does not alert", async function () { + const params = await makeGracePeriodParams(900); // 15 min grace period + + const mockProposal = await makeMockProposal(); + sandbox.stub(commonModule, "getPolymarketProposedPriceRequestsOO").callsFake(async (_p, v) => { + return v === "v2" ? 
[mockProposal] : []; + }); + + mockFunctionThrowsError("getPolymarketMarketInformation", "Network timeout"); + mockFunctionWithReturnValue("getPolymarketOrderBooks", asBooksRecord(emptyOrders)); + mockSyncFunctionWithReturnValue("getOrderFilledEvents", emptyTradeInformation); + + // getFailedProposal returns null (first failure) + // storeFailedProposal and removeFailedProposal already stubbed in beforeEach + + await oov2.requestPrice(identifier, 1, ancillaryData, votingToken.address, 0); + await oov2.proposePrice(await deployer.getAddress(), identifier, 1, ancillaryData, ONE); + + const spy = sinon.spy(); + const spyLogger = createNewLogger([new SpyTransport({}, { spy })]); + await monitorTransactionsProposedOrderBook(spyLogger, params); + + // Should get a warn log, NOT an error alert + const warnLogs = spy + .getCalls() + .map((c) => c.lastArg) + .filter((a) => a?.message === "Check failed, will retry before alerting"); + assert.equal(warnLogs.length, 1, "Should log a warning for first failure"); + + // Should NOT have the "Failed to verify" error notification + const errorLogs = spy + .getCalls() + .map((c) => c.lastArg) + .filter((a) => a?.message?.includes("Failed to verify proposed market")); + assert.equal(errorLogs.length, 0, "Should not send error notification on first failure"); + + // storeFailedProposal should have been called + assert.isTrue( + (commonModule.storeFailedProposal as sinon.SinonStub).calledOnce, + "Should store failure record" + ); + }); + + it("Subsequent failure within grace period: warns with attempt count", async function () { + const params = await makeGracePeriodParams(900); + + const mockProposal = await makeMockProposal(); + sandbox.stub(commonModule, "getPolymarketProposedPriceRequestsOO").callsFake(async (_p, v) => { + return v === "v2" ? 
[mockProposal] : []; + }); + + mockFunctionThrowsError("getPolymarketMarketInformation", "Network timeout"); + mockFunctionWithReturnValue("getPolymarketOrderBooks", asBooksRecord(emptyOrders)); + mockSyncFunctionWithReturnValue("getOrderFilledEvents", emptyTradeInformation); + + // Simulate existing failure record from a recent first failure (within grace period) + (commonModule.getFailedProposal as sinon.SinonStub).resolves({ + firstFailureAt: new Date(Date.now() - 5 * 60 * 1000).toISOString(), // 5 min ago + failureCount: 1, + lastError: "Previous error", + }); + + await oov2.requestPrice(identifier, 1, ancillaryData, votingToken.address, 0); + await oov2.proposePrice(await deployer.getAddress(), identifier, 1, ancillaryData, ONE); + + const spy = sinon.spy(); + const spyLogger = createNewLogger([new SpyTransport({}, { spy })]); + await monitorTransactionsProposedOrderBook(spyLogger, params); + + // Should get a warn log with attempt count + const warnLogs = spy + .getCalls() + .map((c) => c.lastArg) + .filter((a) => a?.message?.includes("still within grace period")); + assert.equal(warnLogs.length, 1, "Should log retry warning"); + assert.include(warnLogs[0].message, "attempt 2", "Should include attempt count"); + + // Should NOT have error notification + const errorLogs = spy + .getCalls() + .map((c) => c.lastArg) + .filter((a) => a?.message?.includes("Failed to verify proposed market")); + assert.equal(errorLogs.length, 0, "Should not alert within grace period"); + }); + + it("Failure after grace period exceeded: sends alert and cleans up", async function () { + const params = await makeGracePeriodParams(900); + + const mockProposal = await makeMockProposal(); + sandbox.stub(commonModule, "getPolymarketProposedPriceRequestsOO").callsFake(async (_p, v) => { + return v === "v2" ? 
[mockProposal] : []; + }); + + mockFunctionThrowsError("getPolymarketMarketInformation", "Network timeout"); + mockFunctionWithReturnValue("getPolymarketOrderBooks", asBooksRecord(emptyOrders)); + mockSyncFunctionWithReturnValue("getOrderFilledEvents", emptyTradeInformation); + + // Simulate failure record from 20 min ago (exceeds 15 min grace period) + (commonModule.getFailedProposal as sinon.SinonStub).resolves({ + firstFailureAt: new Date(Date.now() - 20 * 60 * 1000).toISOString(), // 20 min ago + failureCount: 3, + lastError: "Previous error", + }); + + await oov2.requestPrice(identifier, 1, ancillaryData, votingToken.address, 0); + await oov2.proposePrice(await deployer.getAddress(), identifier, 1, ancillaryData, ONE); + + const spy = sinon.spy(); + const spyLogger = createNewLogger([new SpyTransport({}, { spy })]); + await monitorTransactionsProposedOrderBook(spyLogger, params); + + // Should now have the "Failed to verify" error notification + const errorLogs = spy + .getCalls() + .map((c) => c.lastArg) + .filter((a) => a?.message?.includes("Failed to verify proposed market")); + assert.equal(errorLogs.length, 1, "Should send error alert after grace period"); + + // removeFailedProposal should have been called to clean up + assert.isTrue( + (commonModule.removeFailedProposal as sinon.SinonStub).called, + "Should remove failure record after alerting" + ); + }); + + it("Successful check clears any prior failure record", async function () { + const params = await makeGracePeriodParams(900); + + const mockProposal = await makeMockProposal(); + sandbox.stub(commonModule, "getPolymarketProposedPriceRequestsOO").callsFake(async (_p, v) => { + return v === "v2" ? 
[mockProposal] : []; + }); + + sandbox.stub(commonModule, "getPolymarketMarketInformation").resolves(marketInfo); + sandbox.stub(commonModule, "getPolymarketOrderBooks").resolves(asBooksRecord(emptyOrders)); + sandbox.stub(commonModule, "getOrderFilledEvents").returns([[], []]); + sandbox.stub(commonModule, "isInitialConfirmationLogged").resolves(true); + + await oov2.requestPrice(identifier, 1, ancillaryData, votingToken.address, 0); + await oov2.proposePrice(await deployer.getAddress(), identifier, 1, ancillaryData, ONE); + + const spy = sinon.spy(); + const spyLogger = createNewLogger([new SpyTransport({}, { spy })]); + await monitorTransactionsProposedOrderBook(spyLogger, params); + + // removeFailedProposal should have been called on the success path + assert.isTrue( + (commonModule.removeFailedProposal as sinon.SinonStub).called, + "Should clean up failure record on success" + ); + }); + + it("Grace period of 0 alerts immediately (backward-compatible)", async function () { + const params = await makeGracePeriodParams(0); + + const mockProposal = await makeMockProposal(); + sandbox.stub(commonModule, "getPolymarketProposedPriceRequestsOO").callsFake(async (_p, v) => { + return v === "v2" ? 
[mockProposal] : []; + }); + + mockFunctionThrowsError("getPolymarketMarketInformation", "Network timeout"); + mockFunctionWithReturnValue("getPolymarketOrderBooks", asBooksRecord(emptyOrders)); + mockSyncFunctionWithReturnValue("getOrderFilledEvents", emptyTradeInformation); + + await oov2.requestPrice(identifier, 1, ancillaryData, votingToken.address, 0); + await oov2.proposePrice(await deployer.getAddress(), identifier, 1, ancillaryData, ONE); + + const spy = sinon.spy(); + const spyLogger = createNewLogger([new SpyTransport({}, { spy })]); + await monitorTransactionsProposedOrderBook(spyLogger, params); + + // Should get the immediate error alert (no grace period) + const errorLogs = spy + .getCalls() + .map((c) => c.lastArg) + .filter((a) => a?.message?.includes("Failed to verify proposed market")); + assert.equal(errorLogs.length, 1, "Grace period of 0 should alert immediately"); + + // Should NOT have called storeFailedProposal (skips grace period logic entirely) + assert.isFalse( + (commonModule.storeFailedProposal as sinon.SinonStub).called, + "Should not store failure record when grace period is 0" + ); + }); + }); }); From 2d088b8b3a722b010603b579f6e3b2ab2df3bf6c Mon Sep 17 00:00:00 2001 From: Pablo Maldonado Date: Thu, 12 Mar 2026 11:45:16 +0000 Subject: [PATCH 2/3] fix(monitor-v2): prettier formatting --- packages/monitor-v2/src/monitor-polymarket/common.ts | 4 +--- packages/monitor-v2/test/PolymarketMonitor.ts | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/packages/monitor-v2/src/monitor-polymarket/common.ts b/packages/monitor-v2/src/monitor-polymarket/common.ts index bc58855696..2dd2d8706a 100644 --- a/packages/monitor-v2/src/monitor-polymarket/common.ts +++ b/packages/monitor-v2/src/monitor-polymarket/common.ts @@ -968,9 +968,7 @@ export const initMonitoringParams = async ( // Grace period before alerting on failures. 0 = alert immediately (original behavior). 
// Default 630s = 11 minutes = 2 consecutive 5-minute serverless runs + 30s buffer. - const failureGracePeriodSeconds = env.FAILURE_GRACE_PERIOD_SECONDS - ? Number(env.FAILURE_GRACE_PERIOD_SECONDS) - : 630; + const failureGracePeriodSeconds = env.FAILURE_GRACE_PERIOD_SECONDS ? Number(env.FAILURE_GRACE_PERIOD_SECONDS) : 630; // Rate limit and retry with exponential backoff and jitter to handle rate limiting and errors from the APIs. const httpClient = createHttpClient({ diff --git a/packages/monitor-v2/test/PolymarketMonitor.ts b/packages/monitor-v2/test/PolymarketMonitor.ts index cbe3195013..52f155f177 100644 --- a/packages/monitor-v2/test/PolymarketMonitor.ts +++ b/packages/monitor-v2/test/PolymarketMonitor.ts @@ -1459,10 +1459,7 @@ describe("PolymarketNotifier", function () { assert.equal(errorLogs.length, 0, "Should not send error notification on first failure"); // storeFailedProposal should have been called - assert.isTrue( - (commonModule.storeFailedProposal as sinon.SinonStub).calledOnce, - "Should store failure record" - ); + assert.isTrue((commonModule.storeFailedProposal as sinon.SinonStub).calledOnce, "Should store failure record"); }); it("Subsequent failure within grace period: warns with attempt count", async function () { From c5534ae7d83dd62d661b71c6d88c4cee1c78ed2c Mon Sep 17 00:00:00 2001 From: Pablo Maldonado Date: Thu, 12 Mar 2026 12:02:35 +0000 Subject: [PATCH 3/3] fix(monitor-v2): skip unnecessary Datastore delete when grace period is disabled --- .../src/monitor-polymarket/MonitorProposalsOrderBook.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts b/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts index 04315e14ee..120d3b5517 100644 --- a/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts +++ b/packages/monitor-v2/src/monitor-polymarket/MonitorProposalsOrderBook.ts @@ -434,7 +434,7 @@ export async function 
monitorTransactionsProposedOrderBook( tradeFilterFromTimestamp, }); if (alerted) await persistNotified(proposal, logger); - await removeFailedProposal(getProposalKeyToStore(proposal)); + if (params.failureGracePeriodSeconds > 0) await removeFailedProposal(getProposalKeyToStore(proposal)); } catch (err) { await logErrorAndPersist(proposal, err as Error); }