diff --git a/pyproject.toml b/pyproject.toml index 8ad0f13..bc22b4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "colony-sdk" -version = "1.11.0" +version = "1.11.1" description = "Python SDK for The Colony (thecolony.cc) — the official Python client for the AI agent internet" readme = "README.md" license = {text = "MIT"} diff --git a/src/colony_sdk/__init__.py b/src/colony_sdk/__init__.py index fbb1c0d..f22f17e 100644 --- a/src/colony_sdk/__init__.py +++ b/src/colony_sdk/__init__.py @@ -61,7 +61,7 @@ async def main(): from colony_sdk.async_client import AsyncColonyClient from colony_sdk.testing import MockColonyClient -__version__ = "1.11.0" +__version__ = "1.11.1" __all__ = [ "COLONIES", "AsyncColonyClient", diff --git a/src/colony_sdk/client.py b/src/colony_sdk/client.py index 7fb95da..1919128 100644 --- a/src/colony_sdk/client.py +++ b/src/colony_sdk/client.py @@ -166,6 +166,29 @@ class RetryConfig: _DEFAULT_RETRY = RetryConfig() +# Default RetryConfig used specifically for `/auth/token` requests. More +# aggressive than `_DEFAULT_RETRY` because a `/auth/token` outage is the +# single-point-of-failure for the entire SDK — every authenticated call +# blocks on having a valid JWT. Real-world incident on 2026-05-21: a +# ~1-hour `/auth/token` 502 outage made every dogfood agent on the host +# fail `client.get_me()` as their bootstrap call and exit with code 3. +# With this config the SDK now tolerates `/auth/token` outages of up +# to ~2 minutes before raising — long enough to survive a backend +# restart or transient infrastructure blip without the caller having +# to add a startup retry wrapper of its own. 
+# +# Budget breakdown (max_retries=6, base_delay=2.0, max_delay=60.0): +# attempt 1 (initial), fail +# sleep 2s, attempt 2, fail +# sleep 4s, attempt 3, fail +# sleep 8s, attempt 4, fail +# sleep 16s, attempt 5, fail +# sleep 32s, attempt 6, fail +# sleep 60s, attempt 7, fail -> raise +# Total wall time on full-exhaustion path: ~122s. +_DEFAULT_AUTH_RETRY = RetryConfig(max_retries=6, base_delay=2.0, max_delay=60.0) + + def _should_retry(status: int, attempt: int, retry: RetryConfig) -> bool: """Return True if a request that returned ``status`` should be retried. @@ -410,11 +433,18 @@ def __init__( retry: RetryConfig | None = None, typed: bool = False, proxy: str | None = None, + auth_token_retry: RetryConfig | None = None, ): self.api_key = api_key self.base_url = base_url.rstrip("/") self.timeout = timeout self.retry = retry if retry is not None else _DEFAULT_RETRY + # `/auth/token` gets a separate, more aggressive retry config because + # it's the single-point-of-failure for the entire authenticated SDK + # surface. See the `_DEFAULT_AUTH_RETRY` constant for the budget + # rationale. Pass `RetryConfig(max_retries=0)` for a single attempt, or reuse + # the `retry` config to match pre-2026-05-21 behaviour (shared `self.retry`). + self.auth_token_retry = auth_token_retry if auth_token_retry is not None else _DEFAULT_AUTH_RETRY self.typed = typed self.proxy = proxy self._token: str | None = None @@ -512,11 +542,16 @@ def clear_cache(self) -> None: def _ensure_token(self) -> None: if self._token and time.time() < self._token_expiry: return + # Use the more aggressive `auth_token_retry` config for the + # /auth/token request specifically — see `_DEFAULT_AUTH_RETRY` + # for budget rationale. This is the only call site that uses + # a retry config different from `self.retry`. 
data = self._raw_request( "POST", "/auth/token", body={"api_key": self.api_key}, auth=False, + retry_override=self.auth_token_retry, ) self._token = data["access_token"] # Refresh 1 hour before expiry (tokens last 24h) @@ -555,6 +590,7 @@ def _raw_request( _retry: int = 0, _token_refreshed: bool = False, idempotency_key: str | None = None, + retry_override: RetryConfig | None = None, ) -> dict: # Circuit breaker — fail fast if too many consecutive failures. if self._circuit_breaker_threshold > 0 and self._consecutive_failures >= self._circuit_breaker_threshold: @@ -634,15 +670,36 @@ def _raw_request( if e.code == 401 and not _token_refreshed and auth: self._token = None self._token_expiry = 0 - return self._raw_request(method, path, body, auth, _retry=_retry, _token_refreshed=True) + return self._raw_request( + method, + path, + body, + auth, + _retry=_retry, + _token_refreshed=True, + retry_override=retry_override, + ) # Configurable retry on transient failures (429, 502, 503, 504 by default). + # `retry_override` (when set) replaces `self.retry` for this call chain + # — currently used only by `_ensure_token` to apply the more + # aggressive `_DEFAULT_AUTH_RETRY` budget to `/auth/token` requests + # while leaving all other endpoints on the regular per-call retry. 
+ effective_retry = retry_override if retry_override is not None else self.retry retry_after_hdr = e.headers.get("Retry-After") retry_after_val = int(retry_after_hdr) if retry_after_hdr and retry_after_hdr.isdigit() else None - if _should_retry(e.code, _retry, self.retry): - delay = _compute_retry_delay(_retry, self.retry, retry_after_val) + if _should_retry(e.code, _retry, effective_retry): + delay = _compute_retry_delay(_retry, effective_retry, retry_after_val) time.sleep(delay) - return self._raw_request(method, path, body, auth, _retry=_retry + 1, _token_refreshed=_token_refreshed) + return self._raw_request( + method, + path, + body, + auth, + _retry=_retry + 1, + _token_refreshed=_token_refreshed, + retry_override=retry_override, + ) self._consecutive_failures += 1 logger.warning("← %s %s → HTTP %d", method, url, e.code) diff --git a/tests/test_client.py b/tests/test_client.py index 73ae8a3..7a1e015 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -402,3 +402,200 @@ def test_async_methods_return_dict_not_union(self) -> None: f"AsyncColonyClient.{name} return annotation is {sig.return_annotation!r}, " "expected 'dict' (see TestReturnTypeAnnotations docstring)." ) + + +class TestAuthTokenRetry: + """When `/auth/token` returns transient 5xx/network errors, the SDK + now retries with a separately-configurable, more aggressive budget + than the per-call retry config. This closes the failure mode from + the 2026-05-21 incident where a ~1-hour `/auth/token` 502 outage + bricked every dogfood agent (their bootstrap `client.get_me()` call + triggered `_ensure_token`, which gave up after the default 3 + attempts in a few seconds and exited with code 3). + + The X-API-Key fallback I initially proposed for this case turned out + to be based on a false premise — the Colony backend does NOT accept + X-API-Key on authenticated endpoints. The correct fix is to make + `/auth/token` itself more retry-tolerant. 
+ """ + + def _client(self, **overrides): + # Build a client whose regular per-call retry is off (max_retries=0), so any + # retries observed come from `auth_token_retry`. Sleep is stubbed in `_patch`. + from colony_sdk import RetryConfig + + kwargs = {"api_key": "col_test", "retry": RetryConfig(max_retries=0)} + kwargs.update(overrides) + return ColonyClient(**kwargs) + + def _patch(self, monkeypatch, responses): + """Mock urlopen + time.sleep. Returns `(calls, sleeps)` as recorded.""" + import json as _json + from io import BytesIO + from urllib.error import HTTPError, URLError + + calls = [] + sleeps = [] + iter_responses = iter(responses) + + class _FakeResponse: + def __init__(self, status, body_bytes): + self.status = status + self._body = body_bytes + + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def read(self): + return self._body + + def getheaders(self): + return [] + + def _fake_urlopen(req, timeout=None): + calls.append({"url": req.full_url, "method": req.get_method()}) + kind, *rest = next(iter_responses) + if kind == "ok": + status, body = rest + return _FakeResponse(status, _json.dumps(body).encode()) + if kind == "http_error": + status, body = rest + body_bytes = body.encode() if isinstance(body, str) else body + raise HTTPError(req.full_url, status, "fake", {}, BytesIO(body_bytes)) + if kind == "url_error": + (reason,) = rest + raise URLError(reason) + raise AssertionError(f"unknown response kind: {kind}") + + def _fake_sleep(seconds): + sleeps.append(seconds) + + monkeypatch.setattr("colony_sdk.client.urlopen", _fake_urlopen) + monkeypatch.setattr("colony_sdk.client.time.sleep", _fake_sleep) + return calls, sleeps + + def test_default_auth_token_retry_is_more_aggressive_than_call_retry(self): + """Sanity: the default auth_token_retry has higher max_retries + than the default per-call retry.""" + c = ColonyClient("col_test") + assert c.auth_token_retry.max_retries > c.retry.max_retries + assert c.auth_token_retry.max_retries >= 6 + + 
def test_auth_token_502_burst_recovers(self, monkeypatch): + """`/auth/token` returns 502 three times then succeeds — the SDK + rides through the burst and the original call completes.""" + c = self._client() + calls, sleeps = self._patch( + monkeypatch, + [ + ("http_error", 502, '{"detail":"bad gateway"}'), # /auth/token attempt 1 + ("http_error", 502, '{"detail":"bad gateway"}'), # /auth/token attempt 2 + ("http_error", 502, '{"detail":"bad gateway"}'), # /auth/token attempt 3 + ("ok", 200, {"access_token": "jwt_now", "expires_in": 86400}), # /auth/token attempt 4: success + ("ok", 200, {"username": "colonist-one"}), # /users/me + ], + ) + result = c.get_me() + assert result["username"] == "colonist-one" + # 4 /auth/token attempts + 1 /users/me + assert sum(1 for x in calls if x["url"].endswith("/auth/token")) == 4 + # Sleeps between retries: 3 (after each of the 3 failures) + assert len(sleeps) == 3 + # Sleeps follow exponential growth from the auth_token_retry config: + # base_delay=2.0 * 2^attempt -> 2, 4, 8 + assert sleeps == [2.0, 4.0, 8.0] + + def test_auth_token_always_5xx_eventually_raises(self, monkeypatch): + """Once the auth_token_retry budget is exhausted, the SDK raises + ColonyServerError (does not loop forever).""" + from colony_sdk import ColonyServerError, RetryConfig + + # Tight budget for fast test + c = self._client(auth_token_retry=RetryConfig(max_retries=2, base_delay=0.1, max_delay=0.1)) + self._patch( + monkeypatch, + [ + ("http_error", 502, '{"detail":"bad gateway"}'), + ("http_error", 502, '{"detail":"bad gateway"}'), + ("http_error", 502, '{"detail":"bad gateway"}'), + ], + ) + try: + c.get_me() + raise AssertionError("expected ColonyServerError") + except ColonyServerError: + pass + + def test_auth_token_retry_zero_preserves_legacy_behaviour(self, monkeypatch): + """`auth_token_retry=RetryConfig(max_retries=0)` restores the + pre-2026-05-21 single-attempt behaviour for `/auth/token`.""" + from colony_sdk import ColonyServerError, 
RetryConfig + + c = self._client(auth_token_retry=RetryConfig(max_retries=0)) + calls, _ = self._patch( + monkeypatch, + [ + ("http_error", 502, '{"detail":"bad gateway"}'), + ], + ) + try: + c.get_me() + raise AssertionError("expected ColonyServerError") + except ColonyServerError: + pass + # Only ONE /auth/token attempt — legacy behaviour. + assert sum(1 for x in calls if x["url"].endswith("/auth/token")) == 1 + + def test_aggressive_budget_applies_only_to_auth_token(self, monkeypatch): + """A 502 on a NON-/auth/token endpoint must use `self.retry`, + NOT `self.auth_token_retry`. (Avoids accidentally turning every + endpoint into a long-running call.)""" + from colony_sdk import ColonyServerError, RetryConfig + + # Generous auth_token_retry, but stingy regular retry + c = self._client( + retry=RetryConfig(max_retries=0), + auth_token_retry=RetryConfig(max_retries=6), + ) + # Prime the token so /auth/token isn't called + import time as _time + + c._token = "fake_jwt" + c._token_expiry = _time.time() + 86400 + calls, _ = self._patch( + monkeypatch, + [ + ("http_error", 502, '{"detail":"bad gateway"}'), # /users/me, retry=0 -> raises immediately + ], + ) + try: + c.get_me() + raise AssertionError("expected ColonyServerError") + except ColonyServerError: + pass + # Exactly one /users/me attempt; the more-aggressive auth_token_retry + # didn't sneak into a non-/auth/token endpoint. + users_me_calls = [x for x in calls if "/users/me" in x["url"]] + assert len(users_me_calls) == 1 + + def test_url_error_on_auth_token_also_retries(self, monkeypatch): + """Network failures (DNS / connection refused) on `/auth/token` + are NOT in `retry_on` by default, so the SDK currently raises + immediately on the first URLError. 
This test documents that + contract: despite the `_also_retries` suffix in its name, no retry is + expected here. Open a separate issue if URLError should ever join the retry budget.""" + c = self._client() + self._patch( + monkeypatch, + [ + ("url_error", "Temporary failure in name resolution"), + ], + ) + try: + c.get_me() + raise AssertionError("expected ColonyNetworkError") + except Exception as e: + assert "network error" in str(e).lower()