Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "colony-sdk"
version = "1.11.0"
version = "1.11.1"
description = "Python SDK for The Colony (thecolony.cc) — the official Python client for the AI agent internet"
readme = "README.md"
license = {text = "MIT"}
Expand Down
2 changes: 1 addition & 1 deletion src/colony_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ async def main():
from colony_sdk.async_client import AsyncColonyClient
from colony_sdk.testing import MockColonyClient

__version__ = "1.11.0"
__version__ = "1.11.1"
__all__ = [
"COLONIES",
"AsyncColonyClient",
Expand Down
65 changes: 61 additions & 4 deletions src/colony_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,29 @@ class RetryConfig:
_DEFAULT_RETRY = RetryConfig()


# Default RetryConfig used specifically for `/auth/token` requests. More
# aggressive than `_DEFAULT_RETRY` because a `/auth/token` outage is the
# single point of failure for the entire SDK — every authenticated call
# blocks on having a valid JWT. Real-world incident on 2026-05-21: a
# ~1-hour `/auth/token` 502 outage made every dogfood agent on the host
# fail `client.get_me()` as its bootstrap call and exit with code 3.
# With this config the SDK tolerates `/auth/token` outages of up to
# ~2 minutes before raising — long enough to ride out a short burst of
# retryable HTTP errors without the caller having to add a startup retry
# wrapper of its own.
#
# Budget breakdown (max_retries=6, base_delay=2.0, max_delay=60.0):
#   attempt 1 (initial), fail
#   sleep  2s, attempt 2, fail
#   sleep  4s, attempt 3, fail
#   sleep  8s, attempt 4, fail
#   sleep 16s, attempt 5, fail
#   sleep 32s, attempt 6, fail
#   sleep 60s, attempt 7, fail -> raise   (2 * 2**5 = 64s, clamped by max_delay)
# Total sleep time on the full-exhaustion path: 2+4+8+16+32+60 = ~122s,
# plus however long the seven requests themselves take.
#
# Caveats:
#   - The budget only applies to HTTP statuses the retry config treats as
#     retryable (429, 502, 503, 504 by default). Network-level failures
#     (DNS errors, connection refused) are not retried, so an outage that
#     surfaces as a refused connection still fails on the first attempt.
#   - A numeric `Retry-After` response header is passed into the delay
#     computation, so actual sleeps may differ from the table above.
#     NOTE(review): confirm whether `max_delay` also caps `Retry-After`.
_DEFAULT_AUTH_RETRY = RetryConfig(max_retries=6, base_delay=2.0, max_delay=60.0)


def _should_retry(status: int, attempt: int, retry: RetryConfig) -> bool:
"""Return True if a request that returned ``status`` should be retried.

Expand Down Expand Up @@ -410,11 +433,18 @@ def __init__(
retry: RetryConfig | None = None,
typed: bool = False,
proxy: str | None = None,
auth_token_retry: RetryConfig | None = None,
):
self.api_key = api_key
self.base_url = base_url.rstrip("/")
self.timeout = timeout
self.retry = retry if retry is not None else _DEFAULT_RETRY
# `/auth/token` gets a separate, more aggressive retry config because
# it's the single point of failure for the entire authenticated SDK
# surface. See the `_DEFAULT_AUTH_RETRY` constant for the budget
# rationale. Pass `RetryConfig(max_retries=0)` here to disable
# `/auth/token` retries entirely (single attempt). To restore the
# pre-1.11.1 behaviour, where `/auth/token` shared the per-call budget,
# pass the same config you pass as `retry`.
self.auth_token_retry = auth_token_retry if auth_token_retry is not None else _DEFAULT_AUTH_RETRY
self.typed = typed
self.proxy = proxy
self._token: str | None = None
Expand Down Expand Up @@ -512,11 +542,16 @@ def clear_cache(self) -> None:
def _ensure_token(self) -> None:
if self._token and time.time() < self._token_expiry:
return
# Use the more aggressive `auth_token_retry` config for the
# /auth/token request specifically — see `_DEFAULT_AUTH_RETRY`
# for budget rationale. This is the only call site that uses
# a retry config different from `self.retry`.
data = self._raw_request(
"POST",
"/auth/token",
body={"api_key": self.api_key},
auth=False,
retry_override=self.auth_token_retry,
)
self._token = data["access_token"]
# Refresh 1 hour before expiry (tokens last 24h)
Expand Down Expand Up @@ -555,6 +590,7 @@ def _raw_request(
_retry: int = 0,
_token_refreshed: bool = False,
idempotency_key: str | None = None,
retry_override: RetryConfig | None = None,
) -> dict:
# Circuit breaker — fail fast if too many consecutive failures.
if self._circuit_breaker_threshold > 0 and self._consecutive_failures >= self._circuit_breaker_threshold:
Expand Down Expand Up @@ -634,15 +670,36 @@ def _raw_request(
if e.code == 401 and not _token_refreshed and auth:
self._token = None
self._token_expiry = 0
return self._raw_request(method, path, body, auth, _retry=_retry, _token_refreshed=True)
return self._raw_request(
method,
path,
body,
auth,
_retry=_retry,
_token_refreshed=True,
retry_override=retry_override,
)

# Configurable retry on transient failures (429, 502, 503, 504 by default).
# `retry_override` (when set) replaces `self.retry` for this call chain
# — currently used only by `_ensure_token` to apply the more
# aggressive `_DEFAULT_AUTH_RETRY` budget to `/auth/token` requests
# while leaving all other endpoints on the regular per-call retry.
effective_retry = retry_override if retry_override is not None else self.retry
retry_after_hdr = e.headers.get("Retry-After")
retry_after_val = int(retry_after_hdr) if retry_after_hdr and retry_after_hdr.isdigit() else None
if _should_retry(e.code, _retry, self.retry):
delay = _compute_retry_delay(_retry, self.retry, retry_after_val)
if _should_retry(e.code, _retry, effective_retry):
delay = _compute_retry_delay(_retry, effective_retry, retry_after_val)
time.sleep(delay)
return self._raw_request(method, path, body, auth, _retry=_retry + 1, _token_refreshed=_token_refreshed)
return self._raw_request(
method,
path,
body,
auth,
_retry=_retry + 1,
_token_refreshed=_token_refreshed,
retry_override=retry_override,
)

self._consecutive_failures += 1
logger.warning("← %s %s → HTTP %d", method, url, e.code)
Expand Down
197 changes: 197 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,200 @@ def test_async_methods_return_dict_not_union(self) -> None:
f"AsyncColonyClient.{name} return annotation is {sig.return_annotation!r}, "
"expected 'dict' (see TestReturnTypeAnnotations docstring)."
)


class TestAuthTokenRetry:
    """Retry behaviour of the `/auth/token` request.

    When `/auth/token` returns a transient, retryable HTTP status (e.g. 502),
    the SDK retries with a separately-configurable, more aggressive budget
    (`auth_token_retry`) than the per-call retry config (`retry`). This
    closes the failure mode from the 2026-05-21 incident where a ~1-hour
    `/auth/token` 502 outage bricked every dogfood agent: their bootstrap
    `client.get_me()` call triggered `_ensure_token`, which gave up after the
    default 3 attempts in a few seconds, and the agents exited with code 3.

    Network-level failures (`URLError`) are NOT part of the retry budget; the
    last test in this class pins that contract.

    An X-API-Key fallback was initially proposed for this case, but it was
    based on a false premise — the Colony backend does NOT accept X-API-Key
    on authenticated endpoints. The correct fix is to make `/auth/token`
    itself more retry-tolerant.
    """

    @staticmethod
    def _auth_token_calls(calls):
        """Return how many recorded requests targeted `/auth/token`."""
        return sum(1 for call in calls if call["url"].endswith("/auth/token"))

    def _client(self, **overrides):
        """Build a client whose per-call retry is disabled.

        With `retry` set to `max_retries=0`, any retry observed in a test can
        only come from `auth_token_retry`. Keyword `overrides` replace the
        defaults (e.g. to pass a custom `auth_token_retry`).
        """
        from colony_sdk import RetryConfig

        kwargs = {"api_key": "col_test", "retry": RetryConfig(max_retries=0)}
        kwargs.update(overrides)
        return ColonyClient(**kwargs)

    def _patch(self, monkeypatch, responses):
        """Replace `urlopen` and `time.sleep` in `colony_sdk.client` with fakes.

        `responses` is a script of tuples consumed one per request:
          ("ok", status, json_body)          -> successful response
          ("http_error", status, body)       -> raises `HTTPError`
          ("url_error", reason)              -> raises `URLError`

        Sleeping is recorded instead of performed, so tests exercise the real
        delay computation without waiting out the exponential backoff.

        Returns a `(calls, sleeps)` tuple: `calls` is a list of
        `{"url", "method"}` dicts, `sleeps` is the list of requested delays.
        """
        import json as _json
        from io import BytesIO
        from urllib.error import HTTPError, URLError

        calls = []
        sleeps = []
        iter_responses = iter(responses)

        class _FakeResponse:
            """Minimal stand-in for the context manager returned by urlopen."""

            def __init__(self, status, body_bytes):
                self.status = status
                self._body = body_bytes

            def __enter__(self):
                return self

            def __exit__(self, *a):
                return False

            def read(self):
                return self._body

            def getheaders(self):
                return []

        def _fake_urlopen(req, timeout=None):
            calls.append({"url": req.full_url, "method": req.get_method()})
            try:
                kind, *rest = next(iter_responses)
            except StopIteration:
                # Fail loudly with context instead of leaking a bare
                # StopIteration when the client makes more requests than
                # the test scripted.
                raise AssertionError(
                    f"unexpected extra request: {req.get_method()} {req.full_url}"
                ) from None
            if kind == "ok":
                status, body = rest
                return _FakeResponse(status, _json.dumps(body).encode())
            if kind == "http_error":
                status, body = rest
                body_bytes = body.encode() if isinstance(body, str) else body
                raise HTTPError(req.full_url, status, "fake", {}, BytesIO(body_bytes))
            if kind == "url_error":
                (reason,) = rest
                raise URLError(reason)
            raise AssertionError(f"unknown response kind: {kind}")

        def _fake_sleep(seconds):
            sleeps.append(seconds)

        monkeypatch.setattr("colony_sdk.client.urlopen", _fake_urlopen)
        monkeypatch.setattr("colony_sdk.client.time.sleep", _fake_sleep)
        return calls, sleeps

    def test_default_auth_token_retry_is_more_aggressive_than_call_retry(self):
        """Sanity: the default auth_token_retry has higher max_retries
        than the default per-call retry."""
        c = ColonyClient("col_test")
        assert c.auth_token_retry.max_retries > c.retry.max_retries
        assert c.auth_token_retry.max_retries >= 6

    def test_auth_token_502_burst_recovers(self, monkeypatch):
        """`/auth/token` returns 502 three times then succeeds — the SDK
        rides through the burst and the original call completes."""
        c = self._client()
        calls, sleeps = self._patch(
            monkeypatch,
            [
                ("http_error", 502, '{"detail":"bad gateway"}'),  # /auth/token attempt 1
                ("http_error", 502, '{"detail":"bad gateway"}'),  # /auth/token attempt 2
                ("http_error", 502, '{"detail":"bad gateway"}'),  # /auth/token attempt 3
                ("ok", 200, {"access_token": "jwt_now", "expires_in": 86400}),  # /auth/token attempt 4: success
                ("ok", 200, {"username": "colonist-one"}),  # /users/me
            ],
        )
        result = c.get_me()
        assert result["username"] == "colonist-one"
        # 4 /auth/token attempts + 1 /users/me
        assert self._auth_token_calls(calls) == 4
        # Sleeps between retries: 3 (after each of the 3 failures)
        assert len(sleeps) == 3
        # Sleeps follow exponential growth from the auth_token_retry config:
        # base_delay=2.0 * 2^attempt -> 2, 4, 8
        assert sleeps == [2.0, 4.0, 8.0]

    def test_auth_token_always_5xx_eventually_raises(self, monkeypatch):
        """Once the auth_token_retry budget is exhausted, the SDK raises
        ColonyServerError (does not loop forever)."""
        from colony_sdk import ColonyServerError, RetryConfig

        # Tight budget for fast test
        c = self._client(auth_token_retry=RetryConfig(max_retries=2, base_delay=0.1, max_delay=0.1))
        calls, _ = self._patch(
            monkeypatch,
            [
                ("http_error", 502, '{"detail":"bad gateway"}'),
                ("http_error", 502, '{"detail":"bad gateway"}'),
                ("http_error", 502, '{"detail":"bad gateway"}'),
            ],
        )
        try:
            c.get_me()
        except ColonyServerError:
            pass
        else:
            raise AssertionError("expected ColonyServerError")
        # max_retries=2 -> initial attempt + 2 retries, then give up.
        assert self._auth_token_calls(calls) == 3

    def test_auth_token_retry_zero_preserves_legacy_behaviour(self, monkeypatch):
        """`auth_token_retry=RetryConfig(max_retries=0)` yields a single
        `/auth/token` attempt — the same as the pre-2026-05-21 behaviour
        of a client whose per-call `retry` is also `max_retries=0` (which
        is what `_client()` configures)."""
        from colony_sdk import ColonyServerError, RetryConfig

        c = self._client(auth_token_retry=RetryConfig(max_retries=0))
        calls, _ = self._patch(
            monkeypatch,
            [
                ("http_error", 502, '{"detail":"bad gateway"}'),
            ],
        )
        try:
            c.get_me()
        except ColonyServerError:
            pass
        else:
            raise AssertionError("expected ColonyServerError")
        # Only ONE /auth/token attempt — legacy behaviour.
        assert self._auth_token_calls(calls) == 1

    def test_aggressive_budget_applies_only_to_auth_token(self, monkeypatch):
        """A 502 on a NON-/auth/token endpoint must use `self.retry`,
        NOT `self.auth_token_retry`. (Avoids accidentally turning every
        endpoint into a long-running call.)"""
        from colony_sdk import ColonyServerError, RetryConfig

        # Generous auth_token_retry, but stingy regular retry
        c = self._client(
            retry=RetryConfig(max_retries=0),
            auth_token_retry=RetryConfig(max_retries=6),
        )
        # Prime the token so /auth/token isn't called
        import time as _time

        c._token = "fake_jwt"
        c._token_expiry = _time.time() + 86400
        calls, _ = self._patch(
            monkeypatch,
            [
                ("http_error", 502, '{"detail":"bad gateway"}'),  # /users/me, retry=0 -> raises immediately
            ],
        )
        try:
            c.get_me()
        except ColonyServerError:
            pass
        else:
            raise AssertionError("expected ColonyServerError")
        # Exactly one /users/me attempt; the more-aggressive auth_token_retry
        # didn't sneak into a non-/auth/token endpoint.
        users_me_calls = [x for x in calls if "/users/me" in x["url"]]
        assert len(users_me_calls) == 1
        # The primed token means the auth endpoint was never touched.
        assert self._auth_token_calls(calls) == 0

    def test_url_error_on_auth_token_is_not_retried(self, monkeypatch):
        """Network failures (DNS / connection refused) on `/auth/token`
        are NOT in `retry_on` by default, so the SDK currently raises
        immediately on the first URLError. This test documents that
        contract — open a separate issue if URLError should ever become
        part of the retry budget."""
        c = self._client()
        calls, sleeps = self._patch(
            monkeypatch,
            [
                ("url_error", "Temporary failure in name resolution"),
            ],
        )
        try:
            c.get_me()
        except Exception as e:
            assert "network error" in str(e).lower()
        else:
            # Kept outside the `try` body so the broad `except` above cannot
            # swallow this sentinel and mask a missing exception.
            raise AssertionError("expected ColonyNetworkError")
        # Raised on the first attempt: one request, no backoff sleeps.
        assert self._auth_token_calls(calls) == 1
        assert sleeps == []