Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ See [bot/src/metrics.ts](bot/src/metrics.ts) for the full list of exported metri

#### Telegram API metrics

Two complementary counters track outbound Telegram API traffic. Both increment per-attempt (the inner transformer runs once per autoRetry attempt), so `rate(errors) / rate(calls)` over the same window yields the attempt-level error ratio.
Two complementary counters track outbound Telegram API traffic. Both increment per-attempt (the inner transformer runs once per autoRetry attempt), so `rate(errors) / rate(calls)` over the same window yields the attempt-level error ratio. Exception: `sendMessageDraft` is excluded from autoRetry (see issue #117), so its counters reflect one increment per logical call — attempt-level and call-level rates coincide for that method.

| Metric | Labels | Description |
|--------|--------|-------------|
Expand Down
46 changes: 45 additions & 1 deletion bot/src/__tests__/telegram-bot.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
process.env.TZ = "UTC";
import { describe, it, beforeEach } from "node:test";
import assert from "node:assert/strict";
import { resolveBinding, isAuthorized, sessionKey, isImageMimeType, imageExtensionForMime, buildSourcePrefix, shouldRespondInGroup, BOT_COMMANDS, isStaleMessage, buildReplyContext, buildForwardContext, extensionForDocument, formatFileSize, formatDocumentMeta, buildReactionContext, AUTO_RETRY_OPTIONS, extractMediaInfo, extensionForMedia, formatMediaMeta, createTelegramBot, extractChatContext, formatChatContextForLog, createApiErrorLoggingTransformer, resolveBindingLabel, BINDING_LABEL_NONE, BINDING_LABEL_UNBOUND } from "../telegram-bot.js";
import { resolveBinding, isAuthorized, sessionKey, isImageMimeType, imageExtensionForMime, buildSourcePrefix, shouldRespondInGroup, BOT_COMMANDS, isStaleMessage, buildReplyContext, buildForwardContext, extensionForDocument, formatFileSize, formatDocumentMeta, buildReactionContext, AUTO_RETRY_OPTIONS, createDraftSkipAutoRetryTransformer, extractMediaInfo, extensionForMedia, formatMediaMeta, createTelegramBot, extractChatContext, formatChatContextForLog, createApiErrorLoggingTransformer, resolveBindingLabel, BINDING_LABEL_NONE, BINDING_LABEL_UNBOUND } from "../telegram-bot.js";
import client from "prom-client";
import { telegramApiCalls, telegramApiErrors } from "../metrics.js";
import type { TelegramBinding, BotConfig } from "../types.js";
Expand Down Expand Up @@ -1084,6 +1084,50 @@ describe("AUTO_RETRY_OPTIONS", () => {
});
});

describe("createDraftSkipAutoRetryTransformer", () => {
it("bypasses autoRetry for sendMessageDraft — calls prev exactly once on 429", async () => {
const transformer = createDraftSkipAutoRetryTransformer();
let callCount = 0;
const prev = async () => {
callCount++;
return { ok: false, error_code: 429, parameters: { retry_after: 3 } } as const;
};
const result = await transformer(prev as never, "sendMessageDraft", { chat_id: 555000111, draft_id: 1, text: "x" } as never);
assert.strictEqual(callCount, 1, "sendMessageDraft must not be retried");
assert.strictEqual((result as { ok: boolean }).ok, false);
});

it("retries sendMessage on 429 via autoRetry", async () => {
const transformer = createDraftSkipAutoRetryTransformer();
let callCount = 0;
const prev = async () => {
callCount++;
if (callCount === 1) {
return { ok: false, error_code: 429, parameters: { retry_after: 0 } } as const;
}
return { ok: true, result: { message_id: 1 } } as const;
};
const result = await transformer(prev as never, "sendMessage", { chat_id: 555000111, text: "x" } as never);
assert.strictEqual(callCount, 2, "sendMessage must retry once after 429");
assert.strictEqual((result as { ok: boolean }).ok, true);
});

it("retries sendChatAction on 429 via autoRetry", async () => {
const transformer = createDraftSkipAutoRetryTransformer();
let callCount = 0;
const prev = async () => {
callCount++;
if (callCount === 1) {
return { ok: false, error_code: 429, parameters: { retry_after: 0 } } as const;
}
return { ok: true, result: true } as const;
};
const result = await transformer(prev as never, "sendChatAction", { chat_id: 555000111, action: "typing" } as never);
assert.strictEqual(callCount, 2, "sendChatAction must retry once after 429");
assert.strictEqual((result as { ok: boolean }).ok, true);
});
});

describe("extractMediaInfo", () => {
it("extracts video info", () => {
const msg = { video: { file_id: "vid1", file_name: "clip.mp4", mime_type: "video/mp4", file_size: 5000 } };
Expand Down
7 changes: 6 additions & 1 deletion bot/src/stream-relay.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,12 @@ export async function sendOutboxFiles(outboxPath: string, platform: PlatformCont
}
}

/** Debounce interval for draft updates (ms). Drafts are cosmetic — no rate limits. */
/**
* Debounce interval for draft updates (ms). Drafts ARE subject to Telegram's
* per-chat ~1/sec rate limit (issue #117). Retries for sendMessageDraft are
* suppressed via createDraftSkipAutoRetryTransformer in telegram-bot.ts, so a
* 429 here surfaces once and is swallowed by the fire-and-forget caller.
*/
const DRAFT_DEBOUNCE_MS = 300;

/** Max time (ms) to wait for in-flight drafts before final delivery. */
Expand Down
25 changes: 23 additions & 2 deletions bot/src/telegram-bot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,25 @@ export const AUTO_RETRY_OPTIONS = {
rethrowHttpErrors: false,
} as const;

/**
* Build a transformer that runs autoRetry for every Telegram API method EXCEPT
* `sendMessageDraft`. Drafts are cosmetic fire-and-forget calls (see
* stream-relay.ts) — a retry that fires after Telegram's 3-10s retry_after is
* stale by the time it lands (the stream has produced newer text), and 5x
* amplification turns one rate-limited draft into five log/metric increments.
* Every other method retains the full AUTO_RETRY_OPTIONS retry behavior.
* See issue #117.
*/
export function createDraftSkipAutoRetryTransformer(): Transformer {
const retry = autoRetry(AUTO_RETRY_OPTIONS);
return async (prev, method, payload, signal) => {
if (method === "sendMessageDraft") {
return prev(method, payload, signal);
}
return retry(prev, method, payload, signal);
};
}

/**
* Create and configure the Telegram bot.
*/
Expand All @@ -612,8 +631,10 @@ export function createTelegramBot(
// before autoRetry decides whether to retry)
bot.api.config.use(createApiErrorLoggingTransformer({ bindings: config.bindings }));

// Auto-retry on rate limits (outermost transformer — retries after inner errors)
bot.api.config.use(autoRetry(AUTO_RETRY_OPTIONS));
// Auto-retry on rate limits (outermost transformer — retries after inner
// errors). `sendMessageDraft` is excluded: drafts are cosmetic fire-and-
// forget calls; retries amplify 429 log noise without user-visible benefit.
bot.api.config.use(createDraftSkipAutoRetryTransformer());

const maxMessageAgeMs = config.sessionDefaults.maxMessageAgeMs;

Expand Down
148 changes: 148 additions & 0 deletions docs/plans/2026-05-26-telegram-draft-autoretry-skip.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Stop autoRetry from amplifying rate-limited sendMessageDraft — Round 1

## Goal

Issue #117: streaming replies in DMs produce dense bursts of 429 logs because every rate-limited `sendMessageDraft` call is retried up to 5 times by `@grammyjs/auto-retry`. Drafts are cosmetic (fire-and-forget at the caller, auto-cleared by the final `sendMessage`), so retries add no user-visible value — only log noise and amplified Prometheus error counters. Skip the retry chain for `sendMessageDraft` only, leaving every other method's retry behavior identical.

## Validation Commands

```bash
cd bot && npx tsc --noEmit && npm test
```

## Reference: Current transformer chain

`bot/src/telegram-bot.ts:610-616` — two transformers registered on `bot.api.config`, both fire for every API call:

```ts
// Log Telegram API errors, especially 429 rate limits, and count every API
// call attempt by binding (inner transformer — sees each individual attempt
// before autoRetry decides whether to retry)
bot.api.config.use(createApiErrorLoggingTransformer({ bindings: config.bindings }));

// Auto-retry on rate limits (outermost transformer — retries after inner errors)
bot.api.config.use(autoRetry(AUTO_RETRY_OPTIONS));
```

`bot/src/telegram-bot.ts:589-594` — exported options that govern every retried method:

```ts
/** autoRetry options — exported so tests can assert the rethrowHttpErrors value. */
export const AUTO_RETRY_OPTIONS = {
maxRetryAttempts: 5,
maxDelaySeconds: 60,
rethrowHttpErrors: false,
} as const;
```

There is no per-method filter at registration time.

## Reference: @grammyjs/auto-retry has no per-method filter

`bot/node_modules/@grammyjs/auto-retry/out/mod.js:34-39` — the only options exposed:

```js
function autoRetry(options) {
const maxDelay = options?.maxDelaySeconds ?? Infinity;
const maxRetries = options?.maxRetryAttempts ?? Infinity;
const rethrowInternalServerErrors = options?.rethrowInternalServerErrors ?? false;
const rethrowHttpErrors = options?.rethrowHttpErrors ?? false;
```

The returned transformer is a flat closure: `async (prev, method, payload, signal) => ...`. It does not branch on `method`. Filtering must therefore happen one layer up — by either skipping `autoRetry`'s transformer for matching methods, or by wrapping a method-name guard around it.

## Reference: Drafts are fire-and-forget at the caller

`bot/src/stream-relay.ts:174-182`:

```ts
const sendDraftNow = () => {
if (!accumulated) return;
const collapsed = collapseNewlines(accumulated);
const displayText = collapsed.length > platform.maxMessageLength
? collapsed.slice(0, platform.maxMessageLength - 3) + "..."
: collapsed;
draftPromises.push(platform.sendDraft(draftId, displayText).catch(() => {}));
lastDraftTime = Date.now();
};
```

`bot/src/telegram-adapter.ts:60-71`:

```ts
async sendDraft(draftId: number, text: string): Promise<void> {
if (!chatId || !isDm) return;
const html = markdownToHtml(text);
try {
await ctx.api.sendMessageDraft(chatId, draftId, html, {
parse_mode: "HTML",
...threadOpts,
});
} catch {
// Draft failures are cosmetic — silently ignore
}
}
```

Drafts have no recovery semantics — a 429 retry that arrives 3-10 seconds late would land on a stream that has already moved on. The final `sendMessage` (which IS retried) is what guarantees delivery.

## Reference: Misleading comment that motivated the original 300ms debounce

`bot/src/stream-relay.ts:123-124`:

```ts
/** Debounce interval for draft updates (ms). Drafts are cosmetic — no rate limits. */
const DRAFT_DEBOUNCE_MS = 300;
```

The "no rate limits" claim is contradicted by empirical observation (issue #117 — drafts hit Telegram's per-chat ~1/sec limit in DMs).

## Reference: Live evidence of the cascade

Anonymized burst from the bot's stderr log (single user-message in a DM, post-observability deploy):

```
2026-05-25T15:15:07.093Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=4
2026-05-25T15:15:07.434Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=4
2026-05-25T15:15:07.967Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
2026-05-25T15:15:08.415Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
2026-05-25T15:15:08.981Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
2026-05-25T15:15:09.472Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
```

Cumulative rate-limit counts since the observability deploy (~10 days):

```
1109 sendMessageDraft
180 sendChatAction
```

`sendMessageDraft` accounts for ~86% of all rate-limit log entries. With `maxRetryAttempts: 5`, the underlying number of distinct user-streaming events triggering 429s is roughly 1/5th of that — the rest is amplification.

## Reference: Reviewer convergence (issue #117 comment)

Two independent reviewers (codex CLI + a fresh Opus session reading source separately) both proposed the same shape of fix, captured in #117's review comment: a thin wrapper transformer registered in place of bare `autoRetry`, gated on `method === "sendMessageDraft"`. They diverged on whether Option A (raise `DRAFT_DEBOUNCE_MS`) is also necessary — that decision is explicitly out of scope here; we ship the autoRetry skip first and measure residual draft 429 ratio against the new `bot_telegram_api_calls_total` counter before deciding.

## Reference: Existing test patterns for transformers

`bot/src/__tests__/telegram-bot.test.ts` already has the `createApiErrorLoggingTransformer` test block with `captureWarn`, mock `prev` returning `{ ok: false, error_code: 429, parameters: { retry_after: 3 } }`, and explicit invocation as `await transformer(prev as never, "sendMessageDraft", { chat_id: 555000111, draft_id: 1, text: "x" })`. The same shape extends naturally to verifying that the new wrapper does or does not invoke `autoRetry`'s inner transformer based on `method`.

## Tasks

### Task 1: Skip autoRetry for sendMessageDraft (telegram-draft-autoretry-skip, P1)

**Problem:** Every rate-limited `sendMessageDraft` call to a DM hits autoRetry, which awaits Telegram's `retry_after` (3-10 s) and retries up to 5 times. By the time a retry fires, the stream has produced newer accumulated text — the retry's payload is stale. Meanwhile the new draft, fired ~300 ms after the previous one, queues behind the same rate limit. The result is documented in #117: a single ~10-20 s streaming reply produces 60-80 `[telegram-api]` warn lines and 60-80 increments on `bot_telegram_api_errors_total{method="sendMessageDraft", error_code="429"}`, even though the user observes a single successful final `sendMessage`. Cumulative since observability landed: 1109 `sendMessageDraft` 429 lines vs ~180 for the next-noisiest method.

`@grammyjs/auto-retry` exposes no per-method filter (see Reference section); filtering must happen at the transformer-chain layer. The existing two-layer chain (inner = logging/counter, outer = autoRetry) is in `bot/src/telegram-bot.ts:610-616`.

The misleading comment at `bot/src/stream-relay.ts:123` ("Drafts are cosmetic — no rate limits") is the assumption that originally set `DRAFT_DEBOUNCE_MS = 300`. It needs to be replaced with an accurate statement so a future contributor doesn't recreate the same regression.

**What we want:** Outbound `sendMessageDraft` calls go through the API once and surface their result (success or 429) to the caller without any retry-induced amplification. Every other Telegram API method (`sendMessage`, `sendChatAction`, `editMessageText`, `deleteMessage`, etc.) retains the current `AUTO_RETRY_OPTIONS` behavior unchanged. The inline rationale at `stream-relay.ts:123` reflects reality.

- [x] A single rate-limited `sendMessageDraft` call produces exactly one `Rate limited` warn line and one `bot_telegram_api_errors_total{method="sendMessageDraft", error_code="429"}` increment — not the current up-to-five
- [x] A rate-limited `sendMessage` (and any other method targeting a chat) still retries per `AUTO_RETRY_OPTIONS.maxRetryAttempts = 5` after waiting `retry_after`
- [x] The fire-and-forget call path at `bot/src/stream-relay.ts:174-182` is unchanged — no caller change required
- [x] The misleading inline comment at `bot/src/stream-relay.ts:123` is replaced with one that accurately describes the per-chat rate-limit reality and references issue #117 (or the autoRetry-skip behavior) so the constant's value is understandable to a future reader
- [x] `AUTO_RETRY_OPTIONS` continues to be exported with the same shape (existing tests in `__tests__/telegram-bot.test.ts` assert this)
- [x] Add tests covering: (a) `sendMessageDraft` bypasses retry (calls `prev` exactly once when result is 429 with `retry_after`), (b) `sendMessage` still retries on 429, (c) `sendChatAction` still retries on 429
- [x] Verify existing tests pass
Loading