fitz123 · fitz123 · May 27, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/README.md b/README.md
@@ -335,7 +335,7 @@ See [bot/src/metrics.ts](bot/src/metrics.ts) for the full list of exported metri
 
 #### Telegram API metrics
 
-Two complementary counters track outbound Telegram API traffic. Both increment per-attempt (the inner transformer runs once per autoRetry attempt), so `rate(errors) / rate(calls)` over the same window yields the attempt-level error ratio.
+Two complementary counters track outbound Telegram API traffic. Both increment per-attempt (the inner transformer runs once per autoRetry attempt), so `rate(errors) / rate(calls)` over the same window yields the attempt-level error ratio. Exception: `sendMessageDraft` is excluded from autoRetry (see issue #117), so its counters reflect one increment per logical call — attempt-level and call-level rates coincide for that method.
 
 | Metric | Labels | Description |
 |--------|--------|-------------|

diff --git a/bot/src/__tests__/telegram-bot.test.ts b/bot/src/__tests__/telegram-bot.test.ts
@@ -1,7 +1,7 @@
 process.env.TZ = "UTC";
 import { describe, it, beforeEach } from "node:test";
 import assert from "node:assert/strict";
-import { resolveBinding, isAuthorized, sessionKey, isImageMimeType, imageExtensionForMime, buildSourcePrefix, shouldRespondInGroup, BOT_COMMANDS, isStaleMessage, buildReplyContext, buildForwardContext, extensionForDocument, formatFileSize, formatDocumentMeta, buildReactionContext, AUTO_RETRY_OPTIONS, extractMediaInfo, extensionForMedia, formatMediaMeta, createTelegramBot, extractChatContext, formatChatContextForLog, createApiErrorLoggingTransformer, resolveBindingLabel, BINDING_LABEL_NONE, BINDING_LABEL_UNBOUND } from "../telegram-bot.js";
+import { resolveBinding, isAuthorized, sessionKey, isImageMimeType, imageExtensionForMime, buildSourcePrefix, shouldRespondInGroup, BOT_COMMANDS, isStaleMessage, buildReplyContext, buildForwardContext, extensionForDocument, formatFileSize, formatDocumentMeta, buildReactionContext, AUTO_RETRY_OPTIONS, createDraftSkipAutoRetryTransformer, extractMediaInfo, extensionForMedia, formatMediaMeta, createTelegramBot, extractChatContext, formatChatContextForLog, createApiErrorLoggingTransformer, resolveBindingLabel, BINDING_LABEL_NONE, BINDING_LABEL_UNBOUND } from "../telegram-bot.js";
 import client from "prom-client";
 import { telegramApiCalls, telegramApiErrors } from "../metrics.js";
 import type { TelegramBinding, BotConfig } from "../types.js";
@@ -1084,6 +1084,50 @@ describe("AUTO_RETRY_OPTIONS", () => {
   });
 });
 
+describe("createDraftSkipAutoRetryTransformer", () => {
+  it("bypasses autoRetry for sendMessageDraft — calls prev exactly once on 429", async () => {
+    const transformer = createDraftSkipAutoRetryTransformer();
+    let callCount = 0;
+    const prev = async () => {
+      callCount++;
+      return { ok: false, error_code: 429, parameters: { retry_after: 3 } } as const;
+    };
+    const result = await transformer(prev as never, "sendMessageDraft", { chat_id: 555000111, draft_id: 1, text: "x" } as never);
+    assert.strictEqual(callCount, 1, "sendMessageDraft must not be retried");
+    assert.strictEqual((result as { ok: boolean }).ok, false);
+  });
+
+  it("retries sendMessage on 429 via autoRetry", async () => {
+    const transformer = createDraftSkipAutoRetryTransformer();
+    let callCount = 0;
+    const prev = async () => {
+      callCount++;
+      if (callCount === 1) {
+        return { ok: false, error_code: 429, parameters: { retry_after: 0 } } as const;
+      }
+      return { ok: true, result: { message_id: 1 } } as const;
+    };
+    const result = await transformer(prev as never, "sendMessage", { chat_id: 555000111, text: "x" } as never);
+    assert.strictEqual(callCount, 2, "sendMessage must retry once after 429");
+    assert.strictEqual((result as { ok: boolean }).ok, true);
+  });
+
+  it("retries sendChatAction on 429 via autoRetry", async () => {
+    const transformer = createDraftSkipAutoRetryTransformer();
+    let callCount = 0;
+    const prev = async () => {
+      callCount++;
+      if (callCount === 1) {
+        return { ok: false, error_code: 429, parameters: { retry_after: 0 } } as const;
+      }
+      return { ok: true, result: true } as const;
+    };
+    const result = await transformer(prev as never, "sendChatAction", { chat_id: 555000111, action: "typing" } as never);
+    assert.strictEqual(callCount, 2, "sendChatAction must retry once after 429");
+    assert.strictEqual((result as { ok: boolean }).ok, true);
+  });
+});
+
 describe("extractMediaInfo", () => {
   it("extracts video info", () => {
     const msg = { video: { file_id: "vid1", file_name: "clip.mp4", mime_type: "video/mp4", file_size: 5000 } };

diff --git a/bot/src/stream-relay.ts b/bot/src/stream-relay.ts
@@ -120,7 +120,12 @@ export async function sendOutboxFiles(outboxPath: string, platform: PlatformCont
   }
 }
 
-/** Debounce interval for draft updates (ms). Drafts are cosmetic — no rate limits. */
+/**
+ * Debounce interval for draft updates (ms). Drafts ARE subject to Telegram's
+ * per-chat ~1/sec rate limit (issue #117). Retries for sendMessageDraft are
+ * suppressed via createDraftSkipAutoRetryTransformer in telegram-bot.ts, so a
+ * 429 here surfaces once and is swallowed by the fire-and-forget caller.
+ */
 const DRAFT_DEBOUNCE_MS = 300;
 
 /** Max time (ms) to wait for in-flight drafts before final delivery. */

diff --git a/bot/src/telegram-bot.ts b/bot/src/telegram-bot.ts
@@ -593,6 +593,25 @@ export const AUTO_RETRY_OPTIONS = {
   rethrowHttpErrors: false,
 } as const;
 
+/**
+ * Build a transformer that runs autoRetry for every Telegram API method EXCEPT
+ * `sendMessageDraft`. Drafts are cosmetic fire-and-forget calls (see
+ * stream-relay.ts) — a retry that fires after Telegram's 3-10s retry_after is
+ * stale by the time it lands (the stream has produced newer text), and 5x
+ * amplification turns one rate-limited draft into five log/metric increments.
+ * Every other method retains the full AUTO_RETRY_OPTIONS retry behavior.
+ * See issue #117.
+ */
+export function createDraftSkipAutoRetryTransformer(): Transformer {
+  const retry = autoRetry(AUTO_RETRY_OPTIONS);
+  return async (prev, method, payload, signal) => {
+    if (method === "sendMessageDraft") {
+      return prev(method, payload, signal);
+    }
+    return retry(prev, method, payload, signal);
+  };
+}
+
 /**
  * Create and configure the Telegram bot.
  */
@@ -612,8 +631,10 @@ export function createTelegramBot(
   // before autoRetry decides whether to retry)
   bot.api.config.use(createApiErrorLoggingTransformer({ bindings: config.bindings }));
 
-  // Auto-retry on rate limits (outermost transformer — retries after inner errors)
-  bot.api.config.use(autoRetry(AUTO_RETRY_OPTIONS));
+  // Auto-retry on rate limits (outermost transformer — retries after inner
+  // errors). `sendMessageDraft` is excluded: drafts are cosmetic fire-and-
+  // forget calls; retries amplify 429 log noise without user-visible benefit.
+  bot.api.config.use(createDraftSkipAutoRetryTransformer());
 
   const maxMessageAgeMs = config.sessionDefaults.maxMessageAgeMs;
 

diff --git a/docs/plans/2026-05-26-telegram-draft-autoretry-skip.md b/docs/plans/2026-05-26-telegram-draft-autoretry-skip.md
@@ -0,0 +1,148 @@
+# Stop autoRetry from amplifying rate-limited sendMessageDraft — Round 1
+
+## Goal
+
+Issue #117: streaming replies in DMs produce dense bursts of 429 logs because every rate-limited `sendMessageDraft` call is retried up to 5 times by `@grammyjs/auto-retry`. Drafts are cosmetic (fire-and-forget at the caller, auto-cleared by the final `sendMessage`), so retries add no user-visible value — only log noise and amplified Prometheus error counters. Skip the retry chain for `sendMessageDraft` only, leaving every other method's retry behavior identical.
+
+## Validation Commands
+
+```bash
+cd bot && npx tsc --noEmit && npm test
+```
+
+## Reference: Current transformer chain
+
+`bot/src/telegram-bot.ts:610-616` — two transformers registered on `bot.api.config`, both fire for every API call:
+
+```ts
+  // Log Telegram API errors, especially 429 rate limits, and count every API
+  // call attempt by binding (inner transformer — sees each individual attempt
+  // before autoRetry decides whether to retry)
+  bot.api.config.use(createApiErrorLoggingTransformer({ bindings: config.bindings }));
+
+  // Auto-retry on rate limits (outermost transformer — retries after inner errors)
+  bot.api.config.use(autoRetry(AUTO_RETRY_OPTIONS));
+```
+
+`bot/src/telegram-bot.ts:589-594` — exported options that govern every retried method:
+
+```ts
+/** autoRetry options — exported so tests can assert the rethrowHttpErrors value. */
+export const AUTO_RETRY_OPTIONS = {
+  maxRetryAttempts: 5,
+  maxDelaySeconds: 60,
+  rethrowHttpErrors: false,
+} as const;
+```
+
+There is no per-method filter at registration time.
+
+## Reference: @grammyjs/auto-retry has no per-method filter
+
+`bot/node_modules/@grammyjs/auto-retry/out/mod.js:34-39` — the only options exposed:
+
+```js
+function autoRetry(options) {
+    const maxDelay = options?.maxDelaySeconds ?? Infinity;
+    const maxRetries = options?.maxRetryAttempts ?? Infinity;
+    const rethrowInternalServerErrors = options?.rethrowInternalServerErrors ?? false;
+    const rethrowHttpErrors = options?.rethrowHttpErrors ?? false;
+```
+
+The returned transformer is a flat closure: `async (prev, method, payload, signal) => ...`. It does not branch on `method`. Filtering must therefore happen one layer up — by either skipping `autoRetry`'s transformer for matching methods, or by wrapping a method-name guard around it.
+
+## Reference: Drafts are fire-and-forget at the caller
+
+`bot/src/stream-relay.ts:174-182`:
+
+```ts
+const sendDraftNow = () => {
+  if (!accumulated) return;
+  const collapsed = collapseNewlines(accumulated);
+  const displayText = collapsed.length > platform.maxMessageLength
+    ? collapsed.slice(0, platform.maxMessageLength - 3) + "..."
+    : collapsed;
+  draftPromises.push(platform.sendDraft(draftId, displayText).catch(() => {}));
+  lastDraftTime = Date.now();
+};
+```
+
+`bot/src/telegram-adapter.ts:60-71`:
+
+```ts
+async sendDraft(draftId: number, text: string): Promise<void> {
+  if (!chatId || !isDm) return;
+  const html = markdownToHtml(text);
+  try {
+    await ctx.api.sendMessageDraft(chatId, draftId, html, {
+      parse_mode: "HTML",
+      ...threadOpts,
+    });
+  } catch {
+    // Draft failures are cosmetic — silently ignore
+  }
+}
+```
+
+Drafts have no recovery semantics — a 429 retry that arrives 3-10 seconds late would land on a stream that has already moved on. The final `sendMessage` (which IS retried) is what guarantees delivery.
+
+## Reference: Misleading comment that motivated the original 300ms debounce
+
+`bot/src/stream-relay.ts:123-124`:
+
+```ts
+/** Debounce interval for draft updates (ms). Drafts are cosmetic — no rate limits. */
+const DRAFT_DEBOUNCE_MS = 300;
+```
+
+The "no rate limits" claim is contradicted by empirical observation (issue #117 — drafts hit Telegram's per-chat ~1/sec limit in DMs).
+
+## Reference: Live evidence of the cascade
+
+Anonymized burst from the bot's stderr log (single user-message in a DM, post-observability deploy):
+
+```
+2026-05-25T15:15:07.093Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=4
+2026-05-25T15:15:07.434Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=4
+2026-05-25T15:15:07.967Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
+2026-05-25T15:15:08.415Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
+2026-05-25T15:15:08.981Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
+2026-05-25T15:15:09.472Z WARN [telegram-api] Rate limited: method=sendMessageDraft chat_id=<DM> retry_after=3
+```
+
+Cumulative rate-limit counts since the observability deploy (~10 days):
+
+```
+1109 sendMessageDraft
+ 180 sendChatAction
+```
+
+`sendMessageDraft` accounts for ~86% of all rate-limit log entries. With `maxRetryAttempts: 5`, the underlying number of distinct user-streaming events triggering 429s is roughly 1/5th of that — the rest is amplification.
+
+## Reference: Reviewer convergence (issue #117 comment)
+
+Two independent reviewers (codex CLI + a fresh Opus session reading source separately) both proposed the same shape of fix, captured in #117's review comment: a thin wrapper transformer registered in place of bare `autoRetry`, gated on `method === "sendMessageDraft"`. They diverged on whether Option A (raise `DRAFT_DEBOUNCE_MS`) is also necessary — that decision is explicitly out of scope here; we ship the autoRetry skip first and measure residual draft 429 ratio against the new `bot_telegram_api_calls_total` counter before deciding.
+
+## Reference: Existing test patterns for transformers
+
+`bot/src/__tests__/telegram-bot.test.ts` already has the `createApiErrorLoggingTransformer` test block with `captureWarn`, mock `prev` returning `{ ok: false, error_code: 429, parameters: { retry_after: 3 } }`, and explicit invocation as `await transformer(prev as never, "sendMessageDraft", { chat_id: 555000111, draft_id: 1, text: "x" })`. The same shape extends naturally to verifying that the new wrapper does or does not invoke `autoRetry`'s inner transformer based on `method`.
+
+## Tasks
+
+### Task 1: Skip autoRetry for sendMessageDraft (telegram-draft-autoretry-skip, P1)
+
+**Problem:** Every rate-limited `sendMessageDraft` call to a DM hits autoRetry, which awaits Telegram's `retry_after` (3-10 s) and retries up to 5 times. By the time a retry fires, the stream has produced newer accumulated text — the retry's payload is stale. Meanwhile the new draft, fired ~300 ms after the previous one, queues behind the same rate limit. The result is documented in #117: a single ~10-20 s streaming reply produces 60-80 `[telegram-api]` warn lines and 60-80 increments on `bot_telegram_api_errors_total{method="sendMessageDraft", error_code="429"}`, even though the user observes a single successful final `sendMessage`. Cumulative since observability landed: 1109 `sendMessageDraft` 429 lines vs ~180 for the next-noisiest method.
+
+`@grammyjs/auto-retry` exposes no per-method filter (see Reference section); filtering must happen at the transformer-chain layer. The existing two-layer chain (inner = logging/counter, outer = autoRetry) is in `bot/src/telegram-bot.ts:610-616`.
+
+The misleading comment at `bot/src/stream-relay.ts:123` ("Drafts are cosmetic — no rate limits") is the assumption that originally set `DRAFT_DEBOUNCE_MS = 300`. It needs to be replaced with an accurate statement so a future contributor doesn't recreate the same regression.
+
+**What we want:** Outbound `sendMessageDraft` calls go through the API once and surface their result (success or 429) to the caller without any retry-induced amplification. Every other Telegram API method (`sendMessage`, `sendChatAction`, `editMessageText`, `deleteMessage`, etc.) retains the current `AUTO_RETRY_OPTIONS` behavior unchanged. The inline rationale at `stream-relay.ts:123` reflects reality.
+
+- [x] A single rate-limited `sendMessageDraft` call produces exactly one `Rate limited` warn line and one `bot_telegram_api_errors_total{method="sendMessageDraft", error_code="429"}` increment — not the current up-to-five
+- [x] A rate-limited `sendMessage` (and any other method targeting a chat) still retries per `AUTO_RETRY_OPTIONS.maxRetryAttempts = 5` after waiting `retry_after`
+- [x] The fire-and-forget call path at `bot/src/stream-relay.ts:174-182` is unchanged — no caller change required
+- [x] The misleading inline comment at `bot/src/stream-relay.ts:123` is replaced with one that accurately describes the per-chat rate-limit reality and references issue #117 (or the autoRetry-skip behavior) so the constant's value is understandable to a future reader
+- [x] `AUTO_RETRY_OPTIONS` continues to be exported with the same shape (existing tests in `__tests__/telegram-bot.test.ts` assert this)
+- [x] Add tests covering: (a) `sendMessageDraft` bypasses retry (calls `prev` exactly once when result is 429 with `retry_after`), (b) `sendMessage` still retries on 429, (c) `sendChatAction` still retries on 429
+- [x] Verify existing tests pass