2 changes: 2 additions & 0 deletions chatmock/app.py
@@ -13,6 +13,7 @@
def create_app(
verbose: bool = False,
verbose_obfuscation: bool = False,
client_compat: str = "default",
reasoning_effort: str = "medium",
reasoning_summary: str = "auto",
reasoning_compat: str = "think-tags",
@@ -26,6 +27,7 @@ def create_app(
app.config.update(
VERBOSE=bool(verbose),
VERBOSE_OBFUSCATION=bool(verbose_obfuscation),
CLIENT_COMPAT=client_compat,
REASONING_EFFORT=reasoning_effort,
REASONING_SUMMARY=reasoning_summary,
REASONING_COMPAT=reasoning_compat,
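A minimal usage sketch of the new flag, assuming create_app is importable as shown and the package's other defaults are satisfied; the assertion just restates what this hunk stores into Flask config:

    # Hedged sketch: construct the app with VS Code compatibility enabled and
    # confirm the value lands in Flask's config under CLIENT_COMPAT.
    from chatmock.app import create_app

    app = create_app(client_compat="vscode")
    assert app.config["CLIENT_COMPAT"] == "vscode"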
9 changes: 9 additions & 0 deletions chatmock/cli.py
@@ -264,6 +264,7 @@ def cmd_serve(
port: int,
verbose: bool,
verbose_obfuscation: bool,
client_compat: str,
reasoning_effort: str,
reasoning_summary: str,
reasoning_compat: str,
@@ -275,6 +276,7 @@
app = create_app(
verbose=verbose,
verbose_obfuscation=verbose_obfuscation,
client_compat=client_compat,
reasoning_effort=reasoning_effort,
reasoning_summary=reasoning_summary,
reasoning_compat=reasoning_compat,
@@ -300,6 +302,12 @@ def main() -> None:
p_serve.add_argument("--host", default="127.0.0.1")
p_serve.add_argument("--port", type=int, default=8000)
p_serve.add_argument("--verbose", action="store_true", help="Enable verbose logging")
p_serve.add_argument(
"--client-compat",
choices=["default", "vscode"],
default="default",
help="Startup compatibility mode for client-specific behavior (default: default)",
)
p_serve.add_argument(
"--verbose-obfuscation",
action="store_true",
@@ -371,6 +379,7 @@ def main() -> None:
port=args.port,
verbose=args.verbose,
verbose_obfuscation=args.verbose_obfuscation,
client_compat=args.client_compat,
reasoning_effort=args.reasoning_effort,
reasoning_summary=args.reasoning_summary,
reasoning_compat=args.reasoning_compat,
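To illustrate the choices restriction added above, a self-contained argparse sketch mirroring the new option (the parser name and surrounding subcommand wiring are simplified assumptions):

    import argparse

    # Mirrors the --client-compat option registered on p_serve above.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--client-compat",
        choices=["default", "vscode"],
        default="default",
    )

    args = parser.parse_args(["--client-compat", "vscode"])
    assert args.client_compat == "vscode"

    # Any other value is rejected by argparse itself, before cmd_serve runs:
    # parser.parse_args(["--client-compat", "cursor"])  # raises SystemExit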
4 changes: 4 additions & 0 deletions chatmock/model_registry.py
@@ -15,6 +15,7 @@ class ModelSpec:
aliases: tuple[str, ...]
allowed_efforts: frozenset[str]
variant_efforts: tuple[str, ...]
public_aliases: tuple[str, ...] = ()
uses_codex_instructions: bool = False


@@ -126,6 +127,8 @@ class ModelSpec:
_ALIASES[_spec.public_id] = _spec.upstream_id
for _alias in _spec.aliases:
_ALIASES[_alias] = _spec.upstream_id
for _public_alias in _spec.public_aliases:
_ALIASES[_public_alias] = _spec.upstream_id


def _strip_model_name(model: str | None) -> tuple[str, str | None]:
@@ -189,6 +192,7 @@ def list_public_models(expose_reasoning_models: bool = False) -> list[str]:
model_ids: list[str] = []
for spec in _MODEL_SPECS:
model_ids.append(spec.public_id)
model_ids.extend(spec.public_aliases)
if expose_reasoning_models:
model_ids.extend(f"{spec.public_id}-{effort}" for effort in spec.variant_efforts)
return model_ids
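A rough sketch of what public_aliases adds, using a hypothetical spec; the field values, and any ModelSpec fields not visible in this diff, are invented for illustration:

    # Hypothetical spec; real registry entries and the full field list may differ.
    spec = ModelSpec(
        public_id="gpt-5",
        upstream_id="gpt-5",
        aliases=("gpt5",),                 # internal aliases: resolve but stay hidden
        allowed_efforts=frozenset({"low", "medium", "high"}),
        variant_efforts=("low", "high"),
        public_aliases=("gpt-5-latest",),  # new: resolves AND is listed publicly
    )
    # Registration maps "gpt-5", "gpt5", and "gpt-5-latest" to the upstream id,
    # and list_public_models() now advertises "gpt-5-latest" alongside "gpt-5".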
59 changes: 56 additions & 3 deletions chatmock/responses_api.py
@@ -2,10 +2,13 @@

import json
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List
from typing import Any, Callable, Dict, Iterable, Iterator, List

from flask import Response, jsonify, make_response

from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .fast_mode import ServiceTierResolution, resolve_service_tier
from .http import build_cors_headers
from .model_registry import (
allowed_efforts_for_model,
extract_reasoning_from_model_name,
@@ -14,6 +17,7 @@
)
from .reasoning import build_reasoning_param
from .session import ensure_session_id
from .utils import convert_tools_chat_to_responses


@dataclass(frozen=True)
@@ -35,6 +39,40 @@ class NormalizedResponsesRequest:
service_tier_resolution: ServiceTierResolution


def is_vscode_client_compat(config: Dict[str, Any]) -> bool:
return str(config.get("CLIENT_COMPAT") or "default").strip().lower() == "vscode"


def client_compat_error_response(
feature_name: str,
route_name: str,
*,
verbose: bool = False,
log_json: Callable[[str, Any], None] | None = None,
) -> Response:
err = {
"error": {
"message": f"{feature_name} on {route_name} is only supported when CLIENT_COMPAT=vscode",
"code": "CLIENT_COMPAT_UNSUPPORTED",
}
}
if verbose and log_json is not None:
log_json(f"OUT POST {route_name}", err)
resp = make_response(jsonify(err), 400)
for key, value in build_cors_headers().items():
resp.headers.setdefault(key, value)
return resp


def _uses_chat_completions_tool_schema(tools: Any) -> bool:
if not isinstance(tools, list):
return False
for tool in tools:
if isinstance(tool, dict) and isinstance(tool.get("function"), dict):
return True
return False


def instructions_for_model(config: Dict[str, Any], model: str) -> str:
base = config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
if uses_codex_instructions(model):
@@ -83,18 +121,28 @@ def normalize_responses_payload(
config: Dict[str, Any],
client_session_id: str | None = None,
) -> NormalizedResponsesRequest:
if not is_vscode_client_compat(config) and _uses_chat_completions_tool_schema(payload.get("tools")):
raise ResponsesRequestError(
"chat.completions tool schema on /v1/responses is only supported when CLIENT_COMPAT=vscode",
code="CLIENT_COMPAT_UNSUPPORTED",
)

requested_model = payload.get("model") if isinstance(payload.get("model"), str) else None
normalized_model = normalize_model_name(requested_model, config.get("DEBUG_MODEL"))

normalized = dict(payload)
normalized["model"] = normalized_model
normalized.pop("max_output_tokens", None)
# The Codex backend behind ChatMock rejects Responses truncation hints,
# so keep accepting the client field but do not forward it upstream.
normalized.pop("truncation", None)

if "input" in normalized:
normalized["input"] = canonicalize_responses_input(normalized.get("input"))

if "store" not in normalized:
normalized["store"] = False
# Copilot/Codex traffic is expected to be non-persistent here and the
# upstream contract rejects stored responses, so always pin this off.
normalized["store"] = False

instructions = normalized.get("instructions")
if not isinstance(instructions, str) or not instructions.strip():
@@ -122,6 +170,11 @@
normalized["include"] = include_list

tools = normalized.get("tools")
converted_tools = convert_tools_chat_to_responses(tools)
if converted_tools:
normalized["tools"] = converted_tools
tools = converted_tools

if (not isinstance(tools, list) or not tools) and bool(config.get("DEFAULT_WEB_SEARCH")):
tool_choice = normalized.get("tool_choice")
if not (isinstance(tool_choice, str) and tool_choice.strip().lower() == "none"):
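As an illustration of the gate above: a chat.completions-style tool payload is rejected unless the server runs in vscode compatibility. The payload shape follows the chat.completions tools convention, and the plain dicts stand in for Flask config:

    from chatmock.responses_api import is_vscode_client_compat

    # chat.completions tool schema: {"type": "function", "function": {...}}.
    payload = {
        "model": "gpt-5",
        "tools": [
            {"type": "function",
             "function": {"name": "lookup", "parameters": {"type": "object"}}}
        ],
    }

    # In default mode, normalize_responses_payload raises
    # ResponsesRequestError(code="CLIENT_COMPAT_UNSUPPORTED") for this payload.
    assert not is_vscode_client_compat({"CLIENT_COMPAT": "default"})

    # In vscode mode the same payload is accepted, and the tools are rewritten
    # through convert_tools_chat_to_responses before being sent upstream.
    assert is_vscode_client_compat({"CLIENT_COMPAT": "vscode"})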
76 changes: 56 additions & 20 deletions chatmock/routes_ollama.py
@@ -3,16 +3,17 @@
import json
import datetime
import time
from typing import Any, Dict, List
from typing import Any, Dict, List, cast

from requests import Response as RequestsResponse
from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context

from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .fast_mode import resolve_service_tier
from .limits import record_rate_limits_from_response
from .http import build_cors_headers
from .model_registry import list_public_models, uses_codex_instructions
from .responses_api import instructions_for_model
from .responses_api import client_compat_error_response, instructions_for_model, is_vscode_client_compat
from .reasoning import (
allowed_efforts_for_model,
build_reasoning_param,
@@ -182,7 +183,7 @@ def ollama_chat() -> Response:
err = {"error": "Invalid JSON body"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
return make_response(jsonify(err), 400)

model = payload.get("model")
raw_messages = payload.get("messages")
@@ -199,6 +200,23 @@
if stream_req is None:
stream_req = True
stream_req = bool(stream_req)

if not is_vscode_client_compat(current_app.config):
if "responses_tools" in payload:
return client_compat_error_response(
"responses_tools",
"/api/chat",
verbose=verbose,
log_json=_log_json,
)
if "responses_tool_choice" in payload:
return client_compat_error_response(
"responses_tool_choice",
"/api/chat",
verbose=verbose,
log_json=_log_json,
)

tools_req = payload.get("tools") if isinstance(payload.get("tools"), list) else []
tools_responses = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
tool_choice = payload.get("tool_choice", "auto")
@@ -216,7 +234,7 @@
err = {"error": "Only web_search/web_search_preview are supported in responses_tools"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
return make_response(jsonify(err), 400)
extra_tools.append(_t)
if not extra_tools and bool(current_app.config.get("DEFAULT_WEB_SEARCH")):
rtc = payload.get("responses_tool_choice")
@@ -233,7 +251,7 @@
err = {"error": "responses_tools too large"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
return make_response(jsonify(err), 400)
had_responses_tools = True
tools_responses = (tools_responses or []) + extra_tools

@@ -245,7 +263,7 @@
err = {"error": "Invalid request format"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
return make_response(jsonify(err), 400)

input_items = convert_chat_messages_to_responses_input(messages)

@@ -263,7 +281,7 @@
err = {"error": service_tier_resolution.error_message}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
return make_response(jsonify(err), 400)
upstream, error_resp = start_upstream_request(
normalized_model,
input_items,
@@ -293,13 +311,18 @@
pass
return error_resp

record_rate_limits_from_response(upstream)
upstream_resp = cast(RequestsResponse, upstream)
record_rate_limits_from_response(upstream_resp)

if upstream.status_code >= 400:
if upstream_resp.status_code >= 400:
try:
err_body = json.loads(upstream.content.decode("utf-8", errors="ignore")) if upstream.content else {"raw": upstream.text}
if upstream_resp.content:
parsed_err_body = json.loads(upstream_resp.content.decode("utf-8", errors="ignore"))
err_body: Dict[str, Any] = parsed_err_body if isinstance(parsed_err_body, dict) else {"raw": parsed_err_body}
else:
err_body = {"raw": upstream_resp.text}
except Exception:
err_body = {"raw": upstream.text}
err_body = {"raw": upstream_resp.text}
if had_responses_tools:
if verbose:
print("[Passthrough] Upstream rejected tools; retrying without extras (args redacted)")
@@ -321,20 +344,33 @@
service_tier=service_tier_resolution.service_tier,
)
record_rate_limits_from_response(upstream2)
if err2 is None and upstream2 is not None and upstream2.status_code < 400:
upstream = upstream2
if err2 is not None:
if verbose:
try:
body = err2.get_data(as_text=True)
if body:
try:
parsed = json.loads(body)
except Exception:
parsed = body
_log_json("OUT POST /api/chat", parsed)
except Exception:
pass
return err2
if upstream2 is not None and upstream2.status_code < 400:
upstream_resp = upstream2
else:
err = {"error": {"message": (err_body.get("error", {}) or {}).get("message", "Upstream error"), "code": "RESPONSES_TOOLS_REJECTED"}}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), (upstream2.status_code if upstream2 is not None else upstream.status_code)
return make_response(jsonify(err), upstream2.status_code if upstream2 is not None else upstream_resp.status_code)
else:
if verbose:
print("/api/chat upstream error status=", upstream.status_code, " body:", json.dumps(err_body)[:2000])
print("/api/chat upstream error status=", upstream_resp.status_code, " body:", json.dumps(err_body)[:2000])
err = {"error": (err_body.get("error", {}) or {}).get("message", "Upstream error")}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), upstream.status_code
return make_response(jsonify(err), upstream_resp.status_code)

created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
model_out = model if isinstance(model, str) and model.strip() else normalized_model
Expand All @@ -348,7 +384,7 @@ def _gen():
pending_summary_paragraph = False
full_parts: List[str] = []
try:
for raw_line in upstream.iter_lines(decode_unicode=False):
for raw_line in upstream_resp.iter_lines(decode_unicode=False):
if not raw_line:
continue
line = raw_line.decode("utf-8", errors="ignore") if isinstance(raw_line, (bytes, bytearray)) else raw_line
Expand Down Expand Up @@ -478,7 +514,7 @@ def _gen():
elif kind == "response.completed":
break
finally:
upstream.close()
upstream_resp.close()
if compat == "think-tags" and think_open and not think_closed:
yield (
json.dumps(
@@ -518,7 +554,7 @@ def _gen():
reasoning_full_text = ""
tool_calls: List[Dict[str, Any]] = []
try:
for raw in upstream.iter_lines(decode_unicode=False):
for raw in upstream_resp.iter_lines(decode_unicode=False):
if not raw:
continue
line = raw.decode("utf-8", errors="ignore") if isinstance(raw, (bytes, bytearray)) else raw
Expand Down Expand Up @@ -557,7 +593,7 @@ def _gen():
elif kind == "response.completed":
break
finally:
upstream.close()
upstream_resp.close()

if (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower() == "think-tags":
rtxt_parts = []
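Finally, a hypothetical smoke test of the /api/chat gate. The endpoint and payload field names come from this diff, but the client code itself is illustrative and assumes a server running locally on the default host and port:

    import requests

    resp = requests.post(
        "http://127.0.0.1:8000/api/chat",
        json={
            "model": "gpt-5",
            "messages": [{"role": "user", "content": "hi"}],
            # Sending responses_tools without --client-compat vscode should
            # yield HTTP 400 with code CLIENT_COMPAT_UNSUPPORTED.
            "responses_tools": [{"type": "web_search"}],
        },
        timeout=30,
    )
    print(resp.status_code, resp.json())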