20 changes: 20 additions & 0 deletions README.md
@@ -252,6 +252,26 @@ Designed for developers and end users, providing complete UI interaction and com
- [Deployment Guide](https://ultrarag.openbmb.cn/pages/en/ui/prepare): Detailed production-deployment tutorials covering the setup of the Retriever, the Generation model (LLM), and the Milvus vector database.
- [Deep Research](https://ultrarag.openbmb.cn/pages/en/demo/deepresearch): Flagship case: deploy a Deep Research Pipeline. Combined with the AgentCPM-Report model, it automatically performs multi-step retrieval and synthesis to generate survey reports tens of thousands of words long.

### ☁️ Supported Cloud LLM Backends

The Generation server supports multiple backends out of the box:

| Backend | Description |
|---------|-------------|
| `vllm` | Local inference via [vLLM](https://github.com/vllm-project/vllm) |
| `openai` | OpenAI API (or any OpenAI-compatible endpoint) |
| `minimax` | [MiniMax](https://www.minimaxi.com) cloud API (M2.7, M2.5 series) |
| `hf` | Local inference via HuggingFace Transformers |

To use MiniMax as the generation backend, set `backend: minimax` in your parameter file and provide your API key:

```shell
export MINIMAX_API_KEY="your-api-key"
ultrarag run examples/minimax_rag.yaml
```

See [`examples/parameter/minimax_generation_parameter.yaml`](examples/parameter/minimax_generation_parameter.yaml) for full configuration options.
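Because the `minimax` backend speaks the OpenAI-compatible protocol, you can also probe the endpoint directly with the `openai` SDK before wiring it into a pipeline. A minimal sketch (model name and base URL as in the table above; `resolve_api_key` is a hypothetical helper mirroring the server's env-var fallback order):

```python
import os

def resolve_api_key(configured: str = "") -> str:
    """Look up the key the way the generation server does:
    explicit config value first, then MINIMAX_API_KEY, then LLM_API_KEY."""
    return (
        configured
        or os.environ.get("MINIMAX_API_KEY", "")
        or os.environ.get("LLM_API_KEY", "")
    )

if __name__ == "__main__":
    from openai import OpenAI  # pip install openai

    client = OpenAI(base_url="https://api.minimax.io/v1", api_key=resolve_api_key())
    resp = client.chat.completions.create(
        model="MiniMax-M2.7",
        messages=[{"role": "user", "content": "Say hello."}],
        temperature=0.7,
    )
    print(resp.choices[0].message.content)
```

The same key-resolution order applies when the server runs inside a pipeline, so a key exported once in the shell works for both.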

## 🤝 Contributing

Thanks to the following contributors for their code submissions and testing. We also welcome new members to join us in collectively building a comprehensive RAG ecosystem!
20 changes: 20 additions & 0 deletions docs/README_zh.md
@@ -250,6 +250,26 @@ Hello, UltraRAG v3!
- [Deployment Guide](https://ultrarag.openbmb.cn/pages/cn/ui/prepare): Detailed production-deployment tutorials covering the setup of the Retriever, the generation model (LLM), and the Milvus vector database.
- [Deep Research](https://ultrarag.openbmb.cn/pages/cn/demo/deepresearch): Flagship case: deploy a Deep Research Pipeline. Combined with the AgentCPM-Report model, it automatically performs multi-step retrieval and synthesis to generate survey reports tens of thousands of words long.

### ☁️ Supported Cloud LLM Backends

The Generation server supports multiple backends out of the box:

| Backend | Description |
|---------|-------------|
| `vllm` | Local inference via [vLLM](https://github.com/vllm-project/vllm) |
| `openai` | OpenAI API (or any OpenAI-compatible endpoint) |
| `minimax` | [MiniMax](https://www.minimaxi.com) cloud API (M2.7, M2.5 series) |
| `hf` | Local inference via HuggingFace Transformers |

To use MiniMax as the generation backend, set `backend: minimax` in your parameter file and provide your API key:

```shell
export MINIMAX_API_KEY="your-api-key"
ultrarag run examples/minimax_rag.yaml
```

See [`examples/parameter/minimax_generation_parameter.yaml`](../examples/parameter/minimax_generation_parameter.yaml) for full configuration options.

## 🤝 Contributing

Thanks to the following contributors for their code submissions and testing. We also welcome new members to join us in collectively building a comprehensive RAG ecosystem!
29 changes: 29 additions & 0 deletions examples/minimax_rag.yaml
@@ -0,0 +1,29 @@
# RAG Demo using MiniMax as the generation backend
#
# MiniMax provides OpenAI-compatible cloud LLM APIs.
# Available models:
# - MiniMax-M2.7 (latest, 1M context)
# - MiniMax-M2.7-highspeed (fast variant)
# - MiniMax-M2.5 (previous generation)
# - MiniMax-M2.5-highspeed (204K context, fast)
#
# Set the MINIMAX_API_KEY environment variable before running:
# export MINIMAX_API_KEY="your-api-key"

# MCP Server
servers:
  benchmark: servers/benchmark
  retriever: servers/retriever
  prompt: servers/prompt
  generation: servers/generation
  evaluation: servers/evaluation

# MCP Client Pipeline
pipeline:
  - benchmark.get_data
  - retriever.retriever_init
  - retriever.retriever_search
  - generation.generation_init
  - prompt.qa_rag_boxed
  - generation.generate
  - evaluation.evaluate
23 changes: 23 additions & 0 deletions examples/parameter/minimax_generation_parameter.yaml
@@ -0,0 +1,23 @@
# Generation parameters for MiniMax backend
#
# Set the MINIMAX_API_KEY environment variable:
# export MINIMAX_API_KEY="your-api-key"

generation:
  backend: minimax
  backend_configs:
    minimax:
      model_name: MiniMax-M2.7
      base_url: https://api.minimax.io/v1
      api_key: ""  # or set MINIMAX_API_KEY env var
      concurrency: 4
      retries: 3
      base_delay: 1.0
      strip_think_tags: true

  sampling_params:
    temperature: 0.7
    top_p: 0.8
    max_tokens: 2048

  system_prompt: ""
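To sanity-check a parameter file like the one above, load it and read back the active backend's settings; a minimal sketch, assuming PyYAML is available (the inline string mirrors the file's structure; `active_backend_config` is a hypothetical helper, not part of UltraRAG):

```python
import yaml

PARAMS = """
generation:
  backend: minimax
  backend_configs:
    minimax:
      model_name: MiniMax-M2.7
      base_url: https://api.minimax.io/v1
      concurrency: 4
      strip_think_tags: true
"""

def active_backend_config(text: str):
    """Return (backend_name, its config dict) from a parameter file."""
    gen = yaml.safe_load(text)["generation"]
    backend = gen["backend"]
    return backend, gen["backend_configs"][backend]

backend, cfg = active_backend_config(PARAMS)
print(backend, cfg["model_name"])  # -> minimax MiniMax-M2.7
```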
10 changes: 9 additions & 1 deletion servers/generation/parameter.yaml
@@ -1,6 +1,6 @@
# servers/generation/parameter.yaml

backend: vllm # options: vllm, openai
backend: vllm # options: vllm, openai, minimax, hf
backend_configs:
  vllm:
    model_name_or_path: openbmb/MiniCPM4-8B
@@ -15,6 +15,14 @@ backend_configs:
    concurrency: 8
    retries: 3
    base_delay: 1.0
  minimax:
    model_name: MiniMax-M2.7  # or MiniMax-M2.7-highspeed, MiniMax-M2.5, MiniMax-M2.5-highspeed
    base_url: https://api.minimax.io/v1
    api_key: ""  # or set MINIMAX_API_KEY env var
    concurrency: 4
    retries: 3
    base_delay: 1.0
    strip_think_tags: true  # strip <think>...</think> from model output
  hf:
    model_name_or_path: openbmb/MiniCPM4-8B
    gpu_ids: '2,3'
154 changes: 152 additions & 2 deletions servers/generation/src/generation.py
@@ -148,20 +148,57 @@ def _to_data_url(self, path_or_url: str) -> str:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        return f"data:{mime};base64,{b64}"

    @staticmethod
    def _clamp_temperature(
        sampling_params: Dict[str, Any], low: float = 0.01, high: float = 1.0
    ) -> Dict[str, Any]:
        """Clamp temperature in sampling_params to [low, high].

        Args:
            sampling_params: Sampling parameters dict (may contain 'temperature')
            low: Minimum allowed temperature
            high: Maximum allowed temperature

        Returns:
            Updated sampling_params dict with clamped temperature
        """
        params = dict(sampling_params)
        if "temperature" in params:
            raw = float(params["temperature"])
            params["temperature"] = max(low, min(high, raw))
        return params

    @staticmethod
    def _strip_think_tags(text: str) -> str:
        """Strip ``<think>...</think>`` blocks from model output.

        MiniMax M2.5/M2.7 models may include internal reasoning wrapped in
        ``<think>`` tags. This helper removes them so only the final answer
        is returned to callers.

        Args:
            text: Raw model output

        Returns:
            Text with think blocks removed
        """
        import re

        return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
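Standalone, the stripping behavior can be exercised like this (same regex as the helper above, shown as a free function for illustration):

```python
import re

def strip_think_tags(text: str) -> str:
    # Remove <think>...</think> reasoning blocks; DOTALL lets them span newlines.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

raw = "<think>Check retrieved passages.\nParis looks right.</think>The capital of France is Paris."
print(strip_think_tags(raw))  # -> The capital of France is Paris.
```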

    def generation_init(
        self,
        backend_configs: Dict[str, Any],
        sampling_params: Dict[str, Any],
        extra_params: Optional[Dict[str, Any]] = None,
        backend: str = "vllm",
    ) -> None:
        """Initialize generation backend (vllm, openai, or hf).
        """Initialize generation backend (vllm, openai, minimax, or hf).

        Args:
            backend_configs: Dictionary of backend-specific configurations
            sampling_params: Sampling parameters for generation
            extra_params: Optional extra parameters (e.g., chat_template_kwargs)
            backend: Backend name ("vllm", "openai", or "hf")
            backend: Backend name ("vllm", "openai", "minimax", or "hf")

        Raises:
            ImportError: If required dependencies are not installed
@@ -234,6 +271,39 @@ def generation_init(
            self._retries = int(cfg.get("retries", 3))
            self._base_delay = float(cfg.get("base_delay", 1.0))

        elif self.backend == "minimax":
            self.model_name = cfg.get("model_name") or "MiniMax-M2.7"

            base_url = cfg.get("base_url") or "https://api.minimax.io/v1"

            api_key = (
                cfg.get("api_key")
                or os.environ.get("MINIMAX_API_KEY")
                or os.environ.get("LLM_API_KEY")
            )
            if not api_key:
                error_msg = (
                    "api_key is required for minimax backend. "
                    "Set it in backend_configs or via the MINIMAX_API_KEY "
                    "environment variable."
                )
                app.logger.error(error_msg)
                raise ValueError(error_msg)

            self.client = AsyncOpenAI(base_url=base_url, api_key=api_key)

            # Clamp temperature to MiniMax's accepted range (0, 1].
            sampling_params = self._clamp_temperature(sampling_params, 0.01, 1.0)

            if extra_params:
                sampling_params["extra_body"] = extra_params
            self.sampling_params = sampling_params

            self._max_concurrency = int(cfg.get("concurrency", 1))
            self._retries = int(cfg.get("retries", 3))
            self._base_delay = float(cfg.get("base_delay", 1.0))
            self._strip_think = bool(cfg.get("strip_think_tags", True))

        elif self.backend == "hf":
            try:
                from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -386,6 +456,86 @@ async def call_with_retry(
                idx, ans = await coro
                ret[idx] = ans

        elif self.backend == "minimax":
            sem = asyncio.Semaphore(self._max_concurrency)

            async def call_minimax(
                idx,
                msg,
                client,
                model_name,
                sampling_params,
                retries: int,
                base_delay: float,
            ):
                import random
                from openai import AuthenticationError, RateLimitError, APIStatusError

                delay = base_delay
                for attempt in range(retries):
                    try:
                        async with sem:
                            resp = await client.chat.completions.create(
                                model=model_name,
                                messages=msg,
                                **sampling_params,
                            )
                        text = resp.choices[0].message.content or ""
                        if getattr(self, "_strip_think", True):
                            text = self._strip_think_tags(text)
                        return idx, text
                    except AuthenticationError as e:
                        error_msg = (
                            f"[{e.status_code}] Unauthorized: Invalid or missing MINIMAX_API_KEY."
                        )
                        app.logger.error(error_msg)
                        raise ToolError(error_msg)
                    except RateLimitError as e:
                        # Rate limits are transient: log and fall through to the
                        # backoff sleep below instead of failing immediately.
                        warn_msg = f"[{e.status_code}] MiniMax rate limited (idx={idx}, attempt={attempt+1}): {e}"
                        app.logger.warning(warn_msg)
                    except APIStatusError as e:
                        if e.status_code >= 500:
                            # Server-side errors are retried with backoff.
                            warn_msg = f"[{e.status_code}] MiniMax server error (idx={idx}, attempt={attempt+1}): {e}"
                            app.logger.warning(warn_msg)
                        else:
                            error_msg = f"[{e.status_code}] MiniMax API error (idx={idx}, attempt={attempt+1}): {e}"
                            app.logger.error(error_msg)
                            raise ToolError(error_msg)
                    except Exception as e:
                        error_msg = f"[Retry {attempt+1}] MiniMax failed (idx={idx}): {e}"
                        app.logger.error(error_msg)
                        raise ToolError(error_msg)

                    await asyncio.sleep(delay + random.random() * 0.25)
                    delay *= 2

                return idx, "<error>"
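The backoff schedule in the loop above (double the delay each failed attempt, plus up to 0.25 s of jitter) can be sketched in isolation; `backoff_delays` is a hypothetical helper for illustration, not part of the server:

```python
import random

def backoff_delays(retries: int, base_delay: float, jitter: float = 0.25, seed=None):
    """Yield the sleep durations the retry loop would use on repeated failures:
    exponential doubling plus a small random jitter."""
    rng = random.Random(seed)
    delay = base_delay
    for _ in range(retries):
        yield delay + rng.random() * jitter
        delay *= 2

delays = list(backoff_delays(retries=3, base_delay=1.0, seed=0))
print([round(d, 2) for d in delays])
```

With `retries: 3` and `base_delay: 1.0` from the config, the waits fall in roughly the 1 s, 2 s, 4 s bands.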

            tasks = [
                asyncio.create_task(
                    call_minimax(
                        idx,
                        msg,
                        self.client,
                        self.model_name,
                        self.sampling_params,
                        retries=getattr(self, "_retries", 3),
                        base_delay=getattr(self, "_base_delay", 1.0),
                    )
                )
                for idx, msg in enumerate(msg_ls)
            ]
            ret = [None] * len(msg_ls)

            for coro in tqdm(
                asyncio.as_completed(tasks),
                total=len(tasks),
                desc="MiniMax Generating: ",
            ):
                idx, ans = await coro
                ret[idx] = ans

        elif self.backend == "hf":
            prompt_txt_ls: List[str] = []
            for msg in msg_ls:
Empty file added tests/__init__.py