
Commit 443b4b7

Copilot and windreamer committed
Fix docstrings: replace old-style typing constructs with modern Python 3.10+ equivalents (#2)
* Fix docstrings to Google style format aligned with type hints
* Fix docformatter lint: remove extra blank line before closing triple-quote in api.py
* Replace old-style Dict/List/Optional type references in docstrings
* Fix docstrings to conform with Google Style and use modern Python types
* Fix remaining outdated type hints in docstrings (List/Dict/Tuple/Union/Optional)
* Fix invalid dict() examples in vl/engine.py docstrings - use dict literals with code blocks

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: windreamer <572167+windreamer@users.noreply.github.com>
1 parent b4a3cbc commit 443b4b7

46 files changed

Lines changed: 258 additions & 239 deletions
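
The pattern across all 46 files is the same mechanical rewrite: docstring type references move from the typing-module spellings to the builtin generics and ``|`` unions that Python 3.10+ supports natively (PEP 585/604). A representative before/after on a hypothetical helper, for illustration only:

    # Before: old-style typing spellings in the docstring
    def lookup(key):
        """Find cached entries.

        Args:
            key (Union[str, int]): Cache key.

        Returns:
            Optional[List[Dict[str, Any]]]: Matching entries, or None.
        """

    # After: modern builtin generics and PEP 604 unions
    def lookup(key):
        """Find cached entries.

        Args:
            key (str | int): Cache key.

        Returns:
            list[dict[str, Any]] | None: Matching entries, or None.
        """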


autotest/utils/config_utils.py

Lines changed: 2 additions & 2 deletions
@@ -51,7 +51,7 @@ def get_func_config_list(backend: str,
         func_type: Test func type filter, default: func
         extra: extra config to update in each run config dict
     Returns:
-        List[Dict]: All valid run config dicts
+        list[dict]: All valid run config dicts
     """
     config = get_config()
     device = config.get('device', 'cuda')
@@ -228,7 +228,7 @@ def get_model_list(config: dict,
         model_type: Model type, default: chat_model
         func_type: Test func type filter, default: func
     Returns:
-        List[str]: Base models + quantization extended models
+        list[str]: Base models + quantization extended models
     """
     model_config_key = f'{backend}_{model_type}'
     all_models = []

lmdeploy/api.py

Lines changed: 16 additions & 11 deletions
@@ -19,7 +19,8 @@ def pipeline(model_path: str,
              max_log_len: int | None = None,
              speculative_config: SpeculativeConfig | None = None,
              **kwargs):
-    """
+    """Create a pipeline for inference.
+
     Args:
         model_path: the path of a model. It could be one of the following options:
@@ -34,14 +35,17 @@ def pipeline(model_path: str,
               on huggingface.co, such as ``internlm/internlm-chat-7b``,
               ``Qwen/Qwen-7B-Chat``, ``baichuan-inc/Baichuan2-7B-Chat``
               and so on.
-        backend_config: backend
-            config instance. Default to None.
-        chat_template_config: chat template configuration.
-            Default to None.
+        backend_config: backend config instance. Default to None.
+        chat_template_config: chat template configuration. Default to None.
         log_level: set log level whose value among [``CRITICAL``, ``ERROR``,
             ``WARNING``, ``INFO``, ``DEBUG``]
         max_log_len: Max number of prompt characters or prompt tokens
-            being printed in log
+            being printed in log.
+        speculative_config: speculative decoding configuration.
+        **kwargs: additional keyword arguments passed to the pipeline.
+
+    Returns:
+        Pipeline: a pipeline instance for inference.

     Examples:
@@ -62,8 +66,7 @@
         im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
         response = pipe([('describe this image', [im])])
         print(response)
-
-    """ # noqa E501
+    """ # noqa E501

     return Pipeline(model_path,
                     backend_config=backend_config,
@@ -106,11 +109,13 @@ def client(api_server_url: str = 'http://0.0.0.0:23333', api_key: str | None = None):

     Args:
         api_server_url: communicating address ``http://<ip>:<port>`` of
-            api_server
+            api_server.
         api_key: api key. Default to None, which means no
             api key will be used.
-    Return:
-        Chatbot for LLaMA series models with turbomind as inference engine.
+
+    Raises:
+        NotImplementedError: This function has been deprecated and removed.
+            Use ``from lmdeploy.serve import APIClient`` instead.
     """
     raise NotImplementedError("The 'client' function is no longer available. This function has been deprecated. "
                               ' Please use "from lmdeploy.serve import APIClient" instead.')
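
Since ``client`` now unconditionally raises, the replacement named in the docstring is the served API client. A minimal usage sketch, assuming an api_server is already running at the default address and that ``APIClient`` takes the server URL as its first argument:

    from lmdeploy.serve import APIClient

    # Connect to a running api_server; address format is http://<ip>:<port>
    api_client = APIClient('http://0.0.0.0:23333')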

lmdeploy/cli/utils.py

Lines changed: 2 additions & 2 deletions
@@ -43,10 +43,10 @@ def get_lora_adapters(adapters: list[str]):
     """Parse lora adapers from cli input.

     Args:
-        adapters (List[str]): CLI input string of lora adapter path(s).
+        adapters (list[str]): CLI input string of lora adapter path(s).

     Returns:
-        Dict[str,str] or None: Parsed lora adapter path(s).
+        dict[str, str] | None: Parsed lora adapter path(s).
     """
     if not adapters:
         return None
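
For context, a sketch of what such a parser could look like; the ``name=path`` syntax and the fallback adapter naming here are assumptions for illustration, not the documented CLI behavior:

    def parse_lora_adapters(adapters: list[str]) -> dict[str, str] | None:
        # Assumed format: each entry is either 'name=path' or a bare path
        if not adapters:
            return None
        parsed = {}
        for i, item in enumerate(adapters):
            name, sep, path = item.partition('=')
            # Bare paths get a generated name (hypothetical convention)
            parsed[name if sep else f'adapter_{i}'] = path if sep else item
        return parsed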

lmdeploy/lite/apis/calibrate.py

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ def _prepare_for_calibrate(model: nn.Module,
     ----------
     model : nn.Module
         The PyTorch model to prepare for calibration.
-    layer_type : Union[str, Type]
+    layer_type : str | type
         The type of the layer to be moved to CPU. Can be either a string of
         class name or the class type itself.
     head_name : str, optional
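
Accepting ``str | type`` usually means matching modules either by class name or by an isinstance check. A minimal sketch of that pattern (the helper name is hypothetical):

    import torch.nn as nn

    def matches_layer(module: nn.Module, layer_type: str | type) -> bool:
        # A string compares against the module's class name;
        # an actual class goes through isinstance()
        if isinstance(layer_type, str):
            return type(module).__name__ == layer_type
        return isinstance(module, layer_type)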

lmdeploy/lite/quantization/calibration.py

Lines changed: 4 additions & 4 deletions
@@ -43,8 +43,8 @@ def __init__(self,
         Args:
             model (nn.Module): Model to be calibrated.
             tokenizer (PreTrainedTokenizer): Tokenizer of the given model.
-            layer_type (Union[str, type]): Type of the layers to be observed.
-            norm_type (Union[str, type]): Norm type used in the model.
+            layer_type (str | type): Type of the layers to be observed.
+            norm_type (str | type): Norm type used in the model.
             batch_size (int): The batch size for running the calib samples.
                 Low GPU mem requires small batch_size. Large batch_size
                 reduces the calibration time while costs more VRAM.
@@ -204,7 +204,7 @@ def export(self, out_dir):
         to specified directory.

         Args:
-            out_dir (Union[str, Path]): The directory path where the stats
+            out_dir (str | Path): The directory path where the stats
                 will be saved.
         """

@@ -377,7 +377,7 @@ def export(self, out_dir):
         to specified directory.

         Args:
-            out_dir (Union[str, Path]): The directory path where the stats
+            out_dir (str | Path): The directory path where the stats
                 will be saved.
         """
         inputs_stats = {
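
The ``str | Path`` parameters follow the usual normalize-early idiom. A sketch under assumed file naming (the stats dict and the 'inputs_stats.pth' name are placeholders, not necessarily what the exporter writes):

    from pathlib import Path

    import torch

    def export_stats(inputs_stats: dict, out_dir: str | Path) -> None:
        # Accept str | Path: normalize once, then use pathlib throughout
        out_dir = Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        torch.save(inputs_stats, out_dir / 'inputs_stats.pth')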

lmdeploy/lite/quantization/weight/quantizer.py

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ class WeightQuantizer(GlobalAvailMixin):
             use min-max scaling.
         granularity (str): The granularity of quantization. Available options
             are 'per_channel', 'per_tensor', and 'per_group'.
-        group_size (Optional[int]): If using 'per_group' quantization, this is
+        group_size (int | None): If using 'per_group' quantization, this is
             the number of channels in each group.

     Example:
@@ -108,7 +108,7 @@ def quant(self, weight: torch.Tensor, qparams: QParams | None = None, real: bool
         Args:
             weight (torch.Tensor): The weight tensor with shape
                 (out_features, in_features).
-            qparams (Optional[QParams]): A namedtuple containing 'scales'
+            qparams (QParams | None): A namedtuple containing 'scales'
                 and 'zero_points'.
             real (bool): If True, return the tensor with quantized type.
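
As a rough illustration of what 'per_group' granularity with min-max scaling computes, a sketch under assumed shapes and 4-bit asymmetric quantization; this is not the class's actual implementation:

    import torch

    def per_group_qparams(weight: torch.Tensor, group_size: int, n_bits: int = 4):
        # weight: (out_features, in_features); groups run along in_features,
        # which is assumed to be divisible by group_size
        out_f, in_f = weight.shape
        grouped = weight.reshape(out_f, in_f // group_size, group_size)
        w_min = grouped.amin(dim=-1, keepdim=True)
        w_max = grouped.amax(dim=-1, keepdim=True)
        # One (scale, zero_point) pair per group of `group_size` channels
        scales = (w_max - w_min).clamp(min=1e-8) / (2**n_bits - 1)
        zero_points = (-w_min / scales).round()
        return scales, zero_points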

lmdeploy/lite/utils/batch_split.py

Lines changed: 4 additions & 4 deletions
@@ -10,13 +10,13 @@ def split_decoder_layer_inputs(batch_size, *args: torch.Tensor | Any,
         elements.

     Args:
-        *args (Union[torch.Tensor, Any]): Positional arguments which could
+        *args (torch.Tensor | Any): Positional arguments which could
             be a mix of tensors and other types.
-        **kwargs (Union[torch.Tensor, Any]): Keyword arguments which could
+        **kwargs (torch.Tensor | Any): Keyword arguments which could
             be a mix of tensors and other types.

     Returns:
-        Tuple[List[List[Any]], List[Dict[str, Any]]]: A tuple containing two
+        tuple[list[list[Any]], list[dict[str, Any]]]: A tuple containing two
             lists, one for positional arguments, one for keyword arguments.
             Each list contains individual elements from the batch.
     """
@@ -63,7 +63,7 @@ def concat_decoder_layer_outputs(batch_outputs: list[Any]) -> Any:
         batched output.

     Args:
-        batch_outputs (List[Any]): A list, where each tuple
+        batch_outputs (list[Any]): A list, where each tuple
             represents the output from an individual element in the batch.

     Returns:
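
A simplified sketch of the splitting idea: tensors are sliced along the batch dimension while non-tensor values are shared per element (kwargs handling and batch-size grouping omitted; the real helper does more):

    from typing import Any

    import torch

    def split_args(batch_size: int, *args: torch.Tensor | Any) -> list[list[Any]]:
        # One positional-argument list per batch element;
        # non-tensors are passed through unchanged
        return [[a[i:i + 1] if isinstance(a, torch.Tensor) else a for a in args]
                for i in range(batch_size)]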

lmdeploy/lite/utils/global_avail.py

Lines changed: 5 additions & 5 deletions
@@ -13,7 +13,7 @@ def global_available(self, key: str | nn.Module = 'default', group: str = 'default'):
         """Make the instance globally available.

         Args:
-            key (Union[str, nn.Module], optional): Key to save the instance.
+            key (str | nn.Module, optional): Key to save the instance.
                 Defaults to 'default'.
             group (str, optional): Group to save the instance.
                 Defaults to 'default'.
@@ -29,7 +29,7 @@ def _save_instance(cls,

         Args:
             instance (GlobalAvailMixin): Instance to save.
-            key (Union[str, nn.Module], optional): Key to save the instance.
+            key (str | nn.Module, optional): Key to save the instance.
                 Defaults to 'default'.
             group (str, optional): Group to save the instance.
                 Defaults to 'default'.
@@ -45,13 +45,13 @@ def find(cls, key: str | nn.Module = 'default', group: str = 'default') -> Union
         """Find an instance by its key and group.

         Args:
-            key (Union[str, nn.Module], optional): Key of the instance.
+            key (str | nn.Module, optional): Key of the instance.
                 Defaults to 'default'.
             group (str, optional): Group of the instance.
                 Defaults to 'default'.

         Returns:
-            Union[None, GlobalAvailMixin]: The found instance, or None if
+            None | GlobalAvailMixin: The found instance, or None if
                 it does not exist.
         """
         return cls._instances.get(group, {}).get(key)
@@ -64,7 +64,7 @@ def find_group(cls, group: str) -> dict[str | nn.Module, 'GlobalAvailMixin']:
             group (str): Group of the instances.

         Returns:
-            Dict[Union[str, nn.Module], GlobalAvailMixin]: All instances in
+            dict[str | nn.Module, GlobalAvailMixin]: All instances in
                 the group.
         """
         return cls._instances.get(group, {})
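
Taken together, the mixin is a small class-level registry keyed first by group and then by key, as the ``cls._instances.get(group, {}).get(key)`` lookups show. A condensed sketch of the pattern:

    import torch.nn as nn

    class GlobalAvailMixin:
        # group -> key -> instance
        _instances: dict[str, dict] = {}

        def global_available(self, key: str | nn.Module = 'default', group: str = 'default'):
            self._instances.setdefault(group, {})[key] = self

        @classmethod
        def find(cls, key: str | nn.Module = 'default', group: str = 'default'):
            return cls._instances.get(group, {}).get(key)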

lmdeploy/logger.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ class RequestLogger:
         exceed a specified maximum length.

     Args:
-        max_log_len (Optional[int]): The maximum length of the log entries.
+        max_log_len (int | None): The maximum length of the log entries.
             If None, no maximum length is enforced.
     """

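
The ``max_log_len`` semantics described here are simple truncation; a sketch (the function name is hypothetical):

    def shorten_for_log(text: str, max_log_len: int | None) -> str:
        # If None, no maximum length is enforced
        if max_log_len is None:
            return text
        return text[:max_log_len]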

lmdeploy/messages.py

Lines changed: 6 additions & 9 deletions
@@ -51,10 +51,10 @@ class GenerationConfig:
         random_seed: Seed used when sampling a token
         stop_words: Words that stop generating further tokens
         bad_words: Words that the engine will never generate
-        stop_token_ids: List of tokens that stop the generation
+        stop_token_ids: list of tokens that stop the generation
             when they are generated. The returned output will not contain
             the stop tokens.
-        bad_token_ids: List of tokens that the engine will never
+        bad_token_ids: list of tokens that the engine will never
             generate.
         min_new_tokens: The minimum numbers of tokens to generate,
             ignoring the number of tokens in the prompt.
@@ -475,16 +475,13 @@ class Response:
         generate_token_len: the response token length.
         input_token_len: the input prompt token length. Note that it may
             contains chat template part.
-        session_id: the id for running the session.
         finish_reason: the reason the model stopped
             generating tokens. This will be 'stop' if the model hit a natural
             stop point or a provided stop sequence, 'length' if the maximum
             number of tokens specified in the request was reached.
-        token_ids:: the output token ids.
-        logprobs:: the top logprobs for each output
-            position.
-        index: it refers to the position index of the input request
-            batch
+        token_ids: the output token ids.
+        logprobs: the top logprobs for each output position.
+        index: it refers to the position index of the input request batch.
     """
     text: str
     generate_token_len: int
@@ -605,7 +602,7 @@ class RequestMetrics:

     Attributes:
         token_timestamp: A wall-clock time when a token is generated.
-        engine_events: List of engine events during inference.
+        engine_events: list of engine events during inference.
     """
     token_timestamp: float = 0.0
     engine_events: list[EngineEvent] = field(default_factory=list)
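
For reference, how these fields are typically consumed; a short sketch assuming the top-level ``GenerationConfig`` and ``pipeline`` exports, with a placeholder token id:

    from lmdeploy import GenerationConfig, pipeline

    pipe = pipeline('internlm/internlm-chat-7b')
    # stop_token_ids end generation and are excluded from the returned text;
    # the id below is a placeholder for illustration
    gen_config = GenerationConfig(min_new_tokens=8, stop_token_ids=[2])
    response = pipe(['Hi, pls intro yourself'], gen_config=gen_config)
    print(response)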
