ModelTC · hiworldwzj · Jan 9, 2026 · Jan 8, 2026 · Jan 8, 2026 · gemini-code-assist
diff --git a/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py
@@ -24,6 +24,7 @@
 class Qwen3VLTransformerLayerInfer(Qwen2VLTransformerLayerInfer):
     def __init__(self, layer_num, network_config, mode=[]):
         super().__init__(layer_num, network_config, mode)
+        self.head_dim_ = network_config["head_dim"]
         self.mrope_section = torch.tensor(
             network_config["rope_scaling"]["mrope_section"], dtype=torch.int32, device="cuda"
         )

diff --git a/lightllm/models/vit/model.py b/lightllm/models/vit/model.py
@@ -185,7 +185,8 @@ def encode(self, images: List[ImageItem]):
             else:
                 raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
-            cur_num = img_tensors[-1].shape[0]
+            assert img.token_num is not None, "Image token number must be set before calling encode."
+            cur_num = img.token_num
             valid_ids.append([valid_id, valid_id + cur_num])
             valid_id += cur_num

@@ -195,7 +196,7 @@ def encode(self, images: List[ImageItem]):
         imgs = torch.cat(img_tensors, dim=0)
         pixel_values = imgs.cuda().to(dtype=self.data_type)
         all_img_embeds = self.forward(pixel_values)
-        return all_img_embeds, uuids, valid_ids
+        return all_img_embeds.view(-1, all_img_embeds.shape[-1]), uuids, valid_ids
 
     def cuda(self):
         return self