huggingface · thisisiron · Mar 18, 2026 · Mar 18, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -551,6 +551,8 @@
         title: DeBERTa
       - local: model_doc/deberta-v2
         title: DeBERTa-v2
+      - local: model_doc/deepseek_ocr2
+        title: DeepSeek-OCR-2
       - local: model_doc/deepseek_v2
         title: DeepSeek-V2
       - local: model_doc/deepseek_v3

diff --git a/docs/source/en/model_doc/deepseek_ocr2.md b/docs/source/en/model_doc/deepseek_ocr2.md
@@ -0,0 +1,101 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-01-28 and added to Hugging Face Transformers on 2026-04-14.*
+
+# DeepSeek-OCR-2
+
+
+## Overview
+
+The DeepSeek-OCR-2 model was proposed in [Visual Causal Flow: A Novel Approach to OCR-Specialized Vision-Language Models](https://huggingface.co/papers/2601.20552) by the DeepSeek team.
+
+DeepSeek-OCR-2 is an OCR-specialized vision-language model built on a distinctive architecture: a SAM ViT-B vision encoder feeds into a Qwen2 hybrid attention encoder, which is connected through an MLP projector to a DeepSeek-V2 Mixture-of-Experts (MoE) language model. A key feature of the model is its hybrid attention mechanism, which applies bidirectional attention over image tokens and causal attention over query tokens, enabling efficient and accurate document understanding.
+
+<img src="https://huggingface.co/deepseek-ai/DeepSeek-OCR-2/resolve/main/assets/fig1.png" alt="DeepSeek-OCR-2 architecture overview" width="800">
+
+<small>DeepSeek-OCR 2: Visual Causal Flow.</small>
+
+This model was contributed by [thisisiron](https://huggingface.co/thisisiron).
+
+
+## Usage example
+
+### Plain OCR
+
+```python
+>>> import torch
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+
+>>> model = AutoModelForImageTextToText.from_pretrained(
+...     "thisisiron/DeepSeek-OCR-2-hf", dtype=torch.bfloat16, device_map="auto"
+... )
+>>> processor = AutoProcessor.from_pretrained("thisisiron/DeepSeek-OCR-2-hf")
+
+>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
+>>> inputs = processor(images=image, text="<image>\nFree OCR.", return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+
+>>> generate_ids = model.generate(**inputs, do_sample=False, max_new_tokens=4096)
+>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+"R&D QUALITY IMPROVEMENT\nSUGGESTION/SOLUTION FORM\nName/Phone Ext. : (...)"
+```
+
+### Grounding with markdown conversion
+
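+The `<|grounding|>` token enables coordinate-aware output with `<|ref|>` and `<|det|>` tags. The example below reuses the `model`, `processor`, and `image` objects created in the Plain OCR example above.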
+
+```python
+>>> inputs = processor(
+...     images=image,
+...     text="<image>\n<|grounding|>Convert the document to markdown.",
+...     return_tensors="pt",
+... ).to(model.device, dtype=torch.bfloat16)
+
+>>> generate_ids = model.generate(**inputs, do_sample=False, max_new_tokens=4096)
+>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=False)
+"<|ref|>title<|/ref|><|det|>[[330, 198, 558, 230]]<|/det|>\n# R&D QUALITY (...)"
+```
+
+## DeepseekOcr2Config
+
+[[autodoc]] DeepseekOcr2Config
+
+## DeepseekOcr2ImageProcessor
+
+[[autodoc]] DeepseekOcr2ImageProcessor
+
+## DeepseekOcr2ImageProcessorPil
+
+[[autodoc]] DeepseekOcr2ImageProcessorPil
+
+## DeepseekOcr2Processor
+
+[[autodoc]] DeepseekOcr2Processor
+
+## DeepseekOcr2TextModel
+
+[[autodoc]] DeepseekOcr2TextModel
+
+## DeepseekOcr2VisionModel
+
+[[autodoc]] DeepseekOcr2VisionModel
+
+## DeepseekOcr2Model
+
+[[autodoc]] DeepseekOcr2Model
+
+## DeepseekOcr2ForConditionalGeneration
+
+[[autodoc]] DeepseekOcr2ForConditionalGeneration
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -91,6 +91,7 @@
     from .deberta import *
     from .deberta_v2 import *
     from .decision_transformer import *
+    from .deepseek_ocr2 import *
     from .deepseek_v2 import *
     from .deepseek_v3 import *
     from .deepseek_vl import *

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -109,6 +109,7 @@
         ("deberta", "DebertaConfig"),
         ("deberta-v2", "DebertaV2Config"),
         ("decision_transformer", "DecisionTransformerConfig"),
+        ("deepseek_ocr2", "DeepseekOcr2Config"),
         ("deepseek_v2", "DeepseekV2Config"),
         ("deepseek_v3", "DeepseekV3Config"),
         ("deepseek_vl", "DeepseekVLConfig"),
@@ -623,6 +624,7 @@
         ("deberta", "DeBERTa"),
         ("deberta-v2", "DeBERTa-v2"),
         ("decision_transformer", "Decision Transformer"),
+        ("deepseek_ocr2", "DeepSeek-OCR-2"),
         ("deepseek_v2", "DeepSeek-V2"),
         ("deepseek_v3", "DeepSeek-V3"),
         ("deepseek_vl", "DeepseekVL"),

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -88,6 +88,7 @@
             ("convnextv2", {"torchvision": "ConvNextImageProcessor", "pil": "ConvNextImageProcessorPil"}),
             ("cvt", {"torchvision": "ConvNextImageProcessor", "pil": "ConvNextImageProcessorPil"}),
             ("data2vec-vision", {"torchvision": "BeitImageProcessor", "pil": "BeitImageProcessorPil"}),
+            ("deepseek_ocr2", {"torchvision": "DeepseekOcr2ImageProcessor", "pil": "DeepseekOcr2ImageProcessorPil"}),
             ("deepseek_vl", {"torchvision": "DeepseekVLImageProcessor", "pil": "DeepseekVLImageProcessorPil"}),
             (
                 "deepseek_vl_hybrid",

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -112,6 +112,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("deberta", "DebertaModel"),
         ("deberta-v2", "DebertaV2Model"),
         ("decision_transformer", "DecisionTransformerModel"),
+        ("deepseek_ocr2", "DeepseekOcr2Model"),
         ("deepseek_v2", "DeepseekV2Model"),
         ("deepseek_v3", "DeepseekV3Model"),
         ("deepseek_vl", "DeepseekVLModel"),
@@ -969,6 +970,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("blip-2", "Blip2ForConditionalGeneration"),
         ("chameleon", "ChameleonForConditionalGeneration"),
         ("cohere2_vision", "Cohere2VisionForConditionalGeneration"),
+        ("deepseek_ocr2", "DeepseekOcr2ForConditionalGeneration"),
         ("deepseek_vl", "DeepseekVLForConditionalGeneration"),
         ("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"),
         ("emu3", "Emu3ForConditionalGeneration"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -69,6 +69,7 @@
             ("colmodernvbert", "ColModernVBertProcessor"),
             ("colpali", "ColPaliProcessor"),
             ("colqwen2", "ColQwen2Processor"),
+            ("deepseek_ocr2", "DeepseekOcr2Processor"),
             ("deepseek_vl", "DeepseekVLProcessor"),
             ("deepseek_vl_hybrid", "DeepseekVLHybridProcessor"),
             ("dia", "DiaProcessor"),

diff --git a/src/transformers/models/deepseek_ocr2/__init__.py b/src/transformers/models/deepseek_ocr2/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_deepseek_ocr2 import *
+    from .image_processing_deepseek_ocr2 import *
+    from .image_processing_pil_deepseek_ocr2 import *
+    from .modeling_deepseek_ocr2 import *
+    from .processing_deepseek_ocr2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)