88 commits
81cce23
feat: add Motif Video T2V and I2V pipelines with AdaptiveProjectedGui…
Apr 23, 2026
44045b2
Merge branch 'main' into feat/motif-video
waitingcheung Apr 27, 2026
127810b
Merge branch 'main' into feat/motif-video
waitingcheung Apr 28, 2026
c2f1a14
Merge branch 'main' into feat/motif-video
waitingcheung Apr 28, 2026
e3230cc
Merge branch 'feat/motif-video' of github.com:waitingcheung/diffusers…
Apr 29, 2026
de3c7ff
Remove linear quadratic
Apr 29, 2026
9b0fe33
Remove musicldm
Apr 29, 2026
27b646a
Update docstring
Apr 29, 2026
fb74918
Address vision_encoder comment
Apr 29, 2026
f27c663
Add copy source in I2V pipeline
Apr 29, 2026
5f1c157
Refactor _get_prompt_embeds
Apr 29, 2026
80aa88a
Fix a typo
Apr 29, 2026
e47f893
Refactor MotifVideo transformer to use diffusers Attention conventions
waitingcheung Apr 29, 2026
ce30711
Use base classes for scheduler and guider
waitingcheung Apr 29, 2026
005bec4
Implement MotifVideoAttention
waitingcheung Apr 29, 2026
070ff88
Update style and quality
waitingcheung Apr 29, 2026
f118214
Fix a typo
waitingcheung Apr 29, 2026
cd8e91f
Fix a typo
waitingcheung Apr 29, 2026
112761b
Fix a typo
waitingcheung Apr 29, 2026
c3d7ca1
Update year
waitingcheung Apr 29, 2026
3bc4f31
Address rope dtype
waitingcheung Apr 30, 2026
5a7bdff
Update docstring and remove frame_rate
waitingcheung Apr 30, 2026
3ef2018
Address unused sigmas
waitingcheung Apr 30, 2026
860634c
Add available processors
waitingcheung Apr 30, 2026
d3069c6
Address copy from comment
waitingcheung Apr 30, 2026
35f26c8
Remove torch.no_grad()
waitingcheung Apr 30, 2026
6a9ca5d
Remove use_attention_mask
waitingcheung Apr 30, 2026
ed4b717
Merge branch 'main' into feat/motif-video
waitingcheung Apr 30, 2026
033b3bd
Merge branch 'feat/motif-video' of github.com:waitingcheung/diffusers…
waitingcheung Apr 30, 2026
26ff14c
Address inline cross-attention
waitingcheung Apr 30, 2026
553e033
Address compute dtype
waitingcheung Apr 30, 2026
b44d940
Remove unused variables
waitingcheung Apr 30, 2026
d5f2d70
Merge branch 'main' into feat/motif-video
waitingcheung Apr 30, 2026
8ec5a81
Merge branch 'main' into feat/motif-video
waitingcheung Apr 30, 2026
a4a451e
Merge main APG into this branch and update documentation
waitingcheung May 1, 2026
b0dcc21
Refactor cross attention processor
waitingcheung May 1, 2026
c942e24
Merge branch 'main' into feat/motif-video
waitingcheung May 1, 2026
7538471
Merge branch 'main' into feat/motif-video
waitingcheung May 4, 2026
531c4b6
Merge branch 'main' into feat/motif-video
waitingcheung May 6, 2026
9fc6616
Merge branch 'main' into feat/motif-video
waitingcheung May 6, 2026
3b1276c
Remove unused timestep
waitingcheung May 6, 2026
0666f97
Inline create_attention_mask
waitingcheung May 6, 2026
f39cdce
Make guider required
waitingcheung May 6, 2026
f647cb6
Address encode_prompt comment
waitingcheung May 6, 2026
488aaf5
Address preprocess_video comment
waitingcheung May 6, 2026
06c0604
Use T5Gemma2Encoder in test cases
waitingcheung May 6, 2026
4a7e229
Address None feature_extractor
waitingcheung May 6, 2026
841ae87
Address output type
waitingcheung May 6, 2026
ef1c21d
Re-enable skipped tests
waitingcheung May 6, 2026
e58e1b0
Update style and quality
waitingcheung May 6, 2026
d6344e7
Generate standard transformer test case
waitingcheung May 6, 2026
4db06cc
Add model test case
waitingcheung May 6, 2026
a825a24
Remove guider in documentation
waitingcheung May 6, 2026
9e1b353
Implement cross_attn layer
waitingcheung May 6, 2026
684e9d4
Remove prepare_negative_prompt
waitingcheung May 6, 2026
a63f669
Address latent is None
waitingcheung May 6, 2026
4945239
Clean up feature_extractor
waitingcheung May 6, 2026
58c64d6
Fix prepare_latents
waitingcheung May 6, 2026
b832444
Remove transformers assertion
waitingcheung May 6, 2026
f02880a
Fix style and quality
waitingcheung May 6, 2026
b02ff93
Merge branch 'main' into feat/motif-video
waitingcheung May 6, 2026
bad1abb
Merge branch 'feat/motif-video' of github.com:waitingcheung/diffusers…
waitingcheung May 6, 2026
2ecf212
Merge branch 'main' into feat/motif-video
waitingcheung May 7, 2026
76f5c65
Merge branch 'main' into feat/motif-video
dg845 May 7, 2026
b324cee
Fix python utils/check_copies.py --fix_and_overwrite
waitingcheung May 7, 2026
1931ab1
Add dropout rate to text config
waitingcheung May 7, 2026
e355768
Skip tests requiring guidance_scale
waitingcheung May 7, 2026
37fdd69
Merge branch 'main' into feat/motif-video
waitingcheung May 7, 2026
3907283
Fix encode_prompt in test cases
waitingcheung May 7, 2026
c5a3ffe
Merge branch 'main' into feat/motif-video
waitingcheung May 7, 2026
b0dbfad
Fix test_cpu_offload_forward_pass_twice
waitingcheung May 7, 2026
576c22e
Merge branch 'feat/motif-video' of github.com:waitingcheung/diffusers…
waitingcheung May 7, 2026
9de497f
Merge branch 'main' into feat/motif-video
waitingcheung May 8, 2026
175a05d
Merge branch 'main' into feat/motif-video
waitingcheung May 8, 2026
ebed9ac
Merge branch 'main' into feat/motif-video
waitingcheung May 8, 2026
ab4273e
Merge branch 'main' into feat/motif-video
waitingcheung May 8, 2026
3ee4218
Update tests/pipelines/motif_video/test_motif_video.py
waitingcheung May 8, 2026
754a547
Update tests/pipelines/motif_video/test_motif_video.py
waitingcheung May 8, 2026
fe938cb
Update tests/pipelines/motif_video/test_motif_video.py
waitingcheung May 8, 2026
0d11cc4
Update tests/pipelines/motif_video/test_motif_video_image2video.py
waitingcheung May 8, 2026
0fcbd64
Address test_attention_slicing_forward_pass comment
waitingcheung May 8, 2026
f3325dd
Merge branch 'feat/motif-video' of github.com:waitingcheung/diffusers…
waitingcheung May 8, 2026
b33ec8e
Update tests/pipelines/motif_video/test_motif_video_image2video.py
waitingcheung May 8, 2026
65c9dc6
Update tests/pipelines/motif_video/test_motif_video_image2video.py
waitingcheung May 8, 2026
b9de465
Update tests/pipelines/motif_video/test_motif_video_image2video.py
waitingcheung May 8, 2026
9c38b3d
Skip I2V test cases
waitingcheung May 8, 2026
0e89d56
Merge branch 'feat/motif-video' of github.com:waitingcheung/diffusers…
waitingcheung May 8, 2026
f2aab3c
Fix style and quality
waitingcheung May 8, 2026
32 changes: 32 additions & 0 deletions docs/source/en/api/models/motif_video_transformer_3d.md
@@ -0,0 +1,32 @@
<!-- Copyright 2026 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# MotifVideoTransformer3DModel

A Diffusion Transformer model for 3D video-like data, introduced in Motif-Video by the Motif Technologies Team.

The model uses a three-stage architecture (12 dual-stream, 16 single-stream, and 8 DDT decoder layers) with rotary positional embeddings (RoPE) for video generation.
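
For intuition, the snippet below is a minimal, generic sketch of RoPE as commonly used in diffusion transformers; the exact frequency schedule and axis factorization used by Motif-Video may differ.

```python
import torch


def rope_angles(positions: torch.Tensor, dim: int, theta: float = 10000.0):
    # One rotation frequency per (even, odd) channel pair.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    angles = positions.float()[:, None] * freqs[None, :]  # (seq_len, dim // 2)
    return angles.cos(), angles.sin()


def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    # Rotate each channel pair of the queries/keys by its position-dependent angle.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    return torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1).flatten(-2)
```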

The model can be loaded with the following code snippet.

```python
import torch

from diffusers import MotifVideoTransformer3DModel

transformer = MotifVideoTransformer3DModel.from_pretrained("Motif-Technologies/Motif-Video-2B", subfolder="transformer", torch_dtype=torch.bfloat16)
```

## MotifVideoTransformer3DModel

[[autodoc]] MotifVideoTransformer3DModel

## Transformer2DModelOutput

[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
123 changes: 123 additions & 0 deletions docs/source/en/api/pipelines/motif_video.md
@@ -0,0 +1,123 @@
<!-- Copyright 2026 The HuggingFace Team. All rights reserved. -->

# Motif-Video

[Technical Report](https://arxiv.org/abs/2604.16503)

Motif-Video is a 2B-parameter diffusion transformer for text-to-video and image-to-video generation. It features a three-stage architecture (12 dual-stream, 16 single-stream, and 8 DDT decoder layers), Shared Cross-Attention for stable text-video alignment over long video sequences, a T5Gemma2 text encoder, and rectified flow matching for velocity prediction.

<p align="center">
<img src="https://huggingface.co/Motif-Technologies/Motif-Video-2B/resolve/main/assets/architecture.png" width="90%" alt="Motif-Video architecture"/>
</p>
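
Rectified flow matching trains the transformer to predict the constant velocity along a straight path between clean latents and noise. The sketch below shows how the interpolant and velocity target are constructed in general; it illustrates the objective only and is not Motif-Video's actual training code.

```python
import torch


def rectified_flow_target(x0: torch.Tensor):
    # x0: clean video latents of shape (batch, ...).
    x1 = torch.randn_like(x0)  # noise endpoint
    t = torch.rand(x0.shape[0], dtype=x0.dtype, device=x0.device)
    t = t.view(-1, *([1] * (x0.ndim - 1)))  # broadcast over all non-batch dims
    xt = (1.0 - t) * x0 + t * x1  # straight-line interpolation between data and noise
    v_target = x1 - x0  # constant velocity the network is trained to predict
    return xt, t, v_target
```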

## Text-to-Video Generation

Use `MotifVideoPipeline` for text-to-video generation:

```python
import torch
from diffusers import MotifVideoPipeline
from diffusers.utils import export_to_video


pipe = MotifVideoPipeline.from_pretrained(
"Motif-Technologies/Motif-Video-2B",
torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=1280,
height=736,
num_frames=121,
num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```
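
As with other Diffusers pipelines, generation can be made reproducible by passing a `torch.Generator`; this assumes `MotifVideoPipeline.__call__` follows the standard `generator` convention:

```python
generator = torch.Generator(device="cuda").manual_seed(42)
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    generator=generator,
).frames[0]
```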

## Image-to-Video Generation

Use `MotifVideoImage2VideoPipeline` for image-to-video generation:

```python
import torch
from diffusers import MotifVideoImage2VideoPipeline
from diffusers.utils import export_to_video, load_image


pipe = MotifVideoImage2VideoPipeline.from_pretrained(
"Motif-Technologies/Motif-Video-2B",
torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

image = load_image("input_image.png")
prompt = "A cinematic scene with vivid colors."
negative_prompt = "worst quality, blurry, jittery, distorted"

video = pipe(
image=image,
prompt=prompt,
negative_prompt=negative_prompt,
width=1280,
height=736,
num_frames=121,
num_inference_steps=50,
).frames[0]
export_to_video(video, "i2v_output.mp4", fps=24)
```
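
The conditioning image should match the requested resolution; resizing it up front keeps the change of framing under your control rather than relying on any internal preprocessing. A small sketch with PIL (`load_image` returns a `PIL.Image`):

```python
from diffusers.utils import load_image

# PIL's resize takes (width, height); match the pipeline's width/height arguments.
image = load_image("input_image.png").resize((1280, 736))
```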

### Memory-efficient Inference

For GPUs with less than 30 GB of VRAM (e.g., an RTX 4090), use model CPU offloading, and let the CUDA allocator use expandable segments to reduce fragmentation:

```bash
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
```

```python
import torch
from diffusers import MotifVideoPipeline
from diffusers.utils import export_to_video


pipe = MotifVideoPipeline.from_pretrained(
"Motif-Technologies/Motif-Video-2B",
torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=1280,
height=736,
num_frames=121,
num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```
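
If model CPU offloading still exceeds the available VRAM, sequential CPU offloading (a standard `DiffusionPipeline` feature) trades considerably slower inference for a lower memory peak; how much it saves with this pipeline has not been measured here:

```python
# Offload weights to the accelerator layer by layer instead of model by model (much slower).
pipe.enable_sequential_cpu_offload()
```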

## MotifVideoPipeline

[[autodoc]] MotifVideoPipeline
- all
- __call__

## MotifVideoImage2VideoPipeline

[[autodoc]] MotifVideoImage2VideoPipeline
- all
- __call__

## MotifVideoPipelineOutput

[[autodoc]] pipelines.motif_video.pipeline_output.MotifVideoPipelineOutput
8 changes: 8 additions & 0 deletions src/diffusers/__init__.py
@@ -265,6 +265,7 @@
"LuminaNextDiT2DModel",
"MochiTransformer3DModel",
"ModelMixin",
"MotifVideoTransformer3DModel",
"MotionAdapter",
"MultiAdapter",
"MultiControlNetModel",
@@ -637,6 +638,9 @@
"MarigoldIntrinsicsPipeline",
"MarigoldNormalsPipeline",
"MochiPipeline",
"MotifVideoImage2VideoPipeline",
"MotifVideoPipeline",
"MotifVideoPipelineOutput",
"MusicLDMPipeline",
"NucleusMoEImagePipeline",
"OmniGenPipeline",
@@ -1087,6 +1091,7 @@
LuminaNextDiT2DModel,
MochiTransformer3DModel,
ModelMixin,
MotifVideoTransformer3DModel,
MotionAdapter,
MultiAdapter,
MultiControlNetModel,
@@ -1434,6 +1439,9 @@
MarigoldIntrinsicsPipeline,
MarigoldNormalsPipeline,
MochiPipeline,
MotifVideoImage2VideoPipeline,
MotifVideoPipeline,
MotifVideoPipelineOutput,
MusicLDMPipeline,
NucleusMoEImagePipeline,
OmniGenPipeline,
20 changes: 20 additions & 0 deletions src/diffusers/hooks/_helpers.py
@@ -188,6 +188,10 @@ def _register_transformer_blocks_metadata():
from ..models.transformers.transformer_kandinsky import Kandinsky5TransformerDecoderBlock
from ..models.transformers.transformer_ltx import LTXVideoTransformerBlock
from ..models.transformers.transformer_mochi import MochiTransformerBlock
from ..models.transformers.transformer_motif_video import (
MotifVideoSingleTransformerBlock,
MotifVideoTransformerBlock,
)
from ..models.transformers.transformer_qwenimage import QwenImageTransformerBlock
from ..models.transformers.transformer_wan import WanTransformerBlock
from ..models.transformers.transformer_z_image import ZImageTransformerBlock
@@ -290,6 +294,22 @@
),
)

# MotifVideo
TransformerBlockRegistry.register(
model_class=MotifVideoTransformerBlock,
metadata=TransformerBlockMetadata(
return_hidden_states_index=0,
return_encoder_hidden_states_index=1,
),
)
TransformerBlockRegistry.register(
model_class=MotifVideoSingleTransformerBlock,
metadata=TransformerBlockMetadata(
return_hidden_states_index=0,
return_encoder_hidden_states_index=1,
),
)

# Wan
TransformerBlockRegistry.register(
model_class=WanTransformerBlock,
24 changes: 20 additions & 4 deletions src/diffusers/loaders/single_file_model.py
@@ -21,7 +21,11 @@
from typing_extensions import Self

from .. import __version__
from ..models.model_loading_utils import _caching_allocator_warmup, _determine_device_map, _expand_device_map
from ..models.model_loading_utils import (
_caching_allocator_warmup,
_determine_device_map,
_expand_device_map,
)
from ..quantizers import DiffusersAutoQuantizer
from ..utils import deprecate, is_accelerate_available, is_torch_version, logging
from ..utils.torch_utils import empty_device_cache
@@ -194,6 +198,10 @@
"checkpoint_mapping_fn": convert_ltx2_audio_vae_to_diffusers,
"default_subfolder": "audio_vae",
},
"MotifVideoTransformer3DModel": {
**Collaborator:** @waitingcheung ohh, this change does not seem to be made by our formatter, no?

**waitingcheung (Author):** No. This change is made by us to be compatible with our GGUF checkpoints.
https://huggingface.co/Motif-Technologies/Motif-Video-2B-GGUF

"checkpoint_mapping_fn": lambda checkpoint, **kwargs: checkpoint,
"default_subfolder": "transformer",
},
}
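
Registering the class in this mapping is what enables `from_single_file` for it. A hedged sketch of how the GGUF checkpoints mentioned above could then be loaded (`GGUFQuantizationConfig` is existing Diffusers API; the `.gguf` filename below is hypothetical):

```python
import torch

from diffusers import GGUFQuantizationConfig, MotifVideoTransformer3DModel

transformer = MotifVideoTransformer3DModel.from_single_file(
    # Hypothetical filename inside the GGUF repository linked above.
    "https://huggingface.co/Motif-Technologies/Motif-Video-2B-GGUF/blob/main/motif-video-2b-Q4_K_M.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```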


@@ -336,7 +344,11 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: str | None = None
disable_mmap = kwargs.pop("disable_mmap", False)
device_map = kwargs.pop("device_map", None)

user_agent = {"diffusers": __version__, "file_type": "single_file", "framework": "pytorch"}
user_agent = {
"diffusers": __version__,
"file_type": "single_file",
"framework": "pytorch",
}
# In order to ensure popular quantization methods are supported. Can be disabled with `disable_telemetry`
if quantization_config is not None:
user_agent["quant"] = quantization_config.quant_method.value
@@ -393,7 +405,9 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: str | None = None

config_mapping_kwargs = _get_mapping_function_kwargs(config_mapping_fn, **kwargs)
diffusers_model_config = config_mapping_fn(
original_config=original_config, checkpoint=checkpoint, **config_mapping_kwargs
original_config=original_config,
checkpoint=checkpoint,
**config_mapping_kwargs,
)
else:
if config is not None:
@@ -465,7 +479,9 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: str | None = None

if _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint):
diffusers_format_checkpoint = checkpoint_mapping_fn(
config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
config=diffusers_model_config,
checkpoint=checkpoint,
**checkpoint_mapping_kwargs,
)
else:
diffusers_format_checkpoint = checkpoint
2 changes: 2 additions & 0 deletions src/diffusers/models/__init__.py
@@ -123,6 +123,7 @@
_import_structure["transformers.transformer_ltx2"] = ["LTX2VideoTransformer3DModel"]
_import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
_import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
_import_structure["transformers.transformer_motif_video"] = ["MotifVideoTransformer3DModel"]
_import_structure["transformers.transformer_nucleusmoe_image"] = ["NucleusMoEImageTransformer2DModel"]
_import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"]
_import_structure["transformers.transformer_ovis_image"] = ["OvisImageTransformer2DModel"]
@@ -249,6 +250,7 @@
Lumina2Transformer2DModel,
LuminaNextDiT2DModel,
MochiTransformer3DModel,
MotifVideoTransformer3DModel,
NucleusMoEImageTransformer2DModel,
OmniGenTransformer2DModel,
OvisImageTransformer2DModel,
1 change: 1 addition & 0 deletions src/diffusers/models/transformers/__init__.py
@@ -44,6 +44,7 @@
from .transformer_ltx2 import LTX2VideoTransformer3DModel
from .transformer_lumina2 import Lumina2Transformer2DModel
from .transformer_mochi import MochiTransformer3DModel
from .transformer_motif_video import MotifVideoTransformer3DModel
from .transformer_nucleusmoe_image import NucleusMoEImageTransformer2DModel
from .transformer_omnigen import OmniGenTransformer2DModel
from .transformer_ovis_image import OvisImageTransformer2DModel