-
Notifications
You must be signed in to change notification settings - Fork 32
feat: vision encoder spec for multimodal models #195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e72eb93
1be8d44
4fc53bd
5f1d7ee
66adf85
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,167 @@ | ||
| { | ||
| "description": "Vision Encoder Architecture Schema for Multimodal Models", | ||
| "$schema": "http://json-schema.org/draft-07/schema#", | ||
| "$id": "https://github.com/modelpack/model-spec/vision-encoder", | ||
| "type": "object", | ||
| "properties": { | ||
| "type": { | ||
| "type": "string", | ||
| "enum": ["vit", "clip_vit", "other"], | ||
| "description": "The vision encoder architecture type" | ||
| }, | ||
| "hidden_size": { | ||
| "type": "integer", | ||
| "description": "Hidden size / embedding dimension of the vision encoder" | ||
| }, | ||
| "patch_size": { | ||
| "type": "integer", | ||
| "description": "Spatial patch size in pixels (e.g., 14 means 14x14 patches)" | ||
| }, | ||
| "image_size": { | ||
| "type": "integer", | ||
| "description": "Default input image resolution in pixels" | ||
| }, | ||
| "num_layers": { | ||
| "type": "integer", | ||
| "description": "Number of transformer layers in the vision encoder" | ||
| }, | ||
| "num_attention_heads": { | ||
| "type": "integer", | ||
| "description": "Number of attention heads in the vision encoder" | ||
| }, | ||
| "intermediate_size": { | ||
| "type": "integer", | ||
| "description": "FFN intermediate size in the vision encoder" | ||
| }, | ||
| "in_channels": { | ||
| "type": "integer", | ||
| "description": "Number of input image channels (3 for RGB)", | ||
| "default": 3 | ||
| }, | ||
| "activation": { | ||
| "type": "string", | ||
| "description": "Activation function used in the vision encoder (e.g., quick_gelu, gelu, silu)" | ||
| }, | ||
| "norm": { | ||
| "type": "object", | ||
| "properties": { | ||
| "type": { | ||
| "type": "string", | ||
| "enum": ["layernorm", "rmsnorm"], | ||
| "description": "Normalization type in the vision encoder" | ||
| }, | ||
| "epsilon": { | ||
| "type": "number", | ||
| "description": "Epsilon value for normalization" | ||
| } | ||
| }, | ||
| "additionalProperties": false | ||
| }, | ||
| "projector": { | ||
| "type": "object", | ||
| "description": "Multimodal projector that maps vision embeddings to language model space", | ||
| "properties": { | ||
| "type": { | ||
| "type": "string", | ||
| "enum": ["mlp", "linear", "cross_attention", "perceiver", "other"], | ||
| "description": "Projector architecture type" | ||
| }, | ||
| "num_layers": { | ||
| "type": "integer", | ||
| "description": "Number of layers in the projector (for MLP or cross-attention projectors)" | ||
| }, | ||
| "activation": { | ||
| "type": "string", | ||
| "description": "Activation function in the projector (e.g., gelu)" | ||
| } | ||
| }, | ||
| "additionalProperties": false | ||
| }, | ||
| "special_tokens": { | ||
| "type": "object", | ||
| "description": "Special token IDs for image/video in the tokenizer", | ||
| "properties": { | ||
| "image_token_id": { | ||
| "type": "integer", | ||
| "description": "Token ID used as a placeholder for image input" | ||
| }, | ||
| "vision_start_token_id": { | ||
| "type": "integer", | ||
| "description": "Token ID marking the start of a vision region" | ||
| }, | ||
| "vision_end_token_id": { | ||
| "type": "integer", | ||
| "description": "Token ID marking the end of a vision region" | ||
| }, | ||
| "vision_token_id": { | ||
| "type": "integer", | ||
| "description": "Token ID for a generic vision placeholder (e.g., used by Qwen2-VL)" | ||
| }, | ||
| "video_token_id": { | ||
| "type": "integer", | ||
| "description": "Token ID for video frame placeholder" | ||
| } | ||
| }, | ||
| "additionalProperties": false | ||
| }, | ||
| "dynamic_resolution": { | ||
| "type": "object", | ||
| "description": "Dynamic image resolution support (e.g., Qwen2-VL native dynamic resolution)", | ||
| "properties": { | ||
| "enabled": { | ||
| "type": "boolean" | ||
| }, | ||
| "min_pixels": { | ||
| "type": "integer", | ||
| "description": "Minimum number of image pixels for dynamic-resolution inputs (lower bound applied when resizing; indirectly bounds the visual token count)" | ||
| }, | ||
| "max_pixels": { | ||
| "type": "integer", | ||
| "description": "Maximum number of image pixels for dynamic-resolution inputs (upper bound applied when resizing; indirectly bounds the visual token count)" | ||
| }, | ||
| "spatial_merge_size": { | ||
| "type": "integer", | ||
| "description": "Spatial merging stride for reducing token count" | ||
| } | ||
| }, | ||
| "additionalProperties": false | ||
| }, | ||
| "temporal_patch_size": { | ||
| "type": "integer", | ||
| "description": "Temporal patch size for video understanding (number of frames per patch)" | ||
| }, | ||
| "fusion_type": { | ||
| "type": "string", | ||
| "enum": ["early", "late", "cross_attention"], | ||
| "description": "How vision and language modalities are fused" | ||
| }, | ||
| "position_embedding": { | ||
| "type": "object", | ||
| "description": "Position embedding configuration for the vision encoder", | ||
| "properties": { | ||
| "type": { | ||
| "type": "string", | ||
| "enum": ["learned", "rope", "mrope", "sinusoidal"], | ||
| "description": "Type of position embedding" | ||
| }, | ||
| "mrope_sections": { | ||
| "type": "array", | ||
| "items": { | ||
| "type": "integer" | ||
| }, | ||
| "description": "Per-modality RoPE dimension sections (for mrope type)" | ||
| } | ||
| }, | ||
| "additionalProperties": false | ||
| } | ||
| }, | ||
| "required": [ | ||
| "type", | ||
| "hidden_size", | ||
| "patch_size", | ||
| "image_size", | ||
| "num_layers", | ||
| "num_attention_heads" | ||
| ], | ||
| "additionalProperties": false | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,210 @@ | ||
| # Vision Encoder Specification | ||
|
|
||
| This document describes the vision encoder architecture fields for multimodal models that process image and video inputs. It extends the model configuration defined in [config.md](./config.md) to cover the architectural details of how visual inputs are processed. | ||
|
|
||
| ## Background | ||
|
|
||
| The current ModelPack specification supports declaring image modality via `capabilities.inputTypes: ["image"]`, but provides no architectural description of how images are processed. Every major model family now has a vision variant (LLaVA, Qwen2-VL, LLaMA-3.2 Vision, Gemma 2 VL), and inference engines need structured metadata about the vision encoder to correctly configure image preprocessing, patch embedding, and vision-language fusion. | ||
|
|
||
| ## Architecture Overview | ||
|
|
||
| Vision-language models follow a common pattern: | ||
|
|
||
| ```text | ||
| Input Image → Vision Encoder → Projector → Language Model → Text Output | ||
| ↓ | ||
| Visual token embeddings | ||
| ``` | ||
|
|
||
| The **vision encoder** converts raw images into a sequence of visual tokens using a Vision Transformer (ViT) or CLIP-ViT architecture. A **projector** module maps these visual tokens into the language model's embedding space. The **fusion type** determines how visual and textual tokens interact inside the language model. | ||
|
|
||
| ## Properties | ||
|
|
||
| - **type** _string_, REQUIRED | ||
|
|
||
| The vision encoder architecture type. Supported values: | ||
|
|
||
| | Value | Description | | ||
| |-------|-------------| | ||
| | `"vit"` | Standard Vision Transformer | | ||
| | `"clip_vit"` | CLIP-pretrained Vision Transformer | | ||
| | `"other"` | Other vision encoder architecture | | ||
|
|
||
| - **hidden_size** _integer_, REQUIRED | ||
|
|
||
| The hidden size (embedding dimension) of the vision encoder. | ||
|
|
||
| - **patch_size** _integer_, REQUIRED | ||
|
|
||
| The spatial patch size in pixels. For example, `14` means the image is divided into 14×14 pixel patches. Each patch becomes one visual token. | ||
|
|
||
| - **image_size** _integer_, REQUIRED | ||
|
|
||
| The default input image resolution in pixels. | ||
|
|
||
| - **num_layers** _integer_, REQUIRED | ||
|
|
||
| The number of transformer layers in the vision encoder. | ||
|
|
||
| - **num_attention_heads** _integer_, REQUIRED | ||
|
|
||
| The number of attention heads in the vision encoder. | ||
|
|
||
| - **intermediate_size** _integer_, OPTIONAL | ||
|
|
||
| The FFN intermediate size in the vision encoder. | ||
|
|
||
| - **in_channels** _integer_, OPTIONAL | ||
|
|
||
| The number of input image channels. Defaults to `3` (RGB). | ||
|
|
||
| - **activation** _string_, OPTIONAL | ||
|
|
||
| The activation function used in the vision encoder, such as `"quick_gelu"`, `"gelu"`, or `"silu"`. | ||
|
|
||
| - **norm** _object_, OPTIONAL | ||
|
|
||
| Normalization configuration for the vision encoder. | ||
|
|
||
| - **type** _string_, OPTIONAL | ||
|
|
||
| The normalization type. Supported values: `"layernorm"`, `"rmsnorm"`. | ||
|
|
||
| - **epsilon** _number_, OPTIONAL | ||
|
|
||
| The epsilon value for normalization. | ||
|
|
||
| - **projector** _object_, OPTIONAL | ||
|
|
||
| The multimodal projector that maps vision encoder outputs to the language model embedding space. | ||
|
|
||
| - **type** _string_, OPTIONAL | ||
|
|
||
| The projector architecture type. Supported values: | ||
|
|
||
| | Value | Description | | ||
| |-------|-------------| | ||
| | `"mlp"` | Multi-layer perceptron (e.g., LLaVA 1.5 uses 2-layer MLP with GELU) | | ||
| | `"linear"` | Single linear projection | | ||
| | `"cross_attention"` | Cross-attention layers (e.g., LLaMA-3.2 Vision) | | ||
| | `"perceiver"` | Perceiver-style resampler | | ||
| | `"other"` | Other projector architecture | | ||
|
|
||
| - **num_layers** _integer_, OPTIONAL | ||
|
|
||
| The number of layers in the projector (for MLP or cross-attention type projectors). | ||
|
|
||
| - **activation** _string_, OPTIONAL | ||
|
|
||
| The activation function in the projector, such as `"gelu"`. | ||
|
|
||
| - **special_tokens** _object_, OPTIONAL | ||
|
|
||
| Special token IDs used for image and video inputs in the tokenizer. | ||
|
|
||
| - **image_token_id** _integer_, OPTIONAL | ||
|
|
||
| The token ID used as a placeholder for image input in the text sequence. | ||
|
|
||
| - **vision_start_token_id** _integer_, OPTIONAL | ||
|
|
||
| The token ID marking the start of a vision region (used by models like Qwen2-VL). | ||
|
|
||
| - **vision_end_token_id** _integer_, OPTIONAL | ||
|
|
||
| The token ID marking the end of a vision region. | ||
|
|
||
| - **vision_token_id** _integer_, OPTIONAL | ||
|
|
||
| The token ID for a generic vision placeholder (used by models like Qwen2-VL). | ||
|
|
||
| - **video_token_id** _integer_, OPTIONAL | ||
|
|
||
| The token ID for video frame placeholders. | ||
|
Comment on lines
+101
to
+123
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The specification for `special_tokens` is missing `vision_token_id`. Please add it. Example: - **vision_token_id** _integer_, OPTIONAL
The token ID for a generic vision placeholder (used by models like Qwen2-VL). |
||
|
|
||
| - **dynamic_resolution** _object_, OPTIONAL | ||
|
|
||
| Dynamic image resolution support, where the model can handle variable-resolution inputs. | ||
|
|
||
| - **enabled** _boolean_, OPTIONAL | ||
|
|
||
| Whether dynamic resolution is enabled. | ||
|
|
||
| - **min_pixels** _integer_, OPTIONAL | ||
|
|
||
| The minimum number of image pixels for dynamic-resolution inputs (lower bound applied when resizing; indirectly bounds the visual token count). | ||
|
|
||
| - **max_pixels** _integer_, OPTIONAL | ||
|
|
||
| The maximum number of image pixels for dynamic-resolution inputs (upper bound applied when resizing; indirectly bounds the visual token count). | ||
|
|
||
| - **spatial_merge_size** _integer_, OPTIONAL | ||
|
|
||
| The spatial merging stride for reducing visual token count. | ||
|
|
||
| - **temporal_patch_size** _integer_, OPTIONAL | ||
|
|
||
| The temporal patch size for video understanding. Specifies how many frames are grouped into one temporal patch. | ||
|
|
||
| - **fusion_type** _string_, OPTIONAL | ||
|
|
||
| How vision and language modalities are fused. Supported values: | ||
|
|
||
| | Value | Description | | ||
| |-------|-------------| | ||
| | `"early"` | Visual tokens are concatenated with text tokens before the first transformer layer (e.g., Qwen2-VL) | | ||
| | `"late"` | Visual tokens are injected after separate encoding (e.g., LLaVA) | | ||
| | `"cross_attention"` | Dedicated cross-attention layers between vision and language (e.g., LLaMA-3.2 Vision) | | ||
|
|
||
| - **position_embedding** _object_, OPTIONAL | ||
|
|
||
| Position embedding configuration for the vision encoder. | ||
|
|
||
| - **type** _string_, OPTIONAL | ||
|
|
||
| The type of position embedding. Supported values: `"learned"`, `"rope"`, `"mrope"`, `"sinusoidal"`. | ||
|
|
||
| - **mrope_sections** _array of integers_, OPTIONAL | ||
|
|
||
| Per-modality RoPE dimension sections. Only applicable when type is `"mrope"` (e.g., Qwen2-VL uses `[16, 24, 24]` for temporal, height, width dimensions). | ||
|
|
||
| ## Model Coverage | ||
|
|
||
| | Model | Encoder | Patch Size | Image Size | Projector | Fusion | Special Features | | ||
| |-------|---------|-----------|------------|-----------|--------|------------------| | ||
| | LLaVA 1.5 | CLIP-ViT-L/14 | 14 | 336 | 2-layer MLP | late | — | | ||
| | Qwen2-VL | ViT | 14 | dynamic | — | early | mRoPE, dynamic resolution, video | | ||
| | LLaMA-3.2 Vision | CLIP-ViT | 14 | 560 | cross-attention | cross_attention | Gated cross-attention | | ||
| | Gemma 2 VL | SigLIP | 14 | 224 | linear | late | — | | ||
|
|
||
| ## Example | ||
|
|
||
| ```json | ||
| { | ||
| "type": "clip_vit", | ||
| "hidden_size": 1024, | ||
| "patch_size": 14, | ||
| "image_size": 336, | ||
| "num_layers": 24, | ||
| "num_attention_heads": 16, | ||
| "intermediate_size": 4096, | ||
| "in_channels": 3, | ||
| "activation": "quick_gelu", | ||
| "norm": { | ||
| "type": "layernorm", | ||
| "epsilon": 1e-5 | ||
| }, | ||
| "projector": { | ||
| "type": "mlp", | ||
| "num_layers": 2, | ||
| "activation": "gelu" | ||
| }, | ||
| "special_tokens": { | ||
| "image_token_id": 32000 | ||
| }, | ||
| "fusion_type": "late", | ||
| "position_embedding": { | ||
| "type": "learned" | ||
| } | ||
| } | ||
| ``` | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
`special_tokens` properties are missing `vision_token_id`. Models like Qwen2-VL use this token, and it's mentioned in the PR description as one of the 5 special tokens for that model, but it's not included in the schema. Please add it to the properties list to ensure the schema is complete: