Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/Model Support.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
[Anima](#anima) | DiT | 2026 | Circlestone Labs | 2B | WTF | Modern, very small, decent for anime |
[ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast |
[HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality |
[Lens](#lens) | MMDiT | 2026 | Microsoft | 3.8B | Minimal | Modern, lightweight |

Old or bad options are also tracked, listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md):

Expand Down Expand Up @@ -618,6 +619,22 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
- **Dev Lora:**
- A dev lora can be downloaded here [Kijai/hidream-O1-image_comfy](<https://huggingface.co/Kijai/hidream-O1-image_comfy/resolve/main/loras/hidream_o1_dev_lora_rank_64_bf16_pruned_v1.safetensors>). It allows use of the base model with the distilled behavior from the Dev model. 8 steps will generate a coherent image of lower quality, 16 steps seems closer to original quality. Use CFG Scale 1.

# Lens

- Microsoft's [Lens](<https://huggingface.co/microsoft/Lens>) is supported in SwarmUI!
- It is a 3.8B model, with a base model and an official turbo distill designed to run fast.
- The "Turbo" model (in fat BF16) can be downloaded here [Comfy-Org/Lens - turbo](<https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/diffusion_models/lens_turbo_bf16.safetensors?download=true>)
- Or the base version (in fat BF16) [Comfy-Org/Lens - base](<https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/diffusion_models/lens_bf16.safetensors?download=true>)
- Save in `diffusion_models`
- Uses the Flux.2 VAE, which will be downloaded and handled automatically.
- Uses the GPT-OSS 20B text encoder, which will be downloaded and handled automatically.
- **Parameters:**
- **Sampler:** Default is fine.
- **Scheduler:** Default is fine.
- **CFG Scale:** For Turbo, use `1`. For Base, use normal CFG ranges (around `5`).
- **Steps:** For Turbo, `4` is recommended, and `8` also works well. For Base, use `20` as normal.
- **Resolution:** Side length `1440` is the standard.

# Video Models

- Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1080,6 +1080,7 @@ void copyParam<T>(T2IRegisteredParam<T> param)
copyParam(T2IParamTypes.QwenModel);
copyParam(T2IParamTypes.MistralModel);
copyParam(T2IParamTypes.GemmaModel);
copyParam(T2IParamTypes.GptOssModel);
}
WorkflowGenerator wg = new() { UserInput = input, ModelFolderFormat = ModelFolderFormat, Features = [.. SupportedFeatures] };
JObject workflow = wg.Generate();
Expand Down
2 changes: 1 addition & 1 deletion src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
}
}
// TODO: Registry of model default preferences instead of this
else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1())
else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens())
{
defscheduler ??= "simple";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ public bool IsKontext()
/// <summary>Checks whether the current model is in the HiDream-O1 Image compat class.</summary>
/// <returns>The result of the compat class check against <see cref="T2IModelClassSorter.CompatHiDreamO1"/>.</returns>
public bool IsHiDreamO1()
{
    return IsModelCompatClass(T2IModelClassSorter.CompatHiDreamO1);
}

/// <summary>Checks whether the current model is in the Lens compat class.</summary>
/// <returns>The result of the compat class check against <see cref="T2IModelClassSorter.CompatLens"/>.</returns>
public bool IsLens()
{
    return IsModelCompatClass(T2IModelClassSorter.CompatLens);
}

/// <summary>Returns true if the current model supports Flux Guidance.</summary>
public bool HasFluxGuidance()
{
Expand Down Expand Up @@ -269,7 +272,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n
["width"] = width
}, id));
}
else if (IsAnyFlux2() || IsErnie())
else if (IsAnyFlux2() || IsErnie() || IsLens())
{
return resultImage(CreateNode("EmptyFlux2LatentImage", new JObject()
{
Expand Down Expand Up @@ -598,6 +601,11 @@ public string GetMinistral3_3bModel()
return RequireClipModel("ministral-3-3b.safetensors", "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors", "49a750a128863854eac7d85e1a277a7b44bf6ec3646405b84686dfeeca3708ca", T2IParamTypes.MistralModel);
}

/// <summary>Gets the GPT-OSS 20B (MXFP4) text encoder used by Lens, by handing its default file name, download URL, expected hash, and the GPT-OSS model parameter to RequireClipModel.</summary>
/// <returns>Whatever RequireClipModel returns for this text encoder.</returns>
public string GetGptOss_20bModel()
{
    const string fileName = "gpt_oss_20b_mxfp4.safetensors";
    const string downloadUrl = "https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/text_encoders/gpt_oss_20b_mxfp4.safetensors";
    const string expectedHash = "f279cf3e73c494f78e0c5e4d35cf665068ae69672f7066813dbb75c021286856";
    return RequireClipModel(fileName, downloadUrl, expectedHash, T2IParamTypes.GptOssModel);
}

public string GetClipLModel()
{
if (g.UserInput.TryGet(T2IParamTypes.ClipLModel, out T2IModel model))
Expand Down Expand Up @@ -899,7 +907,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
{
dtype = "default";
}
else if (IsZImage() || IsZetaChroma() || IsAnima()) // Model is small and dense, so trust user preferred download format
else if (IsZImage() || IsZetaChroma() || IsAnima() || IsLens()) // Model is small and dense, so trust user preferred download format
{
dtype = "default";
}
Expand Down Expand Up @@ -1057,6 +1065,27 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
helpers.LoadClip("flux2", helpers.GetMinistral3_3bModel());
helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");
}
else if (IsLens())
{
helpers.LoadClip("lens", helpers.GetGptOss_20bModel());
helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");
string lensSamplingNode = CreateNode("ModelSamplingFlux", new JObject()
{
["model"] = LoadingModel,
["width"] = UserInput.GetImageWidth(),
["height"] = UserInput.GetImageHeight(),
["max_shift"] = UserInput.Get(T2IParamTypes.SigmaShift, 1.15, sectionId: sectionId),
["base_shift"] = 0.5
});
LoadingModel = [lensSamplingNode, 0];
string lensCfgNormNode = CreateNode("CFGNorm", new JObject()
{
["model"] = LoadingModel,
["strength"] = 1.0,
["pre_cfg"] = true
});
LoadingModel = [lensCfgNormNode, 0];
}
else if (IsFlux() && (LoadingClip is null || LoadingVAE is null || UserInput.Get(T2IParamTypes.T5XXLModel) is not null || UserInput.Get(T2IParamTypes.ClipLModel) is not null))
{
helpers.LoadClip2("flux", helpers.GetT5XXLModel(), helpers.GetClipLModel());
Expand Down
6 changes: 6 additions & 0 deletions src/Text2Image/T2IModelClassSorter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ public static T2IModelCompatClass
CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }),
CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }),
CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }),
CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }),
// Audio models
CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }),
// Obscure old random ones
Expand Down Expand Up @@ -158,6 +159,7 @@ bool isFluxLora(JObject h)
bool isFlux2KleinLora(JObject h) => hasLoraKey(h, "double_blocks.4.img_attn.proj") && hasLoraKey(h, "double_blocks.4.txt_mlp.2") && hasLoraKey(h, "single_blocks.18.linear1") && hasLoraKey(h, "single_blocks.19.linear2");
bool isFlux2Klein9BLora(JObject h) => hasLoraKey(h, "single_blocks.23.linear1");
bool isFlux2DevLora(JObject h) => hasLoraKey(h, "single_blocks.47.linear2");
// A header is treated as Lens only when both of these exact transformer block weight keys are present.
bool isLens(JObject h)
{
    bool hasAddedQNorm = h.ContainsKey("transformer_blocks.0.attn.norm_added_q.weight");
    return hasAddedQNorm && h.ContainsKey("transformer_blocks.0.img_mlp.w1.weight");
}
bool isSD35Lora(JObject h) => h.ContainsKey("transformer.transformer_blocks.0.attn.to_k.lora_A.weight") && h.ContainsKey("transformer.transformer_blocks.37.attn.to_out.0.lora_B.weight");
bool isMochi(JObject h) => hasKey(h, "blocks.0.attn.k_norm_x.weight");
bool isMochiVae(JObject h) => h.ContainsKey("encoder.layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("blocks.2.blocks.3.stack.5.weight") || h.ContainsKey("decoder.blocks.2.blocks.3.stack.5.weight");
Expand Down Expand Up @@ -478,6 +480,10 @@ JToken GetEmbeddingKey(JObject h)
{
return isFlux2KleinLora(h) && isFlux2Klein9BLora(h) && !isFlux2DevLora(h);
}});
Register(new() { ID = "lens", CompatClass = CompatLens, Name = "Lens", StandardWidth = 1440, StandardHeight = 1440, IsThisModelOfClass = (m, h) =>
{
return isLens(h);
}});
// ====================== Wan Video ======================
Register(new() { ID = "wan-2_1-text2video/vae", CompatClass = CompatWan21, Name = "Wan 2.1 VAE", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) => { return false; }});
Register(new() { ID = "wan-2_1-text2video-1_3b", CompatClass = CompatWan21_1_3b, Name = "Wan 2.1 Text2Video 1.3B", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) =>
Expand Down
5 changes: 4 additions & 1 deletion src/Text2Image/T2IParamTypes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ public static string ApplyStringEdit(string prior, string update)
FreeUBlock1, FreeUBlock2, FreeUSkip1, FreeUSkip2, GlobalRegionFactor, EndStepsEarly, SamplerSigmaMin, SamplerSigmaMax, SamplerRho, VideoAugmentationLevel, VideoCFG, VideoMinCFG, Video2VideoCreativity, VideoSwapPercent, VideoExtendSwapPercent, IP2PCFG2, RegionalObjectCleanupFactor, SigmaShift, SegmentThresholdMax, SegmentCFGScale, FluxGuidanceScale, Text2AudioDuration;
public static T2IRegisteredParam<Image> InitImage, MaskImage, VideoEndFrame;
public static T2IRegisteredParam<AudioFile> VideoAudioInput, VideoAudioReference;
public static T2IRegisteredParam<T2IModel> Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, VideoExtendModel, VideoExtendSwapModel;
public static T2IRegisteredParam<T2IModel> Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, GptOssModel, VideoExtendModel, VideoExtendSwapModel;
public static T2IRegisteredParam<List<string>> Loras, LoraWeights, LoraTencWeights, LoraSectionConfinement;
public static T2IRegisteredParam<List<Image>> PromptImages;
public static T2IRegisteredParam<bool> OutputIntermediateImages, DoNotSave, DoNotSaveIntermediates, ControlNetPreviewOnly, RevisionZeroPrompt, RemoveBackground, NoSeedIncrement, NoPreviews, VideoBoomerang, ModelSpecificEnhancements, UseInpaintingEncode, MaskCompositeUnthresholded, SaveSegmentMask, InitImageRecompositeMask, UseReferenceOnly, RefinerDoTiling, AutomaticVAE, ZeroNegative, FluxDisableGuidance, SmartImagePromptResizing, NoLoadModels, NoInternalSpecialHandling, ForwardRawBackendData, ForwardSwarmData,
Expand Down Expand Up @@ -715,6 +715,9 @@ static List<string> listVaes(Session s)
GemmaModel = Register<T2IModel>(new("Gemma Model", "Which Gemma LLM to use as a text encoder, for models that use Gemma (such as Lumina2, LTX2).",
"", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7
));
GptOssModel = Register<T2IModel>(new("GPT-OSS Model", "Which GPT-OSS LLM to use as a text encoder, for Lens-style 'diffusion_models' folder models.",
"", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7
));
TorchCompile = Register<string>(new("Torch Compile", "Torch.Compile is a way to dynamically accelerate AI models.\nIt wastes a bit of time (around a minute) on the first call compiling a graph of the generation, and then all subsequent generations run faster thanks to the compiled graph.\nTorch.Compile depends on Triton, which is difficult to install on Windows, easier on Linux.",
"Disabled", IgnoreIf: "Disabled", GetValues: _ => ["Disabled", "inductor", "cudagraphs"], OrderPriority: 40, Group: GroupAdvancedModelAddons
));
Expand Down
Loading