Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/workflows/windows_release_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@ on:
description: 'cuda version'
required: true
type: string
default: "128"
default: "129"

python_minor:
description: 'python minor version'
required: true
type: string
default: "12"
default: "13"

python_patch:
description: 'python patch version'
required: true
type: string
default: "10"
default: "6"
# push:
# branches:
# - master
Expand Down Expand Up @@ -64,6 +64,8 @@ jobs:
./python.exe get-pip.py
./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth

rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
cd ..

git clone --depth 1 https://github.com/comfyanonymous/taesd
Expand Down
19 changes: 19 additions & 0 deletions comfy/ldm/wan/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ def __init__(self,
cross_attn_norm=True,
eps=1e-6,
flf_pos_embed_token_number=None,
in_dim_ref_conv=None,
image_model=None,
device=None,
dtype=None,
Expand Down Expand Up @@ -484,6 +485,11 @@ def __init__(self,
else:
self.img_emb = None

if in_dim_ref_conv is not None:
self.ref_conv = operations.Conv2d(in_dim_ref_conv, dim, kernel_size=patch_size[1:], stride=patch_size[1:], device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
else:
self.ref_conv = None

def forward_orig(
self,
x,
Expand Down Expand Up @@ -526,6 +532,13 @@ def forward_orig(
e = e.reshape(t.shape[0], -1, e.shape[-1])
e0 = self.time_projection(e).unflatten(2, (6, self.dim))

full_ref = None
if self.ref_conv is not None:
full_ref = kwargs.get("reference_latent", None)
if full_ref is not None:
full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
x = torch.concat((full_ref, x), dim=1)

# context
context = self.text_embedding(context)

Expand All @@ -552,6 +565,9 @@ def block_wrap(args):
# head
x = self.head(x, e)

if full_ref is not None:
x = x[:, full_ref.shape[1]:]

# unpatchify
x = self.unpatchify(x, grid_sizes)
return x
Expand All @@ -570,6 +586,9 @@ def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, tra
x = torch.cat([x, time_dim_concat], dim=2)
t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0])

if self.ref_conv is not None and "reference_latent" in kwargs:
t_len += 1

img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
Expand Down
10 changes: 9 additions & 1 deletion comfy/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1124,7 +1124,11 @@ def concat_cond(self, **kwargs):
mask = mask.repeat(1, 4, 1, 1, 1)
mask = utils.resize_to_batch_size(mask, noise.shape[0])

return torch.cat((mask, image), dim=1)
concat_mask_index = kwargs.get("concat_mask_index", 0)
if concat_mask_index != 0:
return torch.cat((image[:, :concat_mask_index], mask, image[:, concat_mask_index:]), dim=1)
else:
return torch.cat((mask, image), dim=1)

def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
Expand All @@ -1140,6 +1144,10 @@ def extra_conds(self, **kwargs):
if time_dim_concat is not None:
out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat))

reference_latents = kwargs.get("reference_latents", None)
if reference_latents is not None:
out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])[:, :, 0])

return out


Expand Down
5 changes: 5 additions & 0 deletions comfy/model_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix))
if flf_weight is not None:
dit_config["flf_pos_embed_token_number"] = flf_weight.shape[1]

ref_conv_weight = state_dict.get('{}ref_conv.weight'.format(key_prefix))
if ref_conv_weight is not None:
dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1]

return dit_config

if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
Expand Down
2 changes: 1 addition & 1 deletion comfy_api/latest/_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
try:
import torchaudio
TORCH_AUDIO_AVAILABLE = True
except ImportError:
except:
TORCH_AUDIO_AVAILABLE = False
from PIL import Image as PILImage
from PIL.PngImagePlugin import PngInfo
Expand Down
58 changes: 58 additions & 0 deletions comfy_extras/nodes_wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,63 @@ def encode(self, positive, negative, vae, width, height, length, batch_size, sta
out_latent["samples"] = latent
return (positive, negative, out_latent)

class Wan22FunControlToVideo:
@classmethod
def INPUT_TYPES(s):
return {"required": {"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"vae": ("VAE", ),
"width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
},
"optional": {"ref_image": ("IMAGE", ),
"control_video": ("IMAGE", ),
# "start_image": ("IMAGE", ),
}}

RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
RETURN_NAMES = ("positive", "negative", "latent")
FUNCTION = "encode"

CATEGORY = "conditioning/video_models"

def encode(self, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None):
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)
mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))

if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(start_image[:, :, :, :3])
concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
mask[:, :, :start_image.shape[0] + 3] = 0.0

ref_latent = None
if ref_image is not None:
ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
ref_latent = vae.encode(ref_image[:, :, :, :3])

if control_video is not None:
control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(control_video[:, :, :, :3])
concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]

mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16})
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16})

if ref_latent is not None:
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)

out_latent = {}
out_latent["samples"] = latent
return (positive, negative, out_latent)

class WanFirstLastFrameToVideo:
@classmethod
def INPUT_TYPES(s):
Expand Down Expand Up @@ -733,6 +790,7 @@ def encode(self, vae, width, height, length, batch_size, start_image=None):
"WanTrackToVideo": WanTrackToVideo,
"WanImageToVideo": WanImageToVideo,
"WanFunControlToVideo": WanFunControlToVideo,
"Wan22FunControlToVideo": Wan22FunControlToVideo,
"WanFunInpaintToVideo": WanFunInpaintToVideo,
"WanFirstLastFrameToVideo": WanFirstLastFrameToVideo,
"WanVaceToVideo": WanVaceToVideo,
Expand Down
Loading