code · pull · Feb 27, 2026 · Feb 26, 2026 · Feb 26, 2026 · Feb 27, 2026
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -16,7 +16,7 @@ body:
 
         ## Very Important
 
-        Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored.
+        Please make sure that you post ALL your ComfyUI logs in the bug report **even if there is no crash**. Just paste everything. The startup log (everything before "To see the GUI go to: ...") contains critical information to developers trying to help. For a performance issue or crash, paste everything from "got prompt" to the end, including the crash. More is better - always. A bug report without logs will likely be ignored.
   - type: checkboxes
     id: custom-nodes-test
     attributes:

diff --git a/comfy/ldm/modules/diffusionmodules/openaimodel.py b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@@ -18,6 +18,8 @@
 import comfy.ops
 ops = comfy.ops.disable_weight_init
 
+from ..sdpose import HeatmapHead
+
 class TimestepBlock(nn.Module):
     """
     Any module where forward() takes timestep embeddings as a second argument.
@@ -441,6 +443,7 @@ def __init__(
         disable_temporal_crossattention=False,
         max_ddpm_temb_period=10000,
         attn_precision=None,
+        heatmap_head=False,
         device=None,
         operations=ops,
     ):
@@ -827,6 +830,9 @@ def get_resblock(
             #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
         )
 
+        if heatmap_head:
+            self.heatmap_head = HeatmapHead(device=device, dtype=self.dtype, operations=operations)
+
     def forward(self, x, timesteps=None, context=None, y=None, control=None, transformer_options={}, **kwargs):
         return comfy.patcher_extension.WrapperExecutor.new_class_executor(
             self._forward,

diff --git a/comfy/ldm/modules/sdpose.py b/comfy/ldm/modules/sdpose.py
@@ -0,0 +1,130 @@
+import torch
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+class HeatmapHead(torch.nn.Module):
+    def __init__(
+            self,
+            in_channels=640,
+            out_channels=133,
+            input_size=(768, 1024),
+            heatmap_scale=4,
+            deconv_out_channels=(640,),
+            deconv_kernel_sizes=(4,),
+            conv_out_channels=(640,),
+            conv_kernel_sizes=(1,),
+            final_layer_kernel_size=1,
+            device=None, dtype=None, operations=None
+        ):
+        super().__init__()
+
+        self.heatmap_size = (input_size[0] // heatmap_scale, input_size[1] // heatmap_scale)
+        self.scale_factor = ((np.array(input_size) - 1) / (np.array(self.heatmap_size) - 1)).astype(np.float32)
+
+        # Deconv layers
+        if deconv_out_channels:
+            deconv_layers = []
+            for out_ch, kernel_size in zip(deconv_out_channels, deconv_kernel_sizes):
+                if kernel_size == 4:
+                    padding, output_padding = 1, 0
+                elif kernel_size == 3:
+                    padding, output_padding = 1, 1
+                elif kernel_size == 2:
+                    padding, output_padding = 0, 0
+                else:
+                    raise ValueError(f'Unsupported kernel size {kernel_size}')
+
+                deconv_layers.extend([
+                    operations.ConvTranspose2d(in_channels, out_ch, kernel_size,
+                                     stride=2, padding=padding, output_padding=output_padding, bias=False, device=device, dtype=dtype),
+                    torch.nn.InstanceNorm2d(out_ch, device=device, dtype=dtype),
+                    torch.nn.SiLU(inplace=True)
+                ])
+                in_channels = out_ch
+            self.deconv_layers = torch.nn.Sequential(*deconv_layers)
+        else:
+            self.deconv_layers = torch.nn.Identity()
+
+        # Conv layers
+        if conv_out_channels:
+            conv_layers = []
+            for out_ch, kernel_size in zip(conv_out_channels, conv_kernel_sizes):
+                padding = (kernel_size - 1) // 2
+                conv_layers.extend([
+                    operations.Conv2d(in_channels, out_ch, kernel_size,
+                            stride=1, padding=padding, device=device, dtype=dtype),
+                    torch.nn.InstanceNorm2d(out_ch, device=device, dtype=dtype),
+                    torch.nn.SiLU(inplace=True)
+                ])
+                in_channels = out_ch
+            self.conv_layers = torch.nn.Sequential(*conv_layers)
+        else:
+            self.conv_layers = torch.nn.Identity()
+
+        self.final_layer = operations.Conv2d(in_channels, out_channels, kernel_size=final_layer_kernel_size, padding=final_layer_kernel_size // 2, device=device, dtype=dtype)
+
+    def forward(self, x): # Decode heatmaps to keypoints
+        heatmaps = self.final_layer(self.conv_layers(self.deconv_layers(x)))
+        heatmaps_np = heatmaps.float().cpu().numpy()  # (B, K, H, W)
+        B, K, H, W = heatmaps_np.shape
+
+        batch_keypoints = []
+        batch_scores = []
+
+        for b in range(B):
+            hm = heatmaps_np[b].copy()  # (K, H, W)
+
+            # --- vectorised argmax ---
+            flat = hm.reshape(K, -1)
+            idx = np.argmax(flat, axis=1)
+            scores = flat[np.arange(K), idx].copy()
+            y_locs, x_locs = np.unravel_index(idx, (H, W))
+            keypoints = np.stack([x_locs, y_locs], axis=-1).astype(np.float32)  # (K, 2) in heatmap space
+            invalid = scores <= 0.
+            keypoints[invalid] = -1
+
+            # --- DARK sub-pixel refinement (UDP) ---
+            # 1. Gaussian blur with max-preserving normalisation
+            border = 5  # (kernel-1)//2 for kernel=11
+            for k in range(K):
+                origin_max = np.max(hm[k])
+                dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
+                dr[border:-border, border:-border] = hm[k].copy()
+                dr = gaussian_filter(dr, sigma=2.0)
+                hm[k] = dr[border:-border, border:-border].copy()
+                cur_max = np.max(hm[k])
+                if cur_max > 0:
+                    hm[k] *= origin_max / cur_max
+            # 2. Log-space for Taylor expansion
+            np.clip(hm, 1e-3, 50., hm)
+            np.log(hm, hm)
+            # 3. Hessian-based Newton step
+            hm_pad = np.pad(hm, ((0, 0), (1, 1), (1, 1)), mode='edge').flatten()
+            index = keypoints[:, 0] + 1 + (keypoints[:, 1] + 1) * (W + 2)
+            index += (W + 2) * (H + 2) * np.arange(0, K)
+            index = index.astype(int).reshape(-1, 1)
+            i_       = hm_pad[index]
+            ix1      = hm_pad[index + 1]
+            iy1      = hm_pad[index + W + 2]
+            ix1y1    = hm_pad[index + W + 3]
+            ix1_y1_  = hm_pad[index - W - 3]
+            ix1_     = hm_pad[index - 1]
+            iy1_     = hm_pad[index - 2 - W]
+            dx = 0.5 * (ix1 - ix1_)
+            dy = 0.5 * (iy1 - iy1_)
+            derivative = np.concatenate([dx, dy], axis=1).reshape(K, 2, 1)
+            dxx = ix1  - 2 * i_ + ix1_
+            dyy = iy1  - 2 * i_ + iy1_
+            dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
+            hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1).reshape(K, 2, 2)
+            hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))
+            keypoints -= np.einsum('imn,ink->imk', hessian, derivative).squeeze(axis=-1)
+
+            # --- restore to input image space ---
+            keypoints = keypoints * self.scale_factor
+            keypoints[invalid] = -1
+
+            batch_keypoints.append(keypoints)
+            batch_scores.append(scores)
+
+        return batch_keypoints, batch_scores
diff --git a/comfy/lora.py b/comfy/lora.py
@@ -337,6 +337,7 @@ def model_lora_keys_unet(model, key_map={}):
             if k.startswith("diffusion_model.decoder.") and k.endswith(".weight"):
                 key_lora = k[len("diffusion_model.decoder."):-len(".weight")]
                 key_map["base_model.model.{}".format(key_lora)] = k  # Official base model loras
+                key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k  # LyCORIS/LoKR format
 
     return key_map
 

diff --git a/comfy/model_detection.py b/comfy/model_detection.py
@@ -795,6 +795,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         unet_config["use_temporal_resblock"] = False
         unet_config["use_temporal_attention"] = False
 
+    heatmap_key = '{}heatmap_head.conv_layers.0.weight'.format(key_prefix)
+    if heatmap_key in state_dict_keys:
+        unet_config["heatmap_head"] = True
+
     return unet_config
 
 def model_config_from_unet_config(unet_config, state_dict=None):
@@ -1015,7 +1019,7 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
 
     LotusD = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': 4,
             'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
-            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, 'num_heads': 8,
+            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64,
             'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
             'use_temporal_attention': False, 'use_temporal_resblock': False}
 

diff --git a/comfy/model_management.py b/comfy/model_management.py
@@ -350,7 +350,7 @@ def amd_min_version(device=None, min_rdna_version=0):
 
 try:
     if is_amd():
-        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName.split(':')[0]
         if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
             if os.getenv(AMD_ENABLE_MIOPEN_ENV) != '1':
                 torch.backends.cudnn.enabled = False  # Seems to improve things a lot on AMD
@@ -378,7 +378,7 @@ def aotriton_supported(gpu_arch):
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
             if aotriton_supported(arch):  # AMD efficient attention implementation depends on aotriton.
                 if torch_version_numeric >= (2, 7):  # works on 2.6 but doesn't actually seem to improve much
-                    if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]):  # TODO: more arches, TODO: gfx950
+                    if any((a in arch) for a in ["gfx90a", "gfx942", "gfx950", "gfx1100", "gfx1101", "gfx1151"]):  # TODO: more arches, TODO: gfx950
                         ENABLE_PYTORCH_ATTENTION = True
                 if rocm_version >= (7, 0):
                    if any((a in arch) for a in ["gfx1200", "gfx1201"]):

diff --git a/comfy/supported_models.py b/comfy/supported_models.py
@@ -525,7 +525,8 @@ class LotusD(SD20):
     }
 
     unet_extra_config = {
-        "num_classes": 'sequential'
+        "num_classes": 'sequential',
+        "num_head_channels": 64,
     }
 
     def get_model(self, state_dict, prefix="", device=None):

diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
@@ -1224,16 +1224,19 @@ class BoundingBoxDict(TypedDict):
 
     class Input(WidgetInput):
         def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
-                     socketless: bool=True, default: dict=None, component: str=None):
+                     socketless: bool=True, default: dict=None, component: str=None, force_input: bool=None):
             super().__init__(id, display_name, optional, tooltip, None, default, socketless)
             self.component = component
+            self.force_input = force_input
             if default is None:
                 self.default = {"x": 0, "y": 0, "width": 512, "height": 512}
 
         def as_dict(self):
             d = super().as_dict()
             if self.component:
                 d["component"] = self.component
+            if self.force_input is not None:
+                d["forceInput"] = self.force_input
             return d