Commits (40)
387ad2a
Add KaSA implementation to layer.py
iambogeumkim May 17, 2025
ae00a34
Add `use_kasa` argument to LoraConfig
iambogeumkim Aug 2, 2025
6588f1a
Add use_kasa parameter to Linear class
iambogeumkim Sep 2, 2025
a824ac9
Add KasaLinearVariant class (just copy of DoraLinearVariant class) in…
iambogeumkim Sep 2, 2025
05e4e07
Add kasa description
iambogeumkim Sep 2, 2025
d1e7e43
Remove unnecessary self.kasa
iambogeumkim Sep 4, 2025
9e53b9f
[WIP] update KasaLinearVariant class with SVD implementation
iambogeumkim Sep 8, 2025
aa37111
Modify merge/unmerge method in KasaLinearVariant class
iambogeumkim Sep 8, 2025
9cfe65c
update KasaLinearVariant class with SVD implementation
iambogeumkim Sep 8, 2025
f9d7cc7
fix type in init method
iambogeumkim Sep 8, 2025
84813a3
delete unnecessary part in layer.py
iambogeumkim Sep 16, 2025
39abcad
add original reference in layer.py
iambogeumkim Sep 16, 2025
06f76d8
merge main to peft-kasa
iambogeumkim Sep 16, 2025
0043ae3
re-add KaSA implementation to variants.py
iambogeumkim Sep 20, 2025
ea59432
add use_kasa param to resolve_lora_variant in other layers
iambogeumkim Sep 27, 2025
574c1b8
delete unnecessary part in layer.py
iambogeumkim Sep 29, 2025
73fa58f
delete unnecessary part in variants.py
iambogeumkim Sep 29, 2025
b9e3190
add _get_delta_weight static method in KasaLinearVariants class
iambogeumkim Oct 4, 2025
2649fdf
update module.get_delta_weight to KasaLinearVariant._get_delta_weight…
iambogeumkim Oct 7, 2025
d06cafd
add kasa test
iambogeumkim Oct 7, 2025
4ee5da1
add dropout
iambogeumkim Oct 8, 2025
0377170
Update tests/test_custom_models.py
iambogeumkim Oct 11, 2025
a536bbf
Update src/peft/tuners/lora/variants.py
iambogeumkim Oct 11, 2025
f8d8057
Update src/peft/tuners/lora/variants.py
iambogeumkim Oct 11, 2025
cd57c7b
Update src/peft/tuners/lora/variants.py
iambogeumkim Oct 11, 2025
2431a2c
add use_kasa param in LoraModel class
iambogeumkim Oct 11, 2025
7276b3b
restore output_tensor variable in Linear class get_delta_weight method
iambogeumkim Oct 11, 2025
5a67b1f
add use_kasa handling condition in resolve_lora_variant method
iambogeumkim Oct 11, 2025
3ec6b18
fix KaSA self, mat1, mat2 dtype error
iambogeumkim Oct 11, 2025
461a89c
fix make style error
iambogeumkim Oct 11, 2025
6ed64c1
add _skip_test_disable_adapters function
iambogeumkim Oct 18, 2025
92061dd
add KaSA compatibility test
iambogeumkim Oct 21, 2025
9a2cf71
update _check_new_adapter_config method with KaSA
iambogeumkim Oct 24, 2025
39cf1f9
Implement tests to ensure KaSA adapters cannot be mixed with other ad…
iambogeumkim Dec 6, 2025
283ff0a
Refactor KaSA adapter compatibility check to simplify logic and impro…
iambogeumkim Dec 6, 2025
bfe8996
Refactor KasaLinearVariant class to improve code readability and ensu…
iambogeumkim Dec 6, 2025
cbc5b0c
Merge branch 'main' into peft-kasa
iambogeumkim Dec 6, 2025
14fa9d7
Merge branch 'main' into peft-kasa
iambogeumkim Dec 8, 2025
b6aae1e
Remove tests for mixing KaSA adapters with other adapter types in Tes…
iambogeumkim Dec 8, 2025
951b6b2
Add tests to validate that KaSA adapters cannot be mixed with other a…
iambogeumkim Dec 8, 2025
10 changes: 10 additions & 0 deletions src/peft/tuners/lora/config.py
@@ -675,6 +675,16 @@ class LoraConfig(PeftConfig):
},
)

use_kasa: bool = field(
default=False,
metadata={
"help": (
"Enable <a href='https://arxiv.org/abs/2412.06071'>'Knowledge-Aware Singular-Value Adaptation of Large Language Models' (KaSA)</a>. This technique leverages "
"singular value decomposition (SVD) with knowledge-aware singular values to dynamically "
"activate parametric knowledge according to its relevance to downstream tasks."
)
}
)
def to_dict(self):
"""
Returns the configuration for your adapter model as a dictionary. Removes runtime configurations.
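For orientation, a minimal usage sketch of the new flag, assuming the use_kasa field lands as shown in the diff above; the base model and target modules below are illustrative and not taken from this PR:

from transformers import AutoModelForCausalLM

from peft import LoraConfig, get_peft_model

# Illustrative base model; any model whose target modules are nn.Linear layers would do
base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

config = LoraConfig(
    r=8,
    target_modules=["q_proj", "v_proj"],  # illustrative target modules
    use_kasa=True,  # enable the KaSA variant added in this PR
)
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
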
34 changes: 23 additions & 11 deletions src/peft/tuners/lora/layer.py
@@ -127,7 +127,7 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, *
self.in_features = in_features
self.out_features = out_features

def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
"""Return a matching LoRA variant for this layer type.

Given the init arguments of this layer, return the correct LoRA variant, if any. E.g., if `use_dora=True`, this
Expand All @@ -150,6 +150,7 @@ def update_layer(
init_lora_weights,
use_rslora,
use_dora: bool = False,
use_kasa: bool = False,
use_alora: bool = False,
use_qalora: bool = False,
lora_bias: bool = False,
@@ -175,11 +176,13 @@

lora_variant = self.resolve_lora_variant(
use_dora=use_dora,
use_kasa=use_kasa,
use_alora=use_alora,
use_qalora=use_qalora,
qalora_group_size=qalora_group_size,
arrow_config=arrow_config,
)

if lora_variant is not None:
self.lora_variant[adapter_name] = lora_variant

@@ -610,6 +613,7 @@ def __init__(
init_lora_weights: Union[bool, str] = True,
use_rslora: bool = False,
use_dora: bool = False,
use_kasa: bool = False,
use_alora: bool = False,
arrow_config: ArrowConfig = None,
lora_bias: bool = False,
@@ -628,27 +632,30 @@
init_lora_weights=init_lora_weights,
use_rslora=use_rslora,
use_dora=use_dora,
use_kasa=use_kasa,
use_alora=use_alora,
lora_bias=lora_bias,
arrow_config=arrow_config,
)
self.is_target_conv_1d_layer = is_target_conv_1d_layer

def resolve_lora_variant(
self, *, arrow_config: ArrowConfig, use_dora: bool, use_alora: bool, **kwargs
self, *, arrow_config: ArrowConfig, use_dora: bool, use_alora: bool, use_kasa: bool, **kwargs
) -> Optional[LoraVariant]:
if arrow_config is not None:
from .variants import ArrowLinearVariant

return ArrowLinearVariant()

if not use_dora and not use_alora:
if not use_dora and not use_alora and not use_kasa:
return None

from .variants import ALoraLinearVariant, DoraLinearVariant
from .variants import ALoraLinearVariant, DoraLinearVariant, KasaLinearVariant

if use_alora:
return ALoraLinearVariant()
elif use_kasa:
return KasaLinearVariant()
else:
return DoraLinearVariant()

@@ -862,7 +869,7 @@ def __init__(
arrow_config=arrow_config,
)

def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
if not use_dora:
return None

@@ -879,6 +886,7 @@ def update_layer(
init_lora_weights,
use_rslora,
use_dora,
use_kasa,
lora_bias,
arrow_config: ArrowConfig = None,
inference_mode: bool = False,
@@ -891,7 +899,8 @@
if r <= 0:
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config)
lora_variant = self.resolve_lora_variant(use_dora=use_dora, use_kasa=use_kasa, arrow_config=arrow_config)

if lora_variant is not None:
self.lora_variant[adapter_name] = lora_variant

@@ -1147,6 +1156,7 @@ def __init__(
init_lora_weights: Union[bool, str] = True,
use_rslora: bool = False,
use_dora: bool = False,
use_kasa: bool = False,
arrow_config: ArrowConfig = None,
lora_bias: bool = False,
**kwargs,
@@ -1189,6 +1199,7 @@ def update_layer(
init_lora_weights,
use_rslora,
use_dora,
use_kasa,
lora_bias,
arrow_config: ArrowConfig = None,
inference_mode: bool = False,
@@ -1208,7 +1219,7 @@
PeftWarning,
)

lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config)
lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config, use_kasa=use_kasa)
if lora_variant is not None:
self.lora_variant[adapter_name] = lora_variant

@@ -1452,7 +1463,7 @@ def __init__(self, *args, **kwargs):
raise ValueError(f"Conv2d layer kernel must have 4 dimensions, not {self._kernel_dim}")
self.conv_fn = F.conv2d

def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
if not use_dora:
return None

@@ -1469,7 +1480,7 @@ def __init__(self, *args, **kwargs):
raise ValueError(f"Conv1d layer kernel must have 3 dimensions, not {self._kernel_dim}")
self.conv_fn = F.conv1d

def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
if not use_dora:
return None

@@ -1486,7 +1497,7 @@ def __init__(self, *args, **kwargs):
raise ValueError(f"Conv3d layer kernel must have 5 dimensions, not {self._kernel_dim}")
self.conv_fn = F.conv3d

def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
if not use_dora:
return None

@@ -1969,6 +1980,7 @@ def update_layer(
init_lora_weights,
use_rslora,
use_dora: bool = False,
use_kasa: bool = False,
use_qalora: bool = False,
lora_bias: bool = False,
qalora_group_size: int = 32,
@@ -1985,7 +1997,7 @@
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

lora_variant = self.resolve_lora_variant(
use_dora=use_dora, use_qalora=use_qalora, qalora_group_size=qalora_group_size
use_dora=use_dora, use_qalora=use_qalora, qalora_group_size=qalora_group_size, use_kasa=use_kasa
)
if lora_variant is not None:
raise ValueError(f"lora.{self.__class__.__name__} does not work with LoRA variants like DoRA.")
25 changes: 25 additions & 0 deletions src/peft/tuners/lora/model.py
@@ -155,6 +155,29 @@ class LoraModel(BaseTuner):
prefix: str = "lora_"
tuner_layer_cls = LoraLayer
target_module_mapping = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

def _check_new_adapter_config(self, config: LoraConfig) -> None:
"""
A helper method to check the config when a new adapter is being added.

Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.

"""
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
# does not fully correspond to the error message.
if (len(self.peft_config) > 1) and (config.bias != "none"):
raise ValueError(
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
"set bias to 'none' for all adapters."
)
Review comment on lines +168 to +172 (Member): Let's remove this and call super()._check_new_adapter_config(config) instead.
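A hedged sketch of what that suggestion would look like, assuming the parent tuner class already implements the shared bias check in its own _check_new_adapter_config:

    def _check_new_adapter_config(self, config: LoraConfig) -> None:
        # Delegate the generic bias check to the parent class instead of duplicating it here
        super()._check_new_adapter_config(config)

        # Keep only the KaSA-specific rule in this subclass: adapters must be all KaSA or all non-KaSA
        if len(self.peft_config) > 1:
            kasa_count = sum(1 for cfg in self.peft_config.values() if cfg.use_kasa)
            if 0 < kasa_count < len(self.peft_config):
                raise ValueError("KaSA adapters cannot be mixed with other adapter types.")
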
# Check KaSA adapter compatibility (only when adding additional adapters)
if len(self.peft_config) > 1:
kasa_count = sum(1 for cfg in self.peft_config.values() if cfg.use_kasa)
non_kasa_count = len(self.peft_config) - kasa_count

if kasa_count > 0 and non_kasa_count > 0:
raise ValueError("KaSA adapters cannot be mixed with other adapter types.")

def _prepare_model(self, peft_config: LoraConfig, model: nn.Module):
r"""
@@ -211,6 +234,7 @@ def _create_and_replace(
"use_dora": lora_config.use_dora,
"use_alora": lora_config.alora_invocation_tokens is not None,
"use_qalora": lora_config.use_qalora,
"use_kasa": lora_config.use_kasa,
"qalora_group_size": lora_config.qalora_group_size,
"ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload,
"lora_bias": lora_config.lora_bias,
@@ -248,6 +272,7 @@ def _create_and_replace(
init_lora_weights=lora_config.init_lora_weights,
use_rslora=lora_config.use_rslora,
use_dora=lora_config.use_dora,
use_kasa=lora_config.use_kasa,
lora_bias=lora_config.lora_bias,
arrow_config=lora_config.arrow_config,
inference_mode=lora_config.inference_mode,
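To illustrate the compatibility check above, a hedged sketch of the failure mode it guards against; the toy model and the adapter name "kasa" are made up for the example:

import torch.nn as nn

from peft import LoraConfig, get_peft_model

# A toy model with a single Linear layer, whose module name is "0"
base = nn.Sequential(nn.Linear(16, 16))
peft_model = get_peft_model(base, LoraConfig(target_modules=["0"]))

# Adding a KaSA adapter on top of the existing plain-LoRA adapter should hit the new check
try:
    peft_model.add_adapter("kasa", LoraConfig(target_modules=["0"], use_kasa=True))
except ValueError as err:
    print(err)  # expected: KaSA adapters cannot be mixed with other adapter types.
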
110 changes: 110 additions & 0 deletions src/peft/tuners/lora/variants.py
@@ -447,6 +447,116 @@ def init(module: Conv3d, adapter_name: str, **kwargs: Any) -> None:
_DoraConvNdVariant.init_convd_variant(module, adapter_name, dora_layer=dora_layer)


class KasaLinearVariant(LoraVariant):
@staticmethod
def init(module: Linear, adapter_name: str, **kwargs: Any) -> None:
if not hasattr(module, "lora_diag"):
module.lora_diag = nn.ParameterDict()
module.adapter_layer_names = module.adapter_layer_names[:] + ("lora_diag",)

# Initialize lora_diag with the same dtype as the base layer
base_dtype = module.get_base_layer().weight.dtype
module.lora_diag[adapter_name] = nn.Parameter(
torch.randn(module.r[adapter_name], dtype=base_dtype), requires_grad=True
)

# see https://github.com/juyongjiang/KaSA/blob/f85e88c22d0fa4cb8ab2923d7c2bf1bbec152da3/peft/src/peft/tuners/lora/layer.py#L132
if not getattr(module, "_kasa_svd_applied", False):
weight = module.get_base_layer().weight
dtype = weight.dtype
svd_rank = module.in_features - module.r[adapter_name]
weight = weight.to(torch.float32)
U, S, Vh = torch.linalg.svd(weight.data, full_matrices=False)
U_principle, S_principle, Vh_principle = U[:, :svd_rank], S[:svd_rank], Vh[:svd_rank, :]
reconstructed_weight = U_principle @ torch.diag(S_principle) @ Vh_principle
module.get_base_layer().weight.data = reconstructed_weight.to(dtype)
module._kasa_svd_applied = True

@staticmethod
def _get_delta_weight(weight_A, weight_B, lora_diag, scaling, fan_in_fan_out):
# Ensure all tensors have the same dtype
target_dtype = weight_A.dtype
weight_B = weight_B.to(target_dtype)
lora_diag = lora_diag.to(target_dtype)

diag = torch.diag(lora_diag)
delta = weight_B @ diag @ weight_A
if fan_in_fan_out:
delta = delta.transpose(0, 1)
delta = delta * scaling
return delta

@staticmethod
def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor:
delta_weight = KasaLinearVariant._get_delta_weight(
module.lora_A[active_adapter].weight,
module.lora_B[active_adapter].weight,
module.lora_diag[active_adapter],
module.scaling[active_adapter],
module.fan_in_fan_out,
)
return orig_weight + delta_weight

@staticmethod
def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None:
delta_weight = KasaLinearVariant._get_delta_weight(
module.lora_A[active_adapter].weight,
module.lora_B[active_adapter].weight,
module.lora_diag[active_adapter],
module.scaling[active_adapter],
module.fan_in_fan_out,
)
orig_weight.data += delta_weight

@staticmethod
def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor:
delta_weight = KasaLinearVariant._get_delta_weight(
module.lora_A[active_adapter].weight,
module.lora_B[active_adapter].weight,
module.lora_diag[active_adapter],
module.scaling[active_adapter],
module.fan_in_fan_out,
)
return orig_weight - delta_weight

@staticmethod
def forward(module: Linear, active_adapter: str, x: torch.Tensor, result: torch.Tensor, **kwargs) -> torch.Tensor:
# Check if adapters are disabled
if module.disable_adapters:
return result

lora_A = module.lora_A[active_adapter]
lora_B = module.lora_B[active_adapter]
dropout = module.lora_dropout[active_adapter]
scaling = module.scaling[active_adapter]
diag = torch.diag(module.lora_diag[active_adapter])

# KaSA calculation
# see https://github.com/juyongjiang/KaSA/blob/f85e88c22d0fa4cb8ab2923d7c2bf1bbec152da3/peft/src/peft/tuners/lora/layer.py#L602C21-L602C110

# Ensure all tensors have the same dtype as the result
target_dtype = result.dtype
x = x.to(target_dtype)
diag = diag.to(target_dtype)

# Convert LoRA weights to target dtype
lora_A.weight.data = lora_A.weight.data.to(target_dtype)
lora_B.weight.data = lora_B.weight.data.to(target_dtype)

lora_A_output = lora_A(dropout(x))

if x.ndim == 3:
einsum_output = torch.einsum("ijk,kl->ijl", lora_A_output, diag)
lora_output = lora_B(einsum_output) * scaling
elif x.ndim == 2:
matmul_output = lora_A_output @ diag
lora_output = lora_B(matmul_output) * scaling
else:
raise ValueError(f"Using KaSA with inputs of shape {x.ndim} is not supported, only 2 or 3 dims.")

return result + lora_output


class QALoraLinearVariant(LoraVariant):
@staticmethod
def init(module: Linear, adapter_name: str, **kwargs: Any) -> None:
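For intuition about the arithmetic above: KasaLinearVariant truncates the base weight's SVD to its top in_features - r singular components, then adds a low-rank update gated by the learnable lora_diag vector. A standalone numeric sketch with made-up square shapes; it mirrors init, _get_delta_weight, and merge_safe but is not the PR's API:

import torch

d, r, scaling = 64, 4, 2.0
W = torch.randn(d, d)  # stand-in for the base Linear weight

# SVD truncation of the base weight to rank d - r, as in KasaLinearVariant.init
U, S, Vh = torch.linalg.svd(W, full_matrices=False)
k = d - r
W_base = U[:, :k] @ torch.diag(S[:k]) @ Vh[:k, :]

# Low-rank KaSA update B @ diag(lora_diag) @ A, scaled, as in _get_delta_weight
lora_A = torch.randn(r, d)
lora_B = torch.randn(d, r)
lora_diag = torch.randn(r)
delta = (lora_B @ torch.diag(lora_diag) @ lora_A) * scaling

# merge_safe returns the sum; forward computes the same contribution on activations
W_effective = W_base + delta
print(W_effective.shape)  # torch.Size([64, 64])
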
6 changes: 6 additions & 0 deletions tests/test_custom_models.py
@@ -164,6 +164,8 @@
LoraConfig,
{"target_modules": ["lin0"], "target_parameters": ["lin1.weight"]},
),
("Vanilla MLP 7 LoRA with KaSA", "MLP", LoraConfig, {"target_modules": ["lin0"], "use_kasa": True}),
("Vanilla MLP 8 LoRA with KaSA", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"], "use_kasa": True}),
#######
# IA³ #
#######
@@ -1231,6 +1233,9 @@ def _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config
if (config_cls == LoraConfig) and config_kwargs.get("target_parameters"):
pytest.skip("LoRA with multiple adapters with target_parameters is not supported")

def _skip_test_disable_adapters(config_cls, config_kwargs):
if (config_cls == LoraConfig) and config_kwargs.get("use_kasa"):
pytest.skip("KaSA modifies base weights, so adapter disable test is skipped")

class MLP(nn.Module):
def __init__(self, bias=True):
@@ -2138,6 +2143,7 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c
def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
# Test that it's possible to disable the adapter, in which case the model output should be identical to that of
# the base model.
_skip_test_disable_adapters(config_cls, config_kwargs)
X = self.prepare_inputs_for_testing()
model = self.transformers_class.from_pretrained(model_id).to(self.torch_device).eval()
outputs_base = model(**X)