@@ -113,6 +113,7 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, *
         self._disable_adapters = False
         self.merged_adapters = []
         self.use_dora: dict[str, bool] = {}  # not actively used anymore after #2443, keep it for BC
+        self.use_rslora: dict[str, bool] = {}
         self.lora_bias: dict[str, bool] = {}
         self.lora_magnitude_vector = torch.nn.ModuleDict()  # for DoRA
         self._caches: dict[str, Any] = {}
@@ -255,6 +256,8 @@ def update_layer(
         else:
             self.scaling[adapter_name] = lora_alpha / r

+        self.use_rslora[adapter_name] = use_rslora
+
         self.use_dora[adapter_name] = use_dora

         # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed
@@ -528,7 +531,10 @@ def set_scale(self, adapter: str, scale: float | int) -> None:
         if adapter not in self.scaling:
             # Ignore the case where the adapter is not in the layer
             return
-        self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter]
+        if self.use_rslora.get(adapter, False):
+            self.scaling[adapter] = scale * self.lora_alpha[adapter] / math.sqrt(self.r[adapter])
+        else:
+            self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter]

     def scale_layer(self, scale: float | int) -> None:
         """Multiply the current scale of all active adapters by the provided factor"""
@@ -553,9 +559,12 @@ def unscale_layer(self, scale: Optional[float | int] = None) -> None:
                 continue

             if scale is None:
-                self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter]
+                if self.use_rslora.get(active_adapter, False):
+                    self.scaling[active_adapter] = self.lora_alpha[active_adapter] / math.sqrt(self.r[active_adapter])
+                else:
+                    self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter]
             else:
-                self.scaling[active_adapter] /= scale
+                self.scaling[active_adapter] = self.scaling[active_adapter] / scale

     def _check_forward_args(self, x, *args, **kwargs):
         """Check if the arguments are compatible with the configs and state of the model"""
@@ -960,6 +969,8 @@ def update_layer(
         else:
             self.scaling[adapter_name] = lora_alpha / r

+        self.use_rslora[adapter_name] = use_rslora
+
         self.use_dora[adapter_name] = use_dora

         if init_lora_weights == "loftq":
@@ -1260,6 +1271,8 @@ def update_layer(
         else:
             self.scaling[adapter_name] = lora_alpha / r

+        self.use_rslora[adapter_name] = use_rslora
+
         self.use_dora[adapter_name] = use_dora

         if init_lora_weights == "loftq":
@@ -2033,6 +2046,8 @@ def update_layer(
         else:
             self.scaling[adapter_name] = lora_alpha / r

+        self.use_rslora[adapter_name] = use_rslora
+
         self.use_dora[adapter_name] = use_dora

         # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed
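
A minimal sketch of the behavior this diff targets (not part of the commit): rank-stabilized LoRA scales adapters by lora_alpha / sqrt(r) instead of lora_alpha / r, so set_scale and unscale_layer(scale=None) above now branch on the recorded use_rslora flag rather than silently reverting to the plain-LoRA formula. The ScalingDemo class below is an invented stand-in that mirrors only the relevant dictionaries (r, lora_alpha, use_rslora, scaling); it illustrates the arithmetic and is not the real peft LoraLayer API.

import math

# Hypothetical stand-in for the relevant LoraLayer state (illustration only).
class ScalingDemo:
    def __init__(self, r: int, lora_alpha: int, use_rslora: bool):
        self.r = {"default": r}
        self.lora_alpha = {"default": lora_alpha}
        self.use_rslora = {"default": use_rslora}
        # Initial scaling: alpha / sqrt(r) for rsLoRA, alpha / r otherwise.
        self.scaling = {
            "default": lora_alpha / math.sqrt(r) if use_rslora else lora_alpha / r
        }

    def set_scale(self, adapter: str, scale: float) -> None:
        # Same branching as the patched set_scale in the diff above.
        if self.use_rslora.get(adapter, False):
            self.scaling[adapter] = scale * self.lora_alpha[adapter] / math.sqrt(self.r[adapter])
        else:
            self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter]


demo = ScalingDemo(r=16, lora_alpha=32, use_rslora=True)
print(demo.scaling["default"])  # 32 / sqrt(16) = 8.0
demo.set_scale("default", 0.5)
print(demo.scaling["default"])  # 0.5 * 32 / sqrt(16) = 4.0; the unpatched formula would give 0.5 * 32 / 16 = 1.0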