From 3e2c996c6a095129a05814d494d3800cbcb72947 Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Thu, 24 Apr 2025 15:35:50 -0400 Subject: [PATCH 01/12] feat(lokr, loha): add 1x1 Conv2d and Conv1d support --- src/peft/tuners/loha/layer.py | 84 +++++++++++++++++++++++++++++-- src/peft/tuners/loha/model.py | 1 + src/peft/tuners/lokr/layer.py | 93 +++++++++++++++++++++++++++++++++-- src/peft/tuners/lokr/model.py | 1 + 4 files changed, 172 insertions(+), 7 deletions(-) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 6d294af669..275767971d 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -45,7 +45,7 @@ def _available_adapters(self) -> Set[str]: def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]): # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75 - if len(shape) == 4: + if len(shape) == 4: # Conv2d self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode @@ -53,7 +53,15 @@ def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode - else: + elif len(shape) == 3: # Conv1d + self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2])) + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode + self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + + self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2])) + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + else: # Linear self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) @@ -127,7 +135,13 @@ def update_layer( if isinstance(base_layer, nn.Linear): shape = tuple(base_layer.weight.shape) elif isinstance(base_layer, nn.Conv2d): - use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) + # Handle 1x1 convolutions differently + if base_layer.kernel_size == (1, 1): + # For 1x1 convolutions, use a more direct shape without using effective_conv2d + shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) + use_effective_conv2d = False + else: + use_effective_conv2d = use_effective_conv2d if use_effective_conv2d: shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) else: @@ -135,6 +149,21 @@ def update_layer( base_layer.out_channels, base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], ) + elif isinstance(base_layer, nn.Conv1d): + # Handle kernel_size=1 Conv1d differently + if base_layer.kernel_size[0] == 1: + # For kernel_size=1, use a more direct shape without using effective_conv2d + shape = (base_layer.out_channels, base_layer.in_channels, base_layer.kernel_size[0]) + use_effective_conv2d = False + else: + use_effective_conv2d = use_effective_conv2d + if use_effective_conv2d: + shape = 
(base_layer.out_channels, base_layer.in_channels, base_layer.kernel_size[0]) + else: + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0], + ) else: raise TypeError(f"LoHa is not implemented for base layers of type {type(base_layer).__name__}") @@ -173,6 +202,12 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: ) base_layer = self.get_base_layer() + + # Special optimization for 1x1 convolutions and kernel_size=1 Conv1d + is_1x1_conv2d = isinstance(base_layer, nn.Conv2d) and base_layer.kernel_size == (1, 1) + is_1_conv1d = isinstance(base_layer, nn.Conv1d) and base_layer.kernel_size[0] == 1 + + # Reshape to match base layer shape weight = weight.reshape(base_layer.weight.shape) # Perform rank dropout during training - drop rows of addition weights @@ -290,6 +325,49 @@ def __repr__(self) -> str: return "loha." + rep +class Conv1d(LoHaLayer): + """LoHa implemented in Conv1d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() + return F.conv1d( + input, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, + ) + + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." 
+ rep + + # Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9 diff --git a/src/peft/tuners/loha/model.py b/src/peft/tuners/loha/model.py index e1cbc50b34..9e92bcedca 100644 --- a/src/peft/tuners/loha/model.py +++ b/src/peft/tuners/loha/model.py @@ -85,6 +85,7 @@ class LoHaModel(LycorisTuner): prefix: str = "hada_" layers_mapping: Dict[Type[torch.nn.Module], Type[LoHaLayer]] = { torch.nn.Conv2d: Conv2d, + torch.nn.Conv1d: Conv1d, torch.nn.Linear: Linear, } diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index 1c8cf1bbd9..5507fc78e3 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -75,8 +75,8 @@ def create_adapter_parameters( self.lokr_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0][0], r)) self.lokr_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][0])) - if len(shape) == 4: - # Conv2d + # Handle both Conv2d and Conv1d + if len(shape) == 4: # Conv2d if use_w2: self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *shape[2:])) elif use_effective_conv2d: @@ -86,6 +86,16 @@ def create_adapter_parameters( else: self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2] * shape[3])) + elif len(shape) == 3: # Conv1d + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], shape[2])) + elif use_effective_conv2d: # Even for Conv1d, use the effective parameter for kernel dimension + self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2])) + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1])) # b, 1-mode + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) # d, 2-mode + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2])) else: # Linear if use_w2: @@ -201,7 +211,28 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 - use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) + # Handle 1x1 convolutions differently + if base_layer.kernel_size == (1, 1): + # For 1x1 convolutions, always disable use_effective_conv2d + use_effective_conv2d = False + else: + use_effective_conv2d = use_effective_conv2d + elif isinstance(base_layer, nn.Conv1d): + in_dim, out_dim = base_layer.in_channels, base_layer.out_channels + k_size = (base_layer.kernel_size[0],) # Convert to a tuple with single element + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), k) + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 + # Handle kernel_size=1 Conv1d differently + if base_layer.kernel_size[0] == 1: + # For kernel_size=1, always disable use_effective_conv2d + use_effective_conv2d = False + else: + use_effective_conv2d = use_effective_conv2d else: raise TypeError(f"LoKr is not implemented for base layers of type {type(base_layer).__name__}") @@ -237,7 +268,16 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: # Make weights with Kronecker product weight = make_kron(w1, w2, self.scaling[adapter_name]) - weight = 
weight.reshape(self.get_base_layer().weight.shape) + + # Get base layer for reshaping + base_layer = self.get_base_layer() + + # Special optimization for 1x1 convolutions and kernel_size=1 Conv1d + is_1x1_conv2d = isinstance(base_layer, nn.Conv2d) and base_layer.kernel_size == (1, 1) + is_1_conv1d = isinstance(base_layer, nn.Conv1d) and base_layer.kernel_size[0] == 1 + + # Regular reshape to match base layer shape + weight = weight.reshape(base_layer.weight.shape) # Perform rank dropout during training - drop rows of addition weights rank_dropout = self.rank_dropout[adapter_name] @@ -356,6 +396,51 @@ def __repr__(self) -> str: return "lokr." + rep +class Conv1d(LoKrLayer): + """LoKr implemented in Conv1d layer""" + + def __init__( + self, + base_layer: nn.Module, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() + return F.conv1d( + input, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, + ) + + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." 
+ rep + + # Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11 diff --git a/src/peft/tuners/lokr/model.py b/src/peft/tuners/lokr/model.py index 5d64f01178..39450e0d1a 100644 --- a/src/peft/tuners/lokr/model.py +++ b/src/peft/tuners/lokr/model.py @@ -86,6 +86,7 @@ class LoKrModel(LycorisTuner): prefix: str = "lokr_" layers_mapping: Dict[Type[torch.nn.Module], Type[LoKrLayer]] = { torch.nn.Conv2d: Conv2d, + torch.nn.Conv1d: Conv1d, torch.nn.Linear: Linear, } From a716722913f907005a3042974000a9e844278f2f Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Sat, 10 May 2025 20:19:16 -0400 Subject: [PATCH 02/12] Fix: Add Conv1d and Conv2d 1x1 support to LyCORIS adapters - Update lycoris_utils.py to support Conv1d layers - Add ModelConv2D1x1 and ModelConv1D test classes - Add test cases for Conv1d and Conv2d 1x1 with LoHa and LoKr - Update imports in loha/model.py and lokr/model.py --- src/peft/tuners/loha/model.py | 2 +- src/peft/tuners/lokr/model.py | 2 +- src/peft/tuners/lycoris_utils.py | 2 +- tests/test_custom_models.py | 50 ++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/peft/tuners/loha/model.py b/src/peft/tuners/loha/model.py index 9e92bcedca..3a97bdb531 100644 --- a/src/peft/tuners/loha/model.py +++ b/src/peft/tuners/loha/model.py @@ -21,7 +21,7 @@ from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner -from .layer import Conv2d, Linear, LoHaLayer +from .layer import Conv1d, Conv2d, Linear, LoHaLayer class LoHaModel(LycorisTuner): diff --git a/src/peft/tuners/lokr/model.py b/src/peft/tuners/lokr/model.py index 39450e0d1a..0881e08d51 100644 --- a/src/peft/tuners/lokr/model.py +++ b/src/peft/tuners/lokr/model.py @@ -21,7 +21,7 @@ from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner -from .layer import Conv2d, Linear, LoKrLayer +from .layer import Conv1d, Conv2d, Linear, LoKrLayer class LoKrModel(LycorisTuner): diff --git a/src/peft/tuners/lycoris_utils.py b/src/peft/tuners/lycoris_utils.py index 5cec45b596..ba2c00eff5 100644 --- a/src/peft/tuners/lycoris_utils.py +++ b/src/peft/tuners/lycoris_utils.py @@ -258,7 +258,7 @@ def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn else: target_base_layer = target - if isinstance(target_base_layer, torch.nn.Conv2d): + if isinstance(target_base_layer, (torch.nn.Conv2d, torch.nn.Conv1d)): new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) elif isinstance(target_base_layer, torch.nn.Linear): new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 1869298761..f7996ac8dd 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -230,6 +230,8 @@ ("Conv2d 2 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"]}), ("Conv2d 3 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), + ("Conv1D LoHa", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), + ("Conv2d 1x1 LoHa", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"]}), # LoKr ("Vanilla MLP 1 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0"}), ("Vanilla MLP 2 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"]}), @@ -252,6 +254,8 @@ ("Conv2d 2 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", 
"lin0"]}), ("Conv2d 3 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), + ("Conv1D LoKr", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"]}), + ("Conv2d 1x1 LoKr", "Conv2d1x1", LoKrConfig, {"target_modules": ["conv2d"]}), ( "Conv2d 5 LOKR", "Conv2d", @@ -852,6 +856,46 @@ def forward(self, X): return X +class ModelConv2D1x1(nn.Module): + def __init__(self): + super().__init__() + self.conv2d = nn.Conv2d(1, 10, kernel_size=(1, 1), padding=0) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lino = nn.Linear(10 * 5 * 5, 2) + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float + + def forward(self, X): + X = X.to(self.dtype) + X = X.reshape(-1, 1, 5, 5) + X = self.conv2d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lino(X) + X = self.sm(X) + return X + + +class ModelConv1D(nn.Module): + def __init__(self): + super().__init__() + self.conv1d = nn.Conv1d(in_channels=3, out_channels=10, kernel_size=1) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lino = nn.Linear(10 * 10, 2) + self.dtype = torch.float + + def forward(self, x): + x = x.to(self.dtype) + x = x.reshape(-1, 3, 10) # batch, channels, seq_len + x = self.conv1d(x) + x = self.relu(x) + x = self.flat(x) + x = self.lino(x) + return x + + class ModelConv3D(nn.Module): def __init__(self): super().__init__() @@ -912,6 +956,12 @@ def from_pretrained(cls, model_id, torch_dtype=None): if model_id == "Conv2d": return ModelConv2D().to(torch_dtype) + + if model_id == "Conv2d1x1": + return ModelConv2D1x1().to(torch_dtype) + + if model_id == "Conv1d": + return ModelConv1D().to(torch_dtype) if model_id == "Conv3d": return ModelConv3D().to(torch_dtype) From 67c0cc661c9659cd245f00e7fcb341608404de69 Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Fri, 16 May 2025 14:49:12 -0400 Subject: [PATCH 03/12] Resolve conflicts in tests/test_custom_models.py --- tests/test_custom_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index f7996ac8dd..1875ca0550 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -862,7 +862,7 @@ def __init__(self): self.conv2d = nn.Conv2d(1, 10, kernel_size=(1, 1), padding=0) self.relu = nn.ReLU() self.flat = nn.Flatten() - self.lino = nn.Linear(10 * 5 * 5, 2) + self.lin0 = nn.Linear(10 * 5 * 5, 2) self.sm = nn.LogSoftmax(dim=-1) self.dtype = torch.float @@ -872,7 +872,7 @@ def forward(self, X): X = self.conv2d(X) X = self.relu(X) X = self.flat(X) - X = self.lino(X) + X = self.lin0(X) X = self.sm(X) return X @@ -883,7 +883,7 @@ def __init__(self): self.conv1d = nn.Conv1d(in_channels=3, out_channels=10, kernel_size=1) self.relu = nn.ReLU() self.flat = nn.Flatten() - self.lino = nn.Linear(10 * 10, 2) + self.lin0 = nn.Linear(10 * 10, 2) self.dtype = torch.float def forward(self, x): @@ -892,7 +892,7 @@ def forward(self, x): x = self.conv1d(x) x = self.relu(x) x = self.flat(x) - x = self.lino(x) + x = self.lin0(x) return x From d2559fdaea152d6a36010d195e58c7dff117fb4f Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Fri, 16 May 2025 14:55:09 -0400 Subject: [PATCH 04/12] resolving conflicts --- tests/test_custom_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 1875ca0550..73f9aeaf30 100644 --- a/tests/test_custom_models.py +++ 
b/tests/test_custom_models.py @@ -863,12 +863,12 @@ def __init__(self): self.relu = nn.ReLU() self.flat = nn.Flatten() self.lin0 = nn.Linear(10 * 5 * 5, 2) - self.sm = nn.LogSoftmax(dim=-1) - self.dtype = torch.float + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float def forward(self, X): X = X.to(self.dtype) - X = X.reshape(-1, 1, 5, 5) + X = X.reshape(-1, 1, 5, 5) X = self.conv2d(X) X = self.relu(X) X = self.flat(X) From 655ea628d9b950c45b12becbb4bb10d2b282f8e9 Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Tue, 27 May 2025 19:32:26 -0400 Subject: [PATCH 05/12] feat(lokr, loha): Refine 1x1 Conv2d/Conv1d handling and test naming consistency Clarify existing optimal handling for 1x1 Conv2d and Conv1d layers; confirmed is already correctly disabled in files, addressing reviewer feedback on potential dead code. Standardize test case naming in for consistency: - Updated 'LoHa' to 'LOHA' and 'LoKr' to 'LOKR' (all caps). - Aligned Conv1D/Conv1d naming with PyTorch conventions for clarity. --- src/peft/tuners/loha/layer.py | 18 +++++++++--------- src/peft/tuners/lokr/layer.py | 16 +++++++++------- tests/test_custom_models.py | 8 ++++---- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 275767971d..874901e9fe 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -135,10 +135,12 @@ def update_layer( if isinstance(base_layer, nn.Linear): shape = tuple(base_layer.weight.shape) elif isinstance(base_layer, nn.Conv2d): - # Handle 1x1 convolutions differently + # For 1x1 convolutions, disable effective_conv2d to avoid unnecessary tensor reshaping overhead. + # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications), + # they can be more efficiently handled with the flattened weight representation, + # similar to how Linear layers work. This optimization reduces computational cost + # without affecting the mathematical equivalence of the operation. if base_layer.kernel_size == (1, 1): - # For 1x1 convolutions, use a more direct shape without using effective_conv2d - shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) use_effective_conv2d = False else: use_effective_conv2d = use_effective_conv2d @@ -150,10 +152,11 @@ def update_layer( base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], ) elif isinstance(base_layer, nn.Conv1d): - # Handle kernel_size=1 Conv1d differently + # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons + # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent + # to a Linear layer applied across the channel dimension. Using flattened representation + # avoids unnecessary reshaping and improves computational efficiency. 
if base_layer.kernel_size[0] == 1: - # For kernel_size=1, use a more direct shape without using effective_conv2d - shape = (base_layer.out_channels, base_layer.in_channels, base_layer.kernel_size[0]) use_effective_conv2d = False else: use_effective_conv2d = use_effective_conv2d @@ -203,9 +206,6 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: base_layer = self.get_base_layer() - # Special optimization for 1x1 convolutions and kernel_size=1 Conv1d - is_1x1_conv2d = isinstance(base_layer, nn.Conv2d) and base_layer.kernel_size == (1, 1) - is_1_conv1d = isinstance(base_layer, nn.Conv1d) and base_layer.kernel_size[0] == 1 # Reshape to match base layer shape weight = weight.reshape(base_layer.weight.shape) diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index 5507fc78e3..6858efa4bb 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -211,9 +211,12 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 - # Handle 1x1 convolutions differently + # For 1x1 convolutions, disable effective_conv2d to avoid unnecessary tensor reshaping overhead. + # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications), + # they can be more efficiently handled with the flattened weight representation, + # similar to how Linear layers work. This optimization reduces computational cost + # without affecting the mathematical equivalence of the operation. if base_layer.kernel_size == (1, 1): - # For 1x1 convolutions, always disable use_effective_conv2d use_effective_conv2d = False else: use_effective_conv2d = use_effective_conv2d @@ -227,9 +230,11 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 - # Handle kernel_size=1 Conv1d differently + # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons + # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent + # to a Linear layer applied across the channel dimension. Using flattened representation + # avoids unnecessary reshaping and improves computational efficiency. 
if base_layer.kernel_size[0] == 1: - # For kernel_size=1, always disable use_effective_conv2d use_effective_conv2d = False else: use_effective_conv2d = use_effective_conv2d @@ -272,9 +277,6 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: # Get base layer for reshaping base_layer = self.get_base_layer() - # Special optimization for 1x1 convolutions and kernel_size=1 Conv1d - is_1x1_conv2d = isinstance(base_layer, nn.Conv2d) and base_layer.kernel_size == (1, 1) - is_1_conv1d = isinstance(base_layer, nn.Conv1d) and base_layer.kernel_size[0] == 1 # Regular reshape to match base layer shape weight = weight.reshape(base_layer.weight.shape) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 73f9aeaf30..c6838cc5da 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -230,8 +230,8 @@ ("Conv2d 2 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"]}), ("Conv2d 3 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), - ("Conv1D LoHa", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), - ("Conv2d 1x1 LoHa", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"]}), + ("Conv1d LOHA", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), + ("Conv2d 1x1 LOHA", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"]}), # LoKr ("Vanilla MLP 1 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0"}), ("Vanilla MLP 2 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"]}), @@ -254,8 +254,8 @@ ("Conv2d 2 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"]}), ("Conv2d 3 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), - ("Conv1D LoKr", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"]}), - ("Conv2d 1x1 LoKr", "Conv2d1x1", LoKrConfig, {"target_modules": ["conv2d"]}), + ("Conv1d LOKR", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"]}), + ("Conv2d 1x1 LOKR", "Conv2d1x1", LoKrConfig, {"target_modules": ["conv2d"]}), ( "Conv2d 5 LOKR", "Conv2d", From cd93eb54148a339e1082e514f20ca679e31158c8 Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Wed, 28 May 2025 12:48:16 -0400 Subject: [PATCH 06/12] make style linting --- src/peft/tuners/loha/layer.py | 9 ++++----- src/peft/tuners/lokr/layer.py | 11 +++++------ tests/test_custom_models.py | 10 +++++----- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 390314343b..115493f064 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -136,7 +136,7 @@ def update_layer( shape = tuple(base_layer.weight.shape) elif isinstance(base_layer, nn.Conv2d): # For 1x1 convolutions, disable effective_conv2d to avoid unnecessary tensor reshaping overhead. - # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications), + # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications), # they can be more efficiently handled with the flattened weight representation, # similar to how Linear layers work. This optimization reduces computational cost # without affecting the mathematical equivalence of the operation. 
@@ -152,8 +152,8 @@ def update_layer( base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], ) elif isinstance(base_layer, nn.Conv1d): - # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons - # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent + # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons + # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent # to a Linear layer applied across the channel dimension. Using flattened representation # avoids unnecessary reshaping and improves computational efficiency. if base_layer.kernel_size[0] == 1: @@ -205,8 +205,7 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: ) base_layer = self.get_base_layer() - - + # Reshape to match base layer shape weight = weight.reshape(base_layer.weight.shape) diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index 8a2eeba55f..280a07eb7e 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -212,7 +212,7 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 # For 1x1 convolutions, disable effective_conv2d to avoid unnecessary tensor reshaping overhead. - # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications), + # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications), # they can be more efficiently handled with the flattened weight representation, # similar to how Linear layers work. This optimization reduces computational cost # without affecting the mathematical equivalence of the operation. @@ -230,8 +230,8 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 - # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons - # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent + # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons + # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent # to a Linear layer applied across the channel dimension. Using flattened representation # avoids unnecessary reshaping and improves computational efficiency. 
if base_layer.kernel_size[0] == 1: @@ -273,11 +273,10 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: # Make weights with Kronecker product weight = make_kron(w1, w2, self.scaling[adapter_name]) - + # Get base layer for reshaping base_layer = self.get_base_layer() - - + # Regular reshape to match base layer shape weight = weight.reshape(base_layer.weight.shape) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 17d3727e56..1720407b9a 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -974,12 +974,12 @@ def __init__(self): self.relu = nn.ReLU() self.flat = nn.Flatten() self.lin0 = nn.Linear(10 * 5 * 5, 2) - self.sm = nn.LogSoftmax(dim=-1) - self.dtype = torch.float + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float def forward(self, X): X = X.to(self.dtype) - X = X.reshape(-1, 1, 5, 5) + X = X.reshape(-1, 1, 5, 5) X = self.conv2d(X) X = self.relu(X) X = self.flat(X) @@ -1094,10 +1094,10 @@ def from_pretrained(cls, model_id, torch_dtype=None): if model_id == "Conv2d": return ModelConv2D().to(torch_dtype) - + if model_id == "Conv2d1x1": return ModelConv2D1x1().to(torch_dtype) - + if model_id == "Conv1d": return ModelConv1D().to(torch_dtype) From 6f9641136d67abdd77180a9677b0e5655d5e8048 Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Thu, 29 May 2025 00:56:38 -0400 Subject: [PATCH 07/12] Fix duplicate ModelConv1D class names in test_custom_models.py Renamed the second ModelConv1D class (kernel_size=1) to ModelConv1DKernel1 and updated MockTransformerWrapper mapping to avoid class name conflicts. for the 1x1 support --- .gitignore | 4 ++++ tests/test_custom_models.py | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 4e3e2ca5fc..667fa5e431 100644 --- a/.gitignore +++ b/.gitignore @@ -140,6 +140,10 @@ dmypy.json # More test things wandb +<<<<<<< Updated upstream # method_comparison logs method_comparison/MetaMathQA/cancelled_results/ method_comparison/MetaMathQA/temporary_results/ +======= +**/.claude/settings.local.json +>>>>>>> Stashed changes diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 1720407b9a..1938439ef7 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1009,7 +1009,7 @@ def forward(self, X): return X -class ModelConv1D(nn.Module): +class ModelConv1DKernel1(nn.Module): def __init__(self): super().__init__() self.conv1d = nn.Conv1d(in_channels=3, out_channels=10, kernel_size=1) @@ -1098,8 +1098,8 @@ def from_pretrained(cls, model_id, torch_dtype=None): if model_id == "Conv2d1x1": return ModelConv2D1x1().to(torch_dtype) - if model_id == "Conv1d": - return ModelConv1D().to(torch_dtype) + if model_id == "Conv1dKernel1": + return ModelConv1DKernel1().to(torch_dtype) if model_id == "Conv2dGroups": return ModelConv2DGroups().to(torch_dtype) @@ -4808,3 +4808,5 @@ def __init__(self, base_layer, adapter_name, **kwargs): # we should still get a warning message msg = "Unsupported layer type '' encountered, proceed at your own risk." 
assert str(recwarn.list[-1].message) == msg + + From ae3a4eb050e3e1684e479d2ee0027e17151342d5 Mon Sep 17 00:00:00 2001 From: Kabir Grewal Date: Tue, 3 Jun 2025 16:43:28 -0400 Subject: [PATCH 08/12] make style and fixing conflicts --- .gitignore | 4 +--- tests/test_custom_models.py | 8 +++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 667fa5e431..7e13636eb1 100644 --- a/.gitignore +++ b/.gitignore @@ -140,10 +140,8 @@ dmypy.json # More test things wandb -<<<<<<< Updated upstream # method_comparison logs method_comparison/MetaMathQA/cancelled_results/ method_comparison/MetaMathQA/temporary_results/ -======= + **/.claude/settings.local.json ->>>>>>> Stashed changes diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 1938439ef7..b9a7f17321 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -2477,9 +2477,9 @@ def test_multirank_2(self): if isinstance(module, BaseTunerLayer): rank_expected = rank_pattern.get(key, r) rank_current = module.lora_A[adapter].weight.shape[0] - assert rank_current == rank_expected, ( - f"Rank {rank_current} is not equal to expected {rank_expected}" - ) + assert ( + rank_current == rank_expected + ), f"Rank {rank_current} is not equal to expected {rank_expected}" class TestRepr(unittest.TestCase): @@ -4808,5 +4808,3 @@ def __init__(self, base_layer, adapter_name, **kwargs): # we should still get a warning message msg = "Unsupported layer type '' encountered, proceed at your own risk." assert str(recwarn.list[-1].message) == msg - - From 3fca72a4425e6ed0306a802f4de40e47c8a14c08 Mon Sep 17 00:00:00 2001 From: nemo Date: Fri, 6 Jun 2025 12:10:53 +0200 Subject: [PATCH 09/12] Make style --- tests/test_custom_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index b9a7f17321..596381087f 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -2477,9 +2477,9 @@ def test_multirank_2(self): if isinstance(module, BaseTunerLayer): rank_expected = rank_pattern.get(key, r) rank_current = module.lora_A[adapter].weight.shape[0] - assert ( - rank_current == rank_expected - ), f"Rank {rank_current} is not equal to expected {rank_expected}" + assert rank_current == rank_expected, ( + f"Rank {rank_current} is not equal to expected {rank_expected}" + ) class TestRepr(unittest.TestCase): From 6c44b89de51187640e71230060cf4218dfbcfdae Mon Sep 17 00:00:00 2001 From: nemo Date: Tue, 26 Aug 2025 11:57:35 +0200 Subject: [PATCH 10/12] Fix tests --- src/peft/tuners/loha/layer.py | 1 + src/peft/tuners/lokr/layer.py | 1 + tests/test_custom_models.py | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 115493f064..1985f3a199 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -353,6 +353,7 @@ def _get_delta_activations( self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any ) -> torch.Tensor: delta_weight = self.get_delta_weight(adapter_name) + input = self._cast_input_dtype(input, delta_weight.dtype) # don't add bias here, because the bias is already included in the output of the base_layer base_layer = self.get_base_layer() return F.conv1d( diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index 280a07eb7e..f992eb504c 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -428,6 +428,7 @@ def 
_get_delta_activations( self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any ) -> torch.Tensor: delta_weight = self.get_delta_weight(adapter_name) + input = self._cast_input_dtype(input, delta_weight.dtype) # don't add bias here, because the bias is already included in the output of the base_layer base_layer = self.get_base_layer() return F.conv1d( diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index c812f8d9ae..505840e1b8 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1244,13 +1244,13 @@ def __init__(self): self.conv2d = nn.Conv2d(1, 10, kernel_size=(1, 1), padding=0) self.relu = nn.ReLU() self.flat = nn.Flatten() - self.lin0 = nn.Linear(10 * 5 * 5, 2) + self.lin0 = nn.Linear(10 * 3 * 3, 2) self.sm = nn.LogSoftmax(dim=-1) self.dtype = torch.float def forward(self, X): X = X.to(self.dtype) - X = X.reshape(-1, 1, 5, 5) + X = X.reshape(-1, 1, 3, 3) X = self.conv2d(X) X = self.relu(X) X = self.flat(X) From c7ebc5837456423fc5119b498732ea5f767fd553 Mon Sep 17 00:00:00 2001 From: nemo Date: Wed, 27 Aug 2025 10:31:26 +0200 Subject: [PATCH 11/12] Address review comments --- .gitignore | 1 + src/peft/tuners/loha/config.py | 8 +++-- src/peft/tuners/loha/layer.py | 14 +++------ src/peft/tuners/lokr/config.py | 8 +++-- src/peft/tuners/lokr/layer.py | 14 ++++----- tests/test_custom_models.py | 54 +++++++++++++++++++++++++++++++++- 6 files changed, 75 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 7e13636eb1..acfcad8d8a 100644 --- a/.gitignore +++ b/.gitignore @@ -144,4 +144,5 @@ wandb method_comparison/MetaMathQA/cancelled_results/ method_comparison/MetaMathQA/temporary_results/ +# Coding agents **/.claude/settings.local.json diff --git a/src/peft/tuners/loha/config.py b/src/peft/tuners/loha/config.py index 00eacb1301..79c1f63013 100644 --- a/src/peft/tuners/loha/config.py +++ b/src/peft/tuners/loha/config.py @@ -35,7 +35,8 @@ class LoHaConfig(LycorisConfig): module_dropout (`float`): The dropout probability for disabling LoHa modules during training. use_effective_conv2d (`bool`): - Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 ("Proposition 3" from FedPara + paper). target_modules (`Optional[Union[List[str], str]]`): The names of the modules to apply the adapter to. If this is specified, only the modules with the specified names will be replaced. When passing a string, a regex match will be performed. 
When passing a list of @@ -79,7 +80,10 @@ class LoHaConfig(LycorisConfig): use_effective_conv2d: bool = field( default=False, metadata={ - "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + "help": ( + "Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 " + '("Proposition 3" from FedPara paper)' + ) }, ) target_modules: Optional[Union[list[str], str]] = field( diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 1985f3a199..19582bec80 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -54,11 +54,11 @@ def create_adapter_parameters(self, adapter_name: str, r: int, shape: tuple[int, self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode elif len(shape) == 3: # Conv1d - self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2])) + self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1)) self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode - self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2])) + self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1)) self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode else: # Linear @@ -140,10 +140,7 @@ def update_layer( # they can be more efficiently handled with the flattened weight representation, # similar to how Linear layers work. This optimization reduces computational cost # without affecting the mathematical equivalence of the operation. - if base_layer.kernel_size == (1, 1): - use_effective_conv2d = False - else: - use_effective_conv2d = use_effective_conv2d + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) if use_effective_conv2d: shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) else: @@ -156,10 +153,7 @@ def update_layer( # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent # to a Linear layer applied across the channel dimension. Using flattened representation # avoids unnecessary reshaping and improves computational efficiency. - if base_layer.kernel_size[0] == 1: - use_effective_conv2d = False - else: - use_effective_conv2d = use_effective_conv2d + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size[0] != 1 if use_effective_conv2d: shape = (base_layer.out_channels, base_layer.in_channels, base_layer.kernel_size[0]) else: diff --git a/src/peft/tuners/lokr/config.py b/src/peft/tuners/lokr/config.py index ea8a0e837f..6d25dc5c12 100644 --- a/src/peft/tuners/lokr/config.py +++ b/src/peft/tuners/lokr/config.py @@ -35,7 +35,8 @@ class LoKrConfig(LycorisConfig): module_dropout (`float`): The dropout probability for disabling LoKr modules during training. use_effective_conv2d (`bool`): - Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 ("Proposition 3" from FedPara + paper). decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix. 
decompose_factor (`int`): @@ -85,7 +86,10 @@ class LoKrConfig(LycorisConfig): use_effective_conv2d: bool = field( default=False, metadata={ - "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + "help": ( + "Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 " + '("Proposition 3" from FedPara paper)' + ) }, ) decompose_both: bool = field( diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index f992eb504c..c898065cee 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -90,7 +90,9 @@ def create_adapter_parameters( if use_w2: self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], shape[2])) elif use_effective_conv2d: # Even for Conv1d, use the effective parameter for kernel dimension - self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2])) + # We pass (r, r, kernel_size, 1) in order to be compatible with the 2d assumptions made + # in make_weight_cp (only relevant for the effective conv2d case). + self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1)) self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1])) # b, 1-mode self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) # d, 2-mode else: @@ -216,10 +218,7 @@ def update_layer( # they can be more efficiently handled with the flattened weight representation, # similar to how Linear layers work. This optimization reduces computational cost # without affecting the mathematical equivalence of the operation. - if base_layer.kernel_size == (1, 1): - use_effective_conv2d = False - else: - use_effective_conv2d = use_effective_conv2d + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) elif isinstance(base_layer, nn.Conv1d): in_dim, out_dim = base_layer.in_channels, base_layer.out_channels k_size = (base_layer.kernel_size[0],) # Convert to a tuple with single element @@ -234,10 +233,7 @@ def update_layer( # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent # to a Linear layer applied across the channel dimension. Using flattened representation # avoids unnecessary reshaping and improves computational efficiency. 
- if base_layer.kernel_size[0] == 1: - use_effective_conv2d = False - else: - use_effective_conv2d = use_effective_conv2d + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size[0] != 1 else: raise TypeError(f"LoKr is not implemented for base layers of type {type(base_layer).__name__}") diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 505840e1b8..393ed2c23c 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -268,6 +268,20 @@ ("Conv2d 3 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), ("Conv1d LOHA", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LOHA 1", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LOHA 2", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"], "r": 2}), + ( + "Conv1d LOHA 3", + "Conv1dBigger", + LoHaConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": True}, + ), + ( + "Conv1d LOHA 4", + "Conv1dBigger", + LoHaConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": False}, + ), ("Conv2d 1x1 LOHA", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"]}), # LoKr ("Vanilla MLP 1 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0"}), @@ -287,11 +301,24 @@ ), ("Vanilla MLP 7 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0", "rank_dropout": 0.5}), ("Vanilla MLP 8 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0", "decompose_both": True, "r": 1, "alpha": 1}), + ("Conv1d LOKR 1", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LOKR 2", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"], "r": 2}), + ( + "Conv1d LOKR 3", + "Conv1dBigger", + LoKrConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": True}, + ), + ( + "Conv1d LOKR 4", + "Conv1dBigger", + LoKrConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": False}, + ), ("Conv2d 1 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"]}), ("Conv2d 2 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"]}), ("Conv2d 3 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), - ("Conv1d LOKR", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"]}), ("Conv2d 1x1 LOKR", "Conv2d1x1", LoKrConfig, {"target_modules": ["conv2d"]}), ( "Conv2d 5 LOKR", @@ -1193,6 +1220,28 @@ def forward(self, X): return X +class ModelConv1DBigger(nn.Module): + def __init__(self): + super().__init__() + self.conv1d = nn.Conv1d(64, 16, 2) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lin0 = nn.Linear(144, 2) + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float + + def forward(self, X): + X = X.to(self.dtype) + X = X.reshape(-1, 1, 10) + X = torch.concat([X] * 64, dim=1) + X = self.conv1d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lin0(X) + X = self.sm(X) + return X + + class ModelConv2D(nn.Module): def __init__(self, bias=True): super().__init__() @@ -1426,6 +1475,9 @@ def from_pretrained(cls, model_id, torch_dtype=None): if model_id == "Conv1d": return ModelConv1D().to(torch_dtype) + if model_id == "Conv1dBigger": + return ModelConv1DBigger().to(torch_dtype) + if model_id == "Conv2d": return ModelConv2D().to(torch_dtype) From 1936041a9cd56ca68f113f092e07f40c680ec9c4 Mon Sep 17 00:00:00 2001 From: nemo Date: Wed, 27 Aug 
2025 10:51:38 +0200
Subject: [PATCH 12/12] Remove .claude settings entry from .gitignore in favor of global user config

---
 .gitignore | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index acfcad8d8a..4e3e2ca5fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,6 +143,3 @@ wandb
 # method_comparison logs
 method_comparison/MetaMathQA/cancelled_results/
 method_comparison/MetaMathQA/temporary_results/
-
-# Coding agents
-**/.claude/settings.local.json
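
Usage note: a minimal sketch (not taken from the patches above) of how the new Conv1d support could be exercised, assuming the public peft entry points LoHaConfig and get_peft_model; the toy module, rank, and tensor shapes are illustrative assumptions, and LoKrConfig can be swapped in the same way.

# Illustrative only: apply LoHa to a model containing an nn.Conv1d layer,
# which this patch series makes targetable. TinyConv1dNet is a hypothetical
# toy network, not one of the test models from the patches.
import torch
import torch.nn as nn

from peft import LoHaConfig, get_peft_model


class TinyConv1dNet(nn.Module):
    def __init__(self):
        super().__init__()
        # kernel_size=1 exercises the pointwise path; per this series,
        # use_effective_conv2d would be disabled for this case anyway.
        self.conv1d = nn.Conv1d(in_channels=3, out_channels=10, kernel_size=1)
        self.lin0 = nn.Linear(10 * 10, 2)

    def forward(self, x):
        x = self.conv1d(x)   # (batch, 10, seq_len)
        x = x.flatten(1)     # (batch, 10 * seq_len)
        return self.lin0(x)


base_model = TinyConv1dNet()

# Target the Conv1d layer by module name, as the added tests do.
config = LoHaConfig(target_modules=["conv1d"], r=2)
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()

x = torch.randn(4, 3, 10)  # (batch, channels, seq_len)
out = peft_model(x)
print(out.shape)  # torch.Size([4, 2])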