From 228199adba675c46211c99002893be4db82861db Mon Sep 17 00:00:00 2001 From: macmacmacmac Date: Tue, 7 Oct 2025 14:42:12 -0400 Subject: [PATCH 1/3] Added new initialization option for PromptEmbedding by sampling tokens from the model's vocab --- src/peft/tuners/prompt_tuning/config.py | 7 ++++++- src/peft/tuners/prompt_tuning/model.py | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/prompt_tuning/config.py b/src/peft/tuners/prompt_tuning/config.py index ee8ceb8d89..a5cee0279d 100644 --- a/src/peft/tuners/prompt_tuning/config.py +++ b/src/peft/tuners/prompt_tuning/config.py @@ -22,6 +22,7 @@ class PromptTuningInit(str, enum.Enum): TEXT = "TEXT" + RANDOM_DISCRETE = "RANDOM_DISCRETE" RANDOM = "RANDOM" @@ -31,7 +32,11 @@ class PromptTuningConfig(PromptLearningConfig): This is the configuration class to store the configuration of a [`PromptEmbedding`]. Args: - prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding. + prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): + The initialization of the prompt embedding. + `TEXT` will initialize with your text. + `RANDOM_DISCRETE` will initialize with randomly sampled discrete, hard tokens. + `RANDOM` will initialize with randomly sampled continuous, soft tokens. prompt_tuning_init_text (`str`, *optional*): The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. tokenizer_name_or_path (`str`, *optional*): diff --git a/src/peft/tuners/prompt_tuning/model.py b/src/peft/tuners/prompt_tuning/model.py index ce9b6bc409..3fed8027f1 100644 --- a/src/peft/tuners/prompt_tuning/model.py +++ b/src/peft/tuners/prompt_tuning/model.py @@ -64,7 +64,20 @@ def __init__(self, config, word_embeddings): total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules self.embedding = torch.nn.Embedding(total_virtual_tokens, config.token_dim) - if config.prompt_tuning_init == PromptTuningInit.TEXT and not config.inference_mode: + if config.prompt_tuning_init == PromptTuningInit.RANDOM_DISCRETE and not config.inference_mode: + import numpy as np + + # Randomly sample tokens from the tokenizer's vocab + vocab_size = word_embeddings.num_embeddings + init_token_ids = np.random.randint(0,vocab_size,total_virtual_tokens) + init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device) + with gather_params_ctx(word_embeddings.parameters()): + word_embedding_weights = word_embeddings(init_token_ids).detach().clone() + word_embedding_weights = word_embedding_weights.to(torch.float32) + self.embedding.weight = torch.nn.Parameter(word_embedding_weights) + + elif config.prompt_tuning_init == PromptTuningInit.TEXT and not config.inference_mode: + import numpy as np from transformers import AutoTokenizer tokenizer_kwargs = config.tokenizer_kwargs or {} From 1a48b39a623d6d5aacbfb5f62e98425defa69db4 Mon Sep 17 00:00:00 2001 From: macmacmacmac Date: Tue, 7 Oct 2025 14:47:46 -0400 Subject: [PATCH 2/3] Added new initialization option for PromptEmbedding --- src/peft/tuners/prompt_tuning/config.py | 9 ++++----- src/peft/tuners/prompt_tuning/model.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/peft/tuners/prompt_tuning/config.py b/src/peft/tuners/prompt_tuning/config.py index a5cee0279d..d968a7d77f 100644 --- a/src/peft/tuners/prompt_tuning/config.py +++ b/src/peft/tuners/prompt_tuning/config.py @@ -32,11 +32,10 @@ class PromptTuningConfig(PromptLearningConfig): This is the configuration class 
to store the configuration of a [`PromptEmbedding`]. Args: - prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): - The initialization of the prompt embedding. - `TEXT` will initialize with your text. - `RANDOM_DISCRETE` will initialize with randomly sampled discrete, hard tokens. - `RANDOM` will initialize with randomly sampled continuous, soft tokens. + prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): + The initialization of the prompt embedding. `TEXT` will initialize with your text. `RANDOM_DISCRETE` will + initialize with randomly sampled discrete, hard tokens. `RANDOM` will initialize with randomly sampled + continuous, soft tokens (warning: sampled soft tokens may fall outside of embedding manifold) prompt_tuning_init_text (`str`, *optional*): The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. tokenizer_name_or_path (`str`, *optional*): diff --git a/src/peft/tuners/prompt_tuning/model.py b/src/peft/tuners/prompt_tuning/model.py index 3fed8027f1..10b62f7b5a 100644 --- a/src/peft/tuners/prompt_tuning/model.py +++ b/src/peft/tuners/prompt_tuning/model.py @@ -69,7 +69,7 @@ def __init__(self, config, word_embeddings): # Randomly sample tokens from the tokenizer's vocab vocab_size = word_embeddings.num_embeddings - init_token_ids = np.random.randint(0,vocab_size,total_virtual_tokens) + init_token_ids = np.random.randint(0, vocab_size, total_virtual_tokens) init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device) with gather_params_ctx(word_embeddings.parameters()): word_embedding_weights = word_embeddings(init_token_ids).detach().clone() From 68c10561079fae22f142514fb805e9f557830334 Mon Sep 17 00:00:00 2001 From: macmacmacmac Date: Wed, 8 Oct 2025 20:30:14 -0400 Subject: [PATCH 3/3] Added unit test for new prompttuning initialization option, renamed it too --- src/peft/tuners/prompt_tuning/config.py | 8 ++++---- src/peft/tuners/prompt_tuning/model.py | 10 ++++------ tests/test_decoder_models.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/peft/tuners/prompt_tuning/config.py b/src/peft/tuners/prompt_tuning/config.py index d968a7d77f..b41669efe8 100644 --- a/src/peft/tuners/prompt_tuning/config.py +++ b/src/peft/tuners/prompt_tuning/config.py @@ -22,7 +22,7 @@ class PromptTuningInit(str, enum.Enum): TEXT = "TEXT" - RANDOM_DISCRETE = "RANDOM_DISCRETE" + SAMPLE_VOCAB = "SAMPLE_VOCAB" RANDOM = "RANDOM" @@ -33,9 +33,9 @@ class PromptTuningConfig(PromptLearningConfig): Args: prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): - The initialization of the prompt embedding. `TEXT` will initialize with your text. `RANDOM_DISCRETE` will - initialize with randomly sampled discrete, hard tokens. `RANDOM` will initialize with randomly sampled - continuous, soft tokens (warning: sampled soft tokens may fall outside of embedding manifold) + The initialization of the prompt embedding. `TEXT` will initialize with your text. `SAMPLE_VOCAB` will + initialize with randomly sampled tokens from the model's vocabulary. `RANDOM` will initialize with randomly + sampled continuous, soft tokens (warning: sampled soft tokens may fall outside of embedding manifold) prompt_tuning_init_text (`str`, *optional*): The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. 
tokenizer_name_or_path (`str`, *optional*): diff --git a/src/peft/tuners/prompt_tuning/model.py b/src/peft/tuners/prompt_tuning/model.py index 10b62f7b5a..9852ea28b4 100644 --- a/src/peft/tuners/prompt_tuning/model.py +++ b/src/peft/tuners/prompt_tuning/model.py @@ -64,20 +64,18 @@ def __init__(self, config, word_embeddings): total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules self.embedding = torch.nn.Embedding(total_virtual_tokens, config.token_dim) - if config.prompt_tuning_init == PromptTuningInit.RANDOM_DISCRETE and not config.inference_mode: - import numpy as np - + if config.prompt_tuning_init == PromptTuningInit.SAMPLE_VOCAB and not config.inference_mode: # Randomly sample tokens from the tokenizer's vocab vocab_size = word_embeddings.num_embeddings - init_token_ids = np.random.randint(0, vocab_size, total_virtual_tokens) - init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device) + init_token_ids = torch.randint(0, vocab_size, (total_virtual_tokens,), dtype=torch.long).to( + word_embeddings.weight.device + ) with gather_params_ctx(word_embeddings.parameters()): word_embedding_weights = word_embeddings(init_token_ids).detach().clone() word_embedding_weights = word_embedding_weights.to(torch.float32) self.embedding.weight = torch.nn.Parameter(word_embedding_weights) elif config.prompt_tuning_init == PromptTuningInit.TEXT and not config.inference_mode: - import numpy as np from transformers import AutoTokenizer tokenizer_kwargs = config.tokenizer_kwargs or {} diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index 5c01e29052..12b4a62c3b 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -380,6 +380,18 @@ def mock_autotokenizer_from_pretrained(*args, **kwargs): expected_call = call(model_id, trust_remote_code=True, foo="bar") assert mock.call_args == expected_call + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prompt_tuning_sample_vocab_prepare_for_training(self, model_id, config_cls, config_kwargs): + if config_cls != PromptTuningConfig: + pytest.skip(f"This test does not apply to {config_cls}") + + config_kwargs = config_kwargs.copy() + config_kwargs["prompt_tuning_init"] = PromptTuningInit.SAMPLE_VOCAB + config_kwargs["tokenizer_name_or_path"] = model_id + + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + def test_prompt_tuning_config_invalid_args(self): # Raise an error when tokenizer_kwargs is used with prompt_tuning_init!='TEXT', because this argument has no # function in that case
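For reference, a minimal usage sketch of the new `SAMPLE_VOCAB` option introduced by this series (assuming the patches above are applied on top of current PEFT; the base model name and `num_virtual_tokens` value below are arbitrary placeholders, not part of the patch):

```python
# Sketch: prompt tuning initialized from randomly sampled vocabulary tokens.
# Assumes this patch series is applied; "gpt2" and num_virtual_tokens=20 are placeholders.
from transformers import AutoModelForCausalLM

from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("gpt2")

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.SAMPLE_VOCAB,  # option added by this PR
    num_virtual_tokens=20,
    # Unlike TEXT init, no init text is required here: token ids are drawn uniformly
    # from the model's input embedding rows and their embeddings are copied
    # (cast to float32) into the soft prompt, per the model.py change above.
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()
```

This mirrors what the new test added to tests/test_decoder_models.py exercises through `_test_prepare_for_training`.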