
Update quantization scripts & relax modelopt requirement specifier #12709

Merged
merged 9 commits on Mar 24, 2025
54 changes: 54 additions & 0 deletions nemo/collections/llm/modelopt/quantization/quant_cfg_choices.py
@@ -0,0 +1,54 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict

from nemo.utils.import_utils import safe_import

mtq, HAVE_MODELOPT = safe_import("modelopt.torch.quantization")


def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
"""
Retrieve a dictionary of modelopt quantization configuration choices.

This function checks for the availability of specific quantization configurations defined in
the modelopt.torch.quantization (mtq) module and returns a dictionary mapping short names to
their corresponding configurations. The function is intended to work for different modelopt
library versions that come with variable configuration choices.

Returns:
dict: A dictionary where keys are short names (e.g., "fp8") and values are the
corresponding modelopt quantization configuration objects.
"""
if not HAVE_MODELOPT:
return {}

QUANT_CFG_NAMES = [
("int8", "INT8_DEFAULT_CFG"),
("int8_sq", "INT8_SMOOTHQUANT_CFG"),
("fp8", "FP8_DEFAULT_CFG"),
("int4_awq", "INT4_AWQ_CFG"),
("w4a8_awq", "W4A8_AWQ_BETA_CFG"),
("int4", "INT4_BLOCKWISE_WEIGHT_ONLY_CFG"),
("nvfp4", "NVFP4_DEFAULT_CFG"),
]

QUANT_CFG_CHOICES = {}

for short_name, full_name in QUANT_CFG_NAMES:
if config := getattr(mtq, full_name, None):
QUANT_CFG_CHOICES[short_name] = config

return QUANT_CFG_CHOICES
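
For reference, a minimal usage sketch of the new helper (illustrative only; the set of keys depends on which configs the installed modelopt build defines):

from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices

choices = get_quant_cfg_choices()   # {} when nvidia-modelopt is not installed
print(sorted(choices))              # e.g. ['fp8', 'int4', 'int4_awq', ...] on a recent modelopt release
fp8_cfg = choices.get("fp8")        # None if this modelopt version does not define FP8_DEFAULT_CFG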
35 changes: 17 additions & 18 deletions nemo/collections/llm/modelopt/quantization/quantizer.py
@@ -21,10 +21,12 @@
import torch
from accelerate.hooks import remove_hook_from_module
from datasets import load_dataset
from megatron.core.inference.common_inference_params import CommonInferenceParams
from tqdm import tqdm

from nemo.collections import llm
from nemo.collections.llm.inference import MCoreTokenizerWrappper, generate
from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices
from nemo.collections.llm.utils import barrier, torch_dtype_from_precision
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir
@@ -37,21 +39,11 @@
from nemo.lightning import Trainer
from nemo.lightning.megatron_parallel import MegatronParallel

_, HAVE_MODELOPT = safe_import("modelopt")
if HAVE_MODELOPT:
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint, export_tensorrt_llm_checkpoint

QUANT_CFG_CHOICES = {
"int8": mtq.INT8_DEFAULT_CFG,
"int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
"fp8": mtq.FP8_DEFAULT_CFG,
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
}
mtq, HAVE_MODELOPT_MTQ = safe_import("modelopt.torch.quantization")
mte, HAVE_MODELOPT_MTE = safe_import("modelopt.torch.export")
HAVE_MODELOPT = HAVE_MODELOPT_MTQ and HAVE_MODELOPT_MTE

QUANT_CFG_CHOICES = get_quant_cfg_choices()
SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers
SUPPORTED_EXPORT_FMT = ["trtllm", "nemo", "hf"]
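
A hedged sketch of the guarded-import pattern introduced here: safe_import returns a (module, available) pair, so modelopt-specific entry points can be gated on the flag instead of failing at import time. The export_quantized_hf helper below is hypothetical, for illustration only.

from nemo.utils.import_utils import safe_import

mtq, HAVE_MTQ = safe_import("modelopt.torch.quantization")
mte, HAVE_MTE = safe_import("modelopt.torch.export")

def export_quantized_hf(model, export_dir: str) -> None:
    # Hypothetical helper: fail with a clear message when modelopt.torch.export is unavailable.
    if not HAVE_MTE:
        raise RuntimeError("nvidia-modelopt[torch] is required for HF checkpoint export")
    mte.export_hf_checkpoint(model, export_dir=export_dir)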

@@ -171,8 +163,15 @@ def _generate_sample(model):
mcore_inference = model.get_inference_wrapper(
params_dtype=torch.bfloat16, inference_batch_times_seqlen_threshold=30
)

generated = [r.generated_text for r in generate(mcore_inference, mcore_tokenizer, prompts)]
generated = [
r.generated_text
for r in generate(
mcore_inference,
mcore_tokenizer,
prompts,
inference_params=CommonInferenceParams(top_k=1, num_tokens_to_generate=30),
)
]
outputs = [prompt + generation for prompt, generation in zip(prompts, generated)]

logging.info(f"Sample generation after PTQ (with prompts): {outputs}")
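
The CommonInferenceParams added above pins sample generation to greedy decoding with a small token budget, so the post-PTQ sanity check is deterministic and cheap. A compact sketch of the same call (mcore_inference and mcore_tokenizer are the wrappers built earlier in _generate_sample; the prompt is made up):

from megatron.core.inference.common_inference_params import CommonInferenceParams
from nemo.collections.llm.inference import generate

params = CommonInferenceParams(top_k=1, num_tokens_to_generate=30)  # top_k=1 -> greedy decoding
results = generate(mcore_inference, mcore_tokenizer, ["Example prompt for a quick PTQ check"], inference_params=params)
sample_texts = [r.generated_text for r in results]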
@@ -358,7 +357,7 @@ def export(self, model, model_dir: str, trainer: Optional["Trainer"] = None) ->
assert is_automodel, "HF export is only supported for AutoModelForCausalLM"
unwrapped_model = unwrap_for_modelopt_operations(model)
with torch.inference_mode():
export_hf_checkpoint(
mte.export_hf_checkpoint(
unwrapped_model,
export_dir=export_dir,
)
@@ -370,7 +369,7 @@ def export(self, model, model_dir: str, trainer: Optional["Trainer"] = None) ->

with torch.inference_mode():
remove_hook_from_module(model, recurse=True)
export_tensorrt_llm_checkpoint(
mte.export_tensorrt_llm_checkpoint(
model=unwrap_for_modelopt_operations(model),
decoder_type=self._get_decoder_type(model),
dtype=self.torch_dtype,
1 change: 1 addition & 0 deletions nemo/export/quantize/quantizer.py
@@ -40,6 +40,7 @@
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
}

HAVE_MODELOPT = True
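
With "nvfp4" now exposed in both quantizers, selecting it resolves to mtq.NVFP4_DEFAULT_CFG. Applying any of these configs follows the usual modelopt post-training-quantization flow, roughly as sketched below (the calibration loop and calib_batches are placeholders, not part of this PR):

import modelopt.torch.quantization as mtq

def forward_loop(model):
    # Placeholder calibration loop: run a few representative batches through the model.
    for batch in calib_batches:
        model(batch)

model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)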
2 changes: 1 addition & 1 deletion requirements/requirements_nlp.txt
@@ -13,7 +13,7 @@ matplotlib>=3.3.2
megatron_core
nltk>=3.6.5
numpy<2 # tensorstore has an implicit compiled dependency on numpy<2
nvidia-modelopt[torch]==0.25.0 ; platform_system != 'Darwin'
nvidia-modelopt[torch]>=0.23.2,<=0.25.0 ; platform_system != 'Darwin'
nvidia-resiliency-ext; (platform_machine == 'x86_64' and platform_system != 'Darwin')
opencc
pangu
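
With the relaxed specifier, any nvidia-modelopt release from 0.23.2 through 0.25.0 satisfies the requirement on non-macOS platforms, e.g. pip install "nvidia-modelopt[torch]>=0.23.2,<=0.25.0"; get_quant_cfg_choices() then exposes whichever configs that particular version ships.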
4 changes: 3 additions & 1 deletion scripts/llm/ptq.py
@@ -16,10 +16,12 @@

from nemo.collections import llm
from nemo.collections.llm.modelopt import ExportConfig, QuantizationConfig
from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices


def get_args():
"""Parses PTQ arguments."""
QUANT_CFG_CHOICES_LIST = ["no_quant", *get_quant_cfg_choices()]
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="NeMo PTQ argument parser"
)
@@ -69,7 +71,7 @@ def get_args():
"--algorithm",
type=str,
default="fp8",
choices=["no_quant", "int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "nvfp4"],
choices=QUANT_CFG_CHOICES_LIST,
help="TensorRT-Model-Optimizer quantization algorithm",
)
parser.add_argument(
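
A minimal sketch of the dynamic-choices pattern used above (illustrative; the real parser defines many more arguments than shown):

import argparse

from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices

parser = argparse.ArgumentParser(description="NeMo PTQ argument parser")
parser.add_argument(
    "--algorithm",
    type=str,
    default="fp8",
    choices=["no_quant", *get_quant_cfg_choices()],  # only algorithms the installed modelopt actually provides
    help="TensorRT-Model-Optimizer quantization algorithm",
)
args = parser.parse_args(["--algorithm", "no_quant"])  # "no_quant" is always valid, even without modelopt
print(args.algorithm)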