
Update quantization scripts & relax modelopt requirement specifier #12709

Merged
merged 9 commits on Mar 24, 2025
54 changes: 54 additions & 0 deletions nemo/collections/llm/modelopt/quantization/quant_cfg_choices.py
@@ -0,0 +1,54 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict

from nemo.utils.import_utils import safe_import

mtq, HAVE_MODELOPT = safe_import("modelopt.torch.quantization")


def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
"""
Retrieve a dictionary of modelopt quantization configuration choices.

This function checks for the availability of specific quantization configurations defined in
the modelopt.torch.quantization (mtq) module and returns a dictionary mapping short names to
their corresponding configurations. The function is intended to work for different modelopt
library versions that come with variable configuration choices.

Returns:
dict: A dictionary where keys are short names (e.g., "fp8") and values are the
corresponding modelopt quantization configuration objects.
"""
if not HAVE_MODELOPT:
return {}

QUANT_CFG_NAMES = [
("int8", "INT8_DEFAULT_CFG"),
("int8_sq", "INT8_SMOOTHQUANT_CFG"),
("fp8", "FP8_DEFAULT_CFG"),
("int4_awq", "INT4_AWQ_CFG"),
("w4a8_awq", "W4A8_AWQ_BETA_CFG"),
("int4", "INT4_BLOCKWISE_WEIGHT_ONLY_CFG"),
("nvfp4", "NVFP4_DEFAULT_CFG"),
]

QUANT_CFG_CHOICES = {}

for short_name, full_name in QUANT_CFG_NAMES:
if config := getattr(mtq, full_name, None):
QUANT_CFG_CHOICES[short_name] = config

return QUANT_CFG_CHOICES
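
For reference, a minimal usage sketch of the new helper (illustrative only; the set of keys depends on which configs the installed modelopt build defines):

from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices

choices = get_quant_cfg_choices()   # {} when nvidia-modelopt is not installed
print(sorted(choices))              # e.g. ['fp8', 'int4', 'int4_awq', ...] on a recent modelopt release
fp8_cfg = choices.get("fp8")        # None if this modelopt version does not define FP8_DEFAULT_CFG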
35 changes: 17 additions & 18 deletions nemo/collections/llm/modelopt/quantization/quantizer.py
@@ -21,10 +21,12 @@
import torch
from accelerate.hooks import remove_hook_from_module
from datasets import load_dataset
from megatron.core.inference.common_inference_params import CommonInferenceParams
from tqdm import tqdm

from nemo.collections import llm
from nemo.collections.llm.inference import MCoreTokenizerWrappper, generate
from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices
from nemo.collections.llm.utils import barrier, torch_dtype_from_precision
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir
@@ -37,21 +39,11 @@
from nemo.lightning import Trainer
from nemo.lightning.megatron_parallel import MegatronParallel

_, HAVE_MODELOPT = safe_import("modelopt")
if HAVE_MODELOPT:
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint, export_tensorrt_llm_checkpoint

QUANT_CFG_CHOICES = {
"int8": mtq.INT8_DEFAULT_CFG,
"int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
"fp8": mtq.FP8_DEFAULT_CFG,
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
}
mtq, HAVE_MODELOPT_MTQ = safe_import("modelopt.torch.quantization")
mte, HAVE_MODELOPT_MTE = safe_import("modelopt.torch.export")
HAVE_MODELOPT = HAVE_MODELOPT_MTQ and HAVE_MODELOPT_MTE

QUANT_CFG_CHOICES = get_quant_cfg_choices()
SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers
SUPPORTED_EXPORT_FMT = ["trtllm", "nemo", "hf"]
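
A hedged sketch of the guarded-import pattern introduced here: safe_import returns a (module, available) pair, so modelopt-specific entry points can be gated on the flag instead of failing at import time. The export_quantized_hf helper below is hypothetical, for illustration only.

from nemo.utils.import_utils import safe_import

mtq, HAVE_MTQ = safe_import("modelopt.torch.quantization")
mte, HAVE_MTE = safe_import("modelopt.torch.export")

def export_quantized_hf(model, export_dir: str) -> None:
    # Hypothetical helper: fail with a clear message when modelopt.torch.export is unavailable.
    if not HAVE_MTE:
        raise RuntimeError("nvidia-modelopt[torch] is required for HF checkpoint export")
    mte.export_hf_checkpoint(model, export_dir=export_dir)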

@@ -171,8 +163,15 @@ def _generate_sample(model):
mcore_inference = model.get_inference_wrapper(
params_dtype=torch.bfloat16, inference_batch_times_seqlen_threshold=30
)

generated = [r.generated_text for r in generate(mcore_inference, mcore_tokenizer, prompts)]
generated = [
r.generated_text
for r in generate(
mcore_inference,
mcore_tokenizer,
prompts,
inference_params=CommonInferenceParams(top_k=1, num_tokens_to_generate=30),
)
]
outputs = [prompt + generation for prompt, generation in zip(prompts, generated)]

logging.info(f"Sample generation after PTQ (with prompts): {outputs}")
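
The CommonInferenceParams added above pins sample generation to greedy decoding with a small token budget, so the post-PTQ sanity check is deterministic and cheap. A compact sketch of the same call (mcore_inference and mcore_tokenizer are the wrappers built earlier in _generate_sample; the prompt is made up):

from megatron.core.inference.common_inference_params import CommonInferenceParams
from nemo.collections.llm.inference import generate

params = CommonInferenceParams(top_k=1, num_tokens_to_generate=30)  # top_k=1 -> greedy decoding
results = generate(mcore_inference, mcore_tokenizer, ["Example prompt for a quick PTQ check"], inference_params=params)
sample_texts = [r.generated_text for r in results]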
@@ -358,7 +357,7 @@ def export(self, model, model_dir: str, trainer: Optional["Trainer"] = None) ->
assert is_automodel, "HF export is only supported for AutoModelForCausalLM"
unwrapped_model = unwrap_for_modelopt_operations(model)
with torch.inference_mode():
export_hf_checkpoint(
mte.export_hf_checkpoint(
unwrapped_model,
export_dir=export_dir,
)
@@ -370,7 +369,7 @@ def export(self, model, model_dir: str, trainer: Optional["Trainer"] = None) ->

with torch.inference_mode():
remove_hook_from_module(model, recurse=True)
export_tensorrt_llm_checkpoint(
mte.export_tensorrt_llm_checkpoint(
model=unwrap_for_modelopt_operations(model),
decoder_type=self._get_decoder_type(model),
dtype=self.torch_dtype,
1 change: 1 addition & 0 deletions nemo/export/quantize/quantizer.py
@@ -40,6 +40,7 @@
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
}

HAVE_MODELOPT = True
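
With "nvfp4" now exposed in both quantizers, selecting it resolves to mtq.NVFP4_DEFAULT_CFG. Applying any of these configs follows the usual modelopt post-training-quantization flow, roughly as sketched below (the calibration loop and calib_batches are placeholders, not part of this PR):

import modelopt.torch.quantization as mtq

def forward_loop(model):
    # Placeholder calibration loop: run a few representative batches through the model.
    for batch in calib_batches:
        model(batch)

model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)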
2 changes: 1 addition & 1 deletion requirements/requirements_nlp.txt
@@ -13,7 +13,7 @@ matplotlib>=3.3.2
megatron_core
nltk>=3.6.5
numpy<2 # tensorstore has an implicit compiled dependency on numpy<2
nvidia-modelopt[torch]==0.25.0 ; platform_system != 'Darwin'
nvidia-modelopt[torch]>=0.23.2,<=0.25.0 ; platform_system != 'Darwin'
nvidia-resiliency-ext; (platform_machine == 'x86_64' and platform_system != 'Darwin')
opencc
pangu
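
With the relaxed specifier, any nvidia-modelopt release from 0.23.2 through 0.25.0 satisfies the requirement on non-macOS platforms, e.g. pip install "nvidia-modelopt[torch]>=0.23.2,<=0.25.0"; get_quant_cfg_choices() then exposes whichever configs that particular version ships.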
4 changes: 3 additions & 1 deletion scripts/llm/ptq.py
@@ -16,10 +16,12 @@

from nemo.collections import llm
from nemo.collections.llm.modelopt import ExportConfig, QuantizationConfig
from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices


def get_args():
"""Parses PTQ arguments."""
QUANT_CFG_CHOICES_LIST = ["no_quant", *get_quant_cfg_choices()]
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="NeMo PTQ argument parser"
)
@@ -69,7 +71,7 @@ def get_args():
"--algorithm",
type=str,
default="fp8",
choices=["no_quant", "int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "nvfp4"],
choices=QUANT_CFG_CHOICES_LIST,
help="TensorRT-Model-Optimizer quantization algorithm",
)
parser.add_argument(
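
A minimal sketch of the dynamic-choices pattern used above (illustrative; the real parser defines many more arguments than shown):

import argparse

from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices

parser = argparse.ArgumentParser(description="NeMo PTQ argument parser")
parser.add_argument(
    "--algorithm",
    type=str,
    default="fp8",
    choices=["no_quant", *get_quant_cfg_choices()],  # only algorithms the installed modelopt actually provides
    help="TensorRT-Model-Optimizer quantization algorithm",
)
args = parser.parse_args(["--algorithm", "no_quant"])  # "no_quant" is always valid, even without modelopt
print(args.algorithm)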