
Commit 66fa4f2

fix CI failure caused by internal changes
Signed-off-by: Xin He <xinhe3@habana.ai>
1 parent 1008215 commit 66fa4f2

File tree

12 files changed, +68 -69 lines changed


neural_compressor/torch/algorithms/layer_wise/utils.py

+7 -11

@@ -27,12 +27,9 @@
 
 from neural_compressor.common import options
 from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear
-from neural_compressor.torch.utils import is_hpex_available
+from neural_compressor.torch.utils import is_hpu_available
 from neural_compressor.torch.utils.utility import dowload_hf_model
 
-if is_hpex_available():
-    import habana_frameworks
-
 from .load import load
 
 LWQ_WORKSPACE = os.path.join(options.workspace, "lwq_tmpdir")
@@ -262,8 +259,8 @@ def load_first_layer_only(user_model, model_name):
         model_name (str): model name or path
     """
     for name, m in user_model.named_modules():
-        if ("layers" not in name or "layers.0" in name) and len(name) > 0 and len(list(m.named_children())) == 0:
-            load_module(user_model, name, get_path(model_name), device="hpu" if is_hpex_available() else "cpu")
+        if ('layers' not in name or 'layers.0' in name) and len(name) > 0 and len(list(m.named_children())) == 0:
+            load_module(user_model, name, get_path(model_name), device="hpu" if is_hpu_available() else "cpu")
 
 
 def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None, indicated_layers=None):
@@ -337,8 +334,6 @@ def hook(module, input, output):
 
 
 def clean_module_weight(module):
-    """Clean module weight."""
-    hpu_available = is_hpex_available()
     """Clean module weight."""
     if isinstance(module, QDQLayer):
         submodule = module.module
@@ -360,8 +355,9 @@ def clean_module_weight(module):
             else:
                 param_cls = type(submodule._parameters[n])
                 kwargs = submodule._parameters[n].__dict__
-                if hpu_available:
-                    if param_cls == habana_frameworks.torch.core.weight_sharing.HabanaParameterWrapper:
+                if is_hpu_available:
+                    from habana_frameworks.torch.core import weight_sharing
+                    if param_cls == weight_sharing.HabanaParameterWrapper:
                         try:
                             kwargs.pop("change_device_placement")
                         except KeyError:
@@ -465,7 +461,7 @@ def load_model_from_shards_with_safetensors(shard_dir, bin_index_file):
     for shard_file in shard_files:
        shard_path = os.path.join(shard_dir, shard_file)
        print(f"Loading shard from {shard_path}")
-        shard_state_dict = load_file(shard_path, device="hpu" if is_hpex_available() else "cpu")
+        shard_state_dict = load_file(shard_path, device="hpu" if is_hpu_available() else "cpu")
        full_state_dict.update(shard_state_dict)
 
    return full_state_dict
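
Aside: the hunks above drop the module-level habana_frameworks import and key the device decision on the active accelerator instead. A minimal sketch of that lazy-import pattern, with a hypothetical helper name (not code from the repository), assuming is_hpu_available() is exported from neural_compressor.torch.utils:

# Sketch only: defer the Gaudi import until the HPU accelerator is known to be
# active, so CPU-only environments never import habana_frameworks at load time.
from neural_compressor.torch.utils import is_hpu_available

def strip_hpu_param_kwargs(param_cls, kwargs):
    """Drop the HPU-specific kwarg when the parameter is a HabanaParameterWrapper."""
    if is_hpu_available():
        from habana_frameworks.torch.core import weight_sharing  # deferred import
        if param_cls is weight_sharing.HabanaParameterWrapper:
            kwargs.pop("change_device_placement", None)  # missing key is fine
    return kwargs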

neural_compressor/torch/algorithms/weight_only/save_load.py

+36 -40

@@ -68,46 +68,42 @@ def save(model, output_dir="./saved_results", format=SaveLoadFormat.DEFAULT, **k
    os.makedirs(output_dir, exist_ok=True)
    cur_accelerator.synchronize()
    if format == SaveLoadFormat.HUGGINGFACE:  # pragma: no cover
-        config = model.config
-        config_file = "quantize_config.json"
-        quantization_config = config.quantization_config if hasattr(config, "quantization_config") else None
-        if quantization_config and "backend" in quantization_config and "auto_round" in quantization_config["backend"]:
-            safe_serialization = kwargs.get("safe_serialization", True)
-            tokenizer = kwargs.get("tokenizer", None)
-            max_shard_size = kwargs.get("max_shard_size", "5GB")
-            if tokenizer is not None:
-                tokenizer.save_pretrained(output_dir)
-            del model.save
-            model.save_pretrained(
-                output_dir,
-                max_shard_size=max_shard_size,
-                safe_serialization=safe_serialization,
-                state_dict=model.state_dict() if "model_state_dict" not in kwargs else kwargs["model_state_dict"],
-            )
-            with open(os.path.join(output_dir, config_file), "w", encoding="utf-8") as f:
-                json.dump(quantization_config, f, indent=2)
-            return
-
-    output_folder = os.path.abspath(os.path.expanduser(output_dir))
-    qmodel_weight_file_path = os.path.join(output_folder, WEIGHT_NAME)
-    qconfig_file_path = os.path.join(output_folder, QCONFIG_NAME)
-    # saving process
-    save_config_mapping(model.qconfig, qconfig_file_path)
-
-    # MethodType 'save' not in state_dict
-    del model.save
-    if "blockwise" in kwargs:
-        from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, save_layers_in_shards_iteratively
-
-        checkpoints_folder = kwargs.get("blockwise_load_folder", None)
-        if not checkpoints_folder:
-            checkpoints_folder = LWQ_WORKSPACE
-        save_layers_in_shards_iteratively(checkpoints_folder, output_folder, layers_per_shard=8)
-    else:
-        model_state_dict = model.state_dict()  # if 'model_state_dict' not in kwargs else kwargs['model_state_dict']
-        torch.save(model_state_dict, qmodel_weight_file_path)
-        logger.info("Save quantized model weight to {}.".format(qmodel_weight_file_path))
-        logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
+        quantization_config_file = "quantize_config.json"
+        safe_serialization = kwargs.get("safe_serialization", True)
+        max_shard_size = kwargs.get("max_shard_size", f"{MAX_FILE_SIZE}GB")
+        if not hasattr(model.config, "quantization_config"):
+            quantization_config = change_config_to_hf_format(model.qconfig)
+            model.config.quantization_config = quantization_config
+        # save model state_dict and config.json
+        model.save_pretrained(output_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+        # save quantize_config.json
+        with open(os.path.join(output_dir, quantization_config_file), "w", encoding="utf-8") as f:
+            json.dump(quantization_config, f, indent=2)
+        # save generation_config.json
+        if hasattr(model, "generation_config") and model.generation_config is not None:
+            model.generation_config.save_pretrained(output_dir)
+        # save tokenizer
+        tokenizer = kwargs.get("tokenizer", None)
+        if tokenizer is not None:
+            tokenizer.save_pretrained(output_dir)
+        return
+    elif format == SaveLoadFormat.DEFAULT:
+        output_folder = os.path.abspath(os.path.expanduser(output_dir))
+        qmodel_weight_file_path = os.path.join(output_folder, WEIGHT_NAME)
+        qconfig_file_path = os.path.join(output_folder, QCONFIG_NAME)
+        # saving process
+        save_config_mapping(model.qconfig, qconfig_file_path)
+        if 'blockwise' in kwargs:
+            from neural_compressor.torch.algorithms.layer_wise import save_layers_in_shards_iteratively, LWQ_WORKSPACE
+            checkpoints_folder = kwargs.get("blockwise_load_folder", None)
+            if not checkpoints_folder:
+                checkpoints_folder = LWQ_WORKSPACE
+            save_layers_in_shards_iteratively(checkpoints_folder, output_folder, layers_per_shard=8)
+        else:
+            model_state_dict = model.state_dict()  # if 'model_state_dict' not in kwargs else kwargs['model_state_dict']
+            torch.save(model_state_dict, qmodel_weight_file_path)
+            logger.info("Save quantized model weight to {}.".format(qmodel_weight_file_path))
+            logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
 
 
 def load(model_name_or_path, original_model=None, format=SaveLoadFormat.DEFAULT, device="cpu", **kwargs):
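
For orientation, a hedged usage sketch of the reworked HUGGINGFACE branch (the model, tokenizer, and output path are placeholders, and it assumes save is exported from neural_compressor.torch.quantization alongside load):

# Sketch only: save a quantized model in Hugging Face layout. Per the branch
# above, this writes the sharded weights plus config.json (with an embedded
# quantization_config), quantize_config.json, generation_config.json when
# present, and the tokenizer files.
from neural_compressor.torch.quantization import save
from neural_compressor.torch.utils import SaveLoadFormat

save(
    q_model,                            # model returned by convert(); placeholder name
    output_dir="./saved_hf_model",
    format=SaveLoadFormat.HUGGINGFACE,
    tokenizer=tokenizer,                # optional; saved next to the weights
    max_shard_size="5GB",               # overrides the MAX_FILE_SIZE-based default
    safe_serialization=True,
)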

neural_compressor/torch/quantization/config.py

+1 -1

@@ -935,7 +935,7 @@ def __init__(
        act_group_size: int = None,
        act_sym: bool = None,
        act_dynamic: bool = True,
-        act_dtype: Optional[str] = None,
+        act_dtype: Optional[str] = "int",
        enable_full_range: bool = False,
        batch_size: int = 8,
        lr_scheduler=None,
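
A hedged illustration of the effect; it assumes, from the surrounding parameters (enable_full_range, lr_scheduler, act_dynamic), that this __init__ belongs to AutoRoundConfig:

# Sketch only: with the new default, activation dtype falls back to "int" when
# the caller does not pass act_dtype explicitly.
from neural_compressor.torch.quantization import AutoRoundConfig

cfg = AutoRoundConfig()      # act_dtype not passed
print(cfg.act_dtype)         # expected to print "int" after this change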

neural_compressor/torch/quantization/quantize.py

+2 -1

@@ -157,7 +157,8 @@ def prepare(
    else:
        model_info = quant_config.get_model_info(model=prepared_model)
 
-    if hasattr(quant_config, "model_path") and quant_config.model_path == "":
+    if (hasattr(quant_config, "model_path") and quant_config.model_path == ""
+            and hasattr(prepared_model, "name_or_path")):
        quant_config.model_path = prepared_model.name_or_path
    configs_mapping = quant_config.to_config_mapping(model_info=model_info)
    logger.debug(configs_mapping)
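
Why the extra hasattr guard helps, as an illustrative sketch rather than code from the repository: transformers models expose name_or_path, but a plain torch.nn.Module does not, so without the guard the assignment could raise AttributeError during prepare().

# Sketch only: a raw nn.Module carries no name_or_path attribute.
import torch

class TinyNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 4)

model = TinyNet()
print(hasattr(model, "name_or_path"))   # False for plain modules; HF PreTrainedModel instances set it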

neural_compressor/torch/utils/environ.py

+5

@@ -74,6 +74,11 @@ def is_hpex_available():
    return _hpex_available
 
 
+def is_hpu_available():
+    """Returns whether hpex is available."""
+    return get_accelerator().name() == "hpu"
+
+
 ## check optimum
 if is_package_available("optimum"):
    _optimum_available = True
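
The distinction the new helper introduces, as a hedged sketch with illustrative calls only: is_hpex_available() reports whether the Habana software stack is importable, while is_hpu_available() reports whether the currently selected accelerator is actually an HPU, which is what the device-placement and skipif call sites in this commit care about.

# Sketch only: pick the device from the active accelerator rather than from
# whether the Gaudi packages happen to be installed.
from neural_compressor.torch.utils import is_hpex_available, is_hpu_available

device = "hpu" if is_hpu_available() else "cpu"   # mirrors the new call sites in this commit
print(f"hpex installed: {is_hpex_available()}, active device: {device}")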

neural_compressor/transformers/quantization/utils.py

+1 -1

@@ -629,7 +629,7 @@ def set_nontext_module_config(model, to_quant_block_names, config):
 def convert_to_GPTQ_checkpoints(model, quantization_config):
    from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear
 
-    from neural_compressor.adaptor.torch_utils.util import set_module
+    from neural_compressor.torch.utils import set_module
    from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear
 
    dtype = "int4" if quantization_config.bits == 4 else "int8"

test/3x/torch/quantization/fp8_quant/test_fp8_static_quant.py

+3 -3

@@ -16,7 +16,7 @@
    save,
    load
 )
-from neural_compressor.torch.utils import is_hpex_available, get_used_hpu_mem_MB
+from neural_compressor.torch.utils import is_hpu_available, get_used_hpu_mem_MB
 
 
 def change_to_cur_file_dir():
@@ -33,7 +33,7 @@ def calib_func(model):
    for i in range(2):
        model(example_inputs)
 
-@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+@pytest.mark.skipif(not is_hpu_available(), reason="HPU environment is required!")
 class TestFP8StaticQuantNLP:
    def setup_class(self):
        change_to_cur_file_dir()
@@ -88,7 +88,7 @@ def test_two_step_quant_nlp(self):
 
 
 @pytest.mark.xfail(reason="[SW-219514] RuntimeError: operator torchvision::nms does not exist")
-@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+@pytest.mark.skipif(not is_hpu_available(), reason="HPU environment is required!")
 class TestFP8StaticQuantCV:
    def setup_class(self):
        change_to_cur_file_dir()

test/3x/torch/quantization/fp8_quant/test_gptq_mixed_precision.py

+2 -2

@@ -10,7 +10,7 @@
 from transformers import AutoTokenizer
 from datasets import load_dataset
 from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare, HybridGPTQConfig
-from neural_compressor.torch.utils import is_hpex_available, get_accelerator
+from neural_compressor.torch.utils import is_hpu_available, get_accelerator
 from neural_compressor.torch.quantization.save_load_entry import load
 
 
@@ -42,7 +42,7 @@ def calib_func(model, dataset, tokenizer=None):
        cur_accelerator.synchronize()
        logits.append(logs.detach().cpu())
 
-@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+@pytest.mark.skipif(not is_hpu_available(), reason="HPU environment is required!")
 class TestGPTQwithFP8Quant:
    def setup_class(self):
        change_to_cur_file_dir()

test/3x/torch/quantization/weight_only/test_gptq.py

+5 -5

@@ -14,7 +14,7 @@
    prepare,
    quantize,
 )
-from neural_compressor.torch.utils import accelerator, is_hpex_available
+from neural_compressor.torch.utils import accelerator, is_hpu_available
 
 device = accelerator.name()
 
@@ -184,7 +184,7 @@ def test_act_order(self):
        # compare atol, this case is an ideal case.
        assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check."
 
-
+    @pytest.mark.skipif(not is_hpu_available(), reason="These tests are not supported on HPU for now.")
    def test_block_wise(self):
        from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE
        from neural_compressor.torch import load_empty_model
@@ -225,7 +225,7 @@ def test_block_wise(self):
 
        kwargs = {'sharded_checkpoints': True}
 
-        loaded_model = load(LWQ_WORKSPACE+"/checkpoint/", copy.deepcopy(self.tiny_gptj), **kwargs)
+        loaded_model = load(LWQ_WORKSPACE+"/checkpoint/", copy.deepcopy(self.tiny_gptj), **kwargs).to(device)
 
        out = loaded_model(self.example_inputs)[0]
 
@@ -298,7 +298,7 @@ def test_true_sequential(self):
        ), "true_sequential=True doesn't help accuracy, maybe is reasonable, please double check."
 
    # TODO [SW-216127]: it's not in high priority, so we can implement it later.
-    @pytest.mark.skipif(is_hpex_available(), reason="These tests are not supported on HPU for now.")
+    @pytest.mark.skipif(is_hpu_available(), reason="These tests are not supported on HPU for now.")
    def test_quant_lm_head(self):
        # quant_lm_head=False
        model = copy.deepcopy(self.tiny_gptj)
@@ -367,7 +367,7 @@ def test_double_quant_params(self, dtype, double_quant_bits, double_quant_group_
        assert torch.allclose(atol_false, atol_true, atol=0.008), "atol is very close, double checked the logic."
 
    # TODO [SW-216127]: it's not in high priority, so we can implement it later.
-    @pytest.mark.skipif(is_hpex_available(), reason="These tests are not supported on HPU for now.")
+    @pytest.mark.skipif(is_hpu_available(), reason="These tests are not supported on HPU for now.")
    def test_conv1d(self):
        from transformers import GPT2Model, GPT2Tokenizer
test/3x/torch/quantization/weight_only/test_load.py

+3 -3

@@ -7,7 +7,7 @@
 import transformers
 
 from neural_compressor.torch.quantization import load
-from neural_compressor.torch.utils import SaveLoadFormat, accelerator, is_hpex_available
+from neural_compressor.torch.utils import SaveLoadFormat, accelerator, is_hpu_available
 
 device = accelerator.current_device_name()
 
@@ -55,7 +55,7 @@ def test_load_hf_woq_model_cpu(self):
        output = qmodel(self.example_inputs.to("cpu"))[0]
        assert len(output) > 0, "Not loading the model correctly"
 
-    @pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.")
+    @pytest.mark.skipif(not is_hpu_available(), reason="no hpex in environment here.")
    def test_load_hf_woq_model_hpu(self):
        # use huggingface model_id (format=huggingface, device="hpu")
        # first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save hpu_model.safetensors to local cache dir
@@ -88,7 +88,7 @@ def test_load_hf_woq_model_hpu(self):
            output1, output2
        ), "The model loaded the second time is different from the model loaded the first time"
 
-    @pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.")
+    @pytest.mark.skipif(not is_hpu_available(), reason="no hpex in environment here.")
    def test_load_hf_woq_model_hpu_special_case(self):
        # this model contains tensors sharing memory
        model = load(

test/3x/torch/quantization/weight_only/test_rtn.py

+2 -2

@@ -13,7 +13,7 @@
    prepare,
    quantize,
 )
-from neural_compressor.torch.utils import accelerator, is_hpex_available
+from neural_compressor.torch.utils import accelerator, is_hpu_available
 
 device = accelerator.name()
 
@@ -310,7 +310,7 @@ def test_rtn_with_quantize_API(self):
 
    # TODO: (4, True, 32, 0), group_dim=0, format not supported
    # TODO [SW-216127]: it's not in high priority, so we can implement it later.
-    @pytest.mark.skipif(is_hpex_available(), reason="These tests are not supported on HPU for now.")
+    @pytest.mark.skipif(is_hpu_available(), reason="These tests are not supported on HPU for now.")
    @pytest.mark.parametrize(
        "bits, use_sym, group_size, group_dim",
        [

test/3x/torch/requirements.txt

+1

@@ -9,3 +9,4 @@ psutil
 pytest
 torchvision
 transformers
+datasets
