
Commit 66fa4f2

fix CI failure caused by internal changes
Signed-off-by: Xin He <xinhe3@habana.ai>
1 parent 1008215 commit 66fa4f2

File tree

12 files changed, +68 -69 lines changed


neural_compressor/torch/algorithms/layer_wise/utils.py

+7 -11

@@ -27,12 +27,9 @@
 
 from neural_compressor.common import options
 from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear
-from neural_compressor.torch.utils import is_hpex_available
+from neural_compressor.torch.utils import is_hpu_available
 from neural_compressor.torch.utils.utility import dowload_hf_model
 
-if is_hpex_available():
-    import habana_frameworks
-
 from .load import load
 
 LWQ_WORKSPACE = os.path.join(options.workspace, "lwq_tmpdir")
@@ -262,8 +259,8 @@ def load_first_layer_only(user_model, model_name):
         model_name (str): model name or path
     """
     for name, m in user_model.named_modules():
-        if ("layers" not in name or "layers.0" in name) and len(name) > 0 and len(list(m.named_children())) == 0:
-            load_module(user_model, name, get_path(model_name), device="hpu" if is_hpex_available() else "cpu")
+        if ('layers' not in name or 'layers.0' in name) and len(name) > 0 and len(list(m.named_children())) == 0:
+            load_module(user_model, name, get_path(model_name), device="hpu" if is_hpu_available() else "cpu")
 
 
 def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None, indicated_layers=None):
@@ -337,8 +334,6 @@ def hook(module, input, output):
 
 
 def clean_module_weight(module):
-    """Clean module weight."""
-    hpu_available = is_hpex_available()
     """Clean module weight."""
     if isinstance(module, QDQLayer):
         submodule = module.module
@@ -360,8 +355,9 @@ def clean_module_weight(module):
             else:
                 param_cls = type(submodule._parameters[n])
                 kwargs = submodule._parameters[n].__dict__
-                if hpu_available:
-                    if param_cls == habana_frameworks.torch.core.weight_sharing.HabanaParameterWrapper:
+                if is_hpu_available:
+                    from habana_frameworks.torch.core import weight_sharing
+                    if param_cls == weight_sharing.HabanaParameterWrapper:
                         try:
                             kwargs.pop("change_device_placement")
                         except KeyError:
@@ -465,7 +461,7 @@ def load_model_from_shards_with_safetensors(shard_dir, bin_index_file):
     for shard_file in shard_files:
        shard_path = os.path.join(shard_dir, shard_file)
        print(f"Loading shard from {shard_path}")
-        shard_state_dict = load_file(shard_path, device="hpu" if is_hpex_available() else "cpu")
+        shard_state_dict = load_file(shard_path, device="hpu" if is_hpu_available() else "cpu")
        full_state_dict.update(shard_state_dict)
 
    return full_state_dict
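
Aside: the hunks above drop the module-level habana_frameworks import and key the device decision on the active accelerator instead. A minimal sketch of that lazy-import pattern, with a hypothetical helper name (not code from the repository), assuming is_hpu_available() is exported from neural_compressor.torch.utils:

# Sketch only: defer the Gaudi import until the HPU accelerator is known to be
# active, so CPU-only environments never import habana_frameworks at load time.
from neural_compressor.torch.utils import is_hpu_available

def strip_hpu_param_kwargs(param_cls, kwargs):
    """Drop the HPU-specific kwarg when the parameter is a HabanaParameterWrapper."""
    if is_hpu_available():
        from habana_frameworks.torch.core import weight_sharing  # deferred import
        if param_cls is weight_sharing.HabanaParameterWrapper:
            kwargs.pop("change_device_placement", None)  # missing key is fine
    return kwargs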

neural_compressor/torch/algorithms/weight_only/save_load.py

+36 -40

@@ -68,46 +68,42 @@ def save(model, output_dir="./saved_results", format=SaveLoadFormat.DEFAULT, **k
    os.makedirs(output_dir, exist_ok=True)
    cur_accelerator.synchronize()
    if format == SaveLoadFormat.HUGGINGFACE:  # pragma: no cover
-        config = model.config
-        config_file = "quantize_config.json"
-        quantization_config = config.quantization_config if hasattr(config, "quantization_config") else None
-        if quantization_config and "backend" in quantization_config and "auto_round" in quantization_config["backend"]:
-            safe_serialization = kwargs.get("safe_serialization", True)
-            tokenizer = kwargs.get("tokenizer", None)
-            max_shard_size = kwargs.get("max_shard_size", "5GB")
-            if tokenizer is not None:
-                tokenizer.save_pretrained(output_dir)
-            del model.save
-            model.save_pretrained(
-                output_dir,
-                max_shard_size=max_shard_size,
-                safe_serialization=safe_serialization,
-                state_dict=model.state_dict() if "model_state_dict" not in kwargs else kwargs["model_state_dict"],
-            )
-            with open(os.path.join(output_dir, config_file), "w", encoding="utf-8") as f:
-                json.dump(quantization_config, f, indent=2)
-            return
-
-    output_folder = os.path.abspath(os.path.expanduser(output_dir))
-    qmodel_weight_file_path = os.path.join(output_folder, WEIGHT_NAME)
-    qconfig_file_path = os.path.join(output_folder, QCONFIG_NAME)
-    # saving process
-    save_config_mapping(model.qconfig, qconfig_file_path)
-
-    # MethodType 'save' not in state_dict
-    del model.save
-    if "blockwise" in kwargs:
-        from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, save_layers_in_shards_iteratively
-
-        checkpoints_folder = kwargs.get("blockwise_load_folder", None)
-        if not checkpoints_folder:
-            checkpoints_folder = LWQ_WORKSPACE
-        save_layers_in_shards_iteratively(checkpoints_folder, output_folder, layers_per_shard=8)
-    else:
-        model_state_dict = model.state_dict()  # if 'model_state_dict' not in kwargs else kwargs['model_state_dict']
-        torch.save(model_state_dict, qmodel_weight_file_path)
-        logger.info("Save quantized model weight to {}.".format(qmodel_weight_file_path))
-        logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
+        quantization_config_file = "quantize_config.json"
+        safe_serialization = kwargs.get("safe_serialization", True)
+        max_shard_size = kwargs.get("max_shard_size", f"{MAX_FILE_SIZE}GB")
+        if not hasattr(model.config, "quantization_config"):
+            quantization_config = change_config_to_hf_format(model.qconfig)
+            model.config.quantization_config = quantization_config
+        # save model state_dict and config.json
+        model.save_pretrained(output_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+        # save quantize_config.json
+        with open(os.path.join(output_dir, quantization_config_file), "w", encoding="utf-8") as f:
+            json.dump(quantization_config, f, indent=2)
+        # save generation_config.json
+        if hasattr(model, "generation_config") and model.generation_config is not None:
+            model.generation_config.save_pretrained(output_dir)
+        # save tokenizer
+        tokenizer = kwargs.get("tokenizer", None)
+        if tokenizer is not None:
+            tokenizer.save_pretrained(output_dir)
+        return
+    elif format == SaveLoadFormat.DEFAULT:
+        output_folder = os.path.abspath(os.path.expanduser(output_dir))
+        qmodel_weight_file_path = os.path.join(output_folder, WEIGHT_NAME)
+        qconfig_file_path = os.path.join(output_folder, QCONFIG_NAME)
+        # saving process
+        save_config_mapping(model.qconfig, qconfig_file_path)
+        if 'blockwise' in kwargs:
+            from neural_compressor.torch.algorithms.layer_wise import save_layers_in_shards_iteratively, LWQ_WORKSPACE
+            checkpoints_folder = kwargs.get("blockwise_load_folder", None)
+            if not checkpoints_folder:
+                checkpoints_folder = LWQ_WORKSPACE
+            save_layers_in_shards_iteratively(checkpoints_folder, output_folder, layers_per_shard=8)
+        else:
+            model_state_dict = model.state_dict()  # if 'model_state_dict' not in kwargs else kwargs['model_state_dict']
+            torch.save(model_state_dict, qmodel_weight_file_path)
+            logger.info("Save quantized model weight to {}.".format(qmodel_weight_file_path))
+            logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
 
 
 def load(model_name_or_path, original_model=None, format=SaveLoadFormat.DEFAULT, device="cpu", **kwargs):
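
For orientation, a hedged usage sketch of the reworked HUGGINGFACE branch (the model, tokenizer, and output path are placeholders, and it assumes save is exported from neural_compressor.torch.quantization alongside load):

# Sketch only: save a quantized model in Hugging Face layout. Per the branch
# above, this writes the sharded weights plus config.json (with an embedded
# quantization_config), quantize_config.json, generation_config.json when
# present, and the tokenizer files.
from neural_compressor.torch.quantization import save
from neural_compressor.torch.utils import SaveLoadFormat

save(
    q_model,                            # model returned by convert(); placeholder name
    output_dir="./saved_hf_model",
    format=SaveLoadFormat.HUGGINGFACE,
    tokenizer=tokenizer,                # optional; saved next to the weights
    max_shard_size="5GB",               # overrides the MAX_FILE_SIZE-based default
    safe_serialization=True,
)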

neural_compressor/torch/quantization/config.py

+1 -1

@@ -935,7 +935,7 @@ def __init__(
        act_group_size: int = None,
        act_sym: bool = None,
        act_dynamic: bool = True,
-        act_dtype: Optional[str] = None,
+        act_dtype: Optional[str] = "int",
        enable_full_range: bool = False,
        batch_size: int = 8,
        lr_scheduler=None,
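
A hedged illustration of the effect; it assumes, from the surrounding parameters (enable_full_range, lr_scheduler, act_dynamic), that this __init__ belongs to AutoRoundConfig:

# Sketch only: with the new default, activation dtype falls back to "int" when
# the caller does not pass act_dtype explicitly.
from neural_compressor.torch.quantization import AutoRoundConfig

cfg = AutoRoundConfig()      # act_dtype not passed
print(cfg.act_dtype)         # expected to print "int" after this change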

neural_compressor/torch/quantization/quantize.py

+2 -1

@@ -157,7 +157,8 @@ def prepare(
    else:
        model_info = quant_config.get_model_info(model=prepared_model)
 
-    if hasattr(quant_config, "model_path") and quant_config.model_path == "":
+    if (hasattr(quant_config, "model_path") and quant_config.model_path == ""
+            and hasattr(prepared_model, "name_or_path")):
        quant_config.model_path = prepared_model.name_or_path
    configs_mapping = quant_config.to_config_mapping(model_info=model_info)
    logger.debug(configs_mapping)
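
Why the extra hasattr guard helps, as an illustrative sketch rather than code from the repository: transformers models expose name_or_path, but a plain torch.nn.Module does not, so without the guard the assignment could raise AttributeError during prepare().

# Sketch only: a raw nn.Module carries no name_or_path attribute.
import torch

class TinyNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 4)

model = TinyNet()
print(hasattr(model, "name_or_path"))   # False for plain modules; HF PreTrainedModel instances set it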

neural_compressor/torch/utils/environ.py

+5

@@ -74,6 +74,11 @@ def is_hpex_available():
    return _hpex_available
 
 
+def is_hpu_available():
+    """Returns whether hpex is available."""
+    return get_accelerator().name() == "hpu"
+
+
 ## check optimum
 if is_package_available("optimum"):
    _optimum_available = True
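
The distinction the new helper introduces, as a hedged sketch with illustrative calls only: is_hpex_available() reports whether the Habana software stack is importable, while is_hpu_available() reports whether the currently selected accelerator is actually an HPU, which is what the device-placement and skipif call sites in this commit care about.

# Sketch only: pick the device from the active accelerator rather than from
# whether the Gaudi packages happen to be installed.
from neural_compressor.torch.utils import is_hpex_available, is_hpu_available

device = "hpu" if is_hpu_available() else "cpu"   # mirrors the new call sites in this commit
print(f"hpex installed: {is_hpex_available()}, active device: {device}")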

neural_compressor/transformers/quantization/utils.py

+1 -1

@@ -629,7 +629,7 @@ def set_nontext_module_config(model, to_quant_block_names, config):
 def convert_to_GPTQ_checkpoints(model, quantization_config):
    from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear
 
-    from neural_compressor.adaptor.torch_utils.util import set_module
+    from neural_compressor.torch.utils import set_module
    from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear
 
    dtype = "int4" if quantization_config.bits == 4 else "int8"

test/3x/torch/quantization/fp8_quant/test_fp8_static_quant.py

+3 -3

@@ -16,7 +16,7 @@
    save,
    load
 )
-from neural_compressor.torch.utils import is_hpex_available, get_used_hpu_mem_MB
+from neural_compressor.torch.utils import is_hpu_available, get_used_hpu_mem_MB
 
 
 def change_to_cur_file_dir():
@@ -33,7 +33,7 @@ def calib_func(model):
    for i in range(2):
        model(example_inputs)
 
-@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+@pytest.mark.skipif(not is_hpu_available(), reason="HPU environment is required!")
 class TestFP8StaticQuantNLP:
    def setup_class(self):
        change_to_cur_file_dir()
@@ -88,7 +88,7 @@ def test_two_step_quant_nlp(self):
 
 
 @pytest.mark.xfail(reason="[SW-219514] RuntimeError: operator torchvision::nms does not exist")
-@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+@pytest.mark.skipif(not is_hpu_available(), reason="HPU environment is required!")
 class TestFP8StaticQuantCV:
    def setup_class(self):
        change_to_cur_file_dir()

test/3x/torch/quantization/fp8_quant/test_gptq_mixed_precision.py

+2 -2

@@ -10,7 +10,7 @@
 from transformers import AutoTokenizer
 from datasets import load_dataset
 from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare, HybridGPTQConfig
-from neural_compressor.torch.utils import is_hpex_available, get_accelerator
+from neural_compressor.torch.utils import is_hpu_available, get_accelerator
 from neural_compressor.torch.quantization.save_load_entry import load
 
 
@@ -42,7 +42,7 @@ def calib_func(model, dataset, tokenizer=None):
        cur_accelerator.synchronize()
        logits.append(logs.detach().cpu())
 
-@pytest.mark.skipif(not is_hpex_available(), reason="HPU environment is required!")
+@pytest.mark.skipif(not is_hpu_available(), reason="HPU environment is required!")
 class TestGPTQwithFP8Quant:
    def setup_class(self):
        change_to_cur_file_dir()

test/3x/torch/quantization/weight_only/test_gptq.py

+5 -5

@@ -14,7 +14,7 @@
    prepare,
    quantize,
 )
-from neural_compressor.torch.utils import accelerator, is_hpex_available
+from neural_compressor.torch.utils import accelerator, is_hpu_available
 
 device = accelerator.name()
 
@@ -184,7 +184,7 @@ def test_act_order(self):
        # compare atol, this case is an ideal case.
        assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check."
 
-
+    @pytest.mark.skipif(not is_hpu_available(), reason="These tests are not supported on HPU for now.")
    def test_block_wise(self):
        from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE
        from neural_compressor.torch import load_empty_model
@@ -225,7 +225,7 @@ def test_block_wise(self):
 
        kwargs = {'sharded_checkpoints': True}
 
-        loaded_model = load(LWQ_WORKSPACE+"/checkpoint/", copy.deepcopy(self.tiny_gptj), **kwargs)
+        loaded_model = load(LWQ_WORKSPACE+"/checkpoint/", copy.deepcopy(self.tiny_gptj), **kwargs).to(device)
 
        out = loaded_model(self.example_inputs)[0]
 
@@ -298,7 +298,7 @@ def test_true_sequential(self):
        ), "true_sequential=True doesn't help accuracy, maybe is reasonable, please double check."
 
    # TODO [SW-216127]: it's not in high priority, so we can implement it later.
-    @pytest.mark.skipif(is_hpex_available(), reason="These tests are not supported on HPU for now.")
+    @pytest.mark.skipif(is_hpu_available(), reason="These tests are not supported on HPU for now.")
    def test_quant_lm_head(self):
        # quant_lm_head=False
        model = copy.deepcopy(self.tiny_gptj)
@@ -367,7 +367,7 @@ def test_double_quant_params(self, dtype, double_quant_bits, double_quant_group_
        assert torch.allclose(atol_false, atol_true, atol=0.008), "atol is very close, double checked the logic."
 
    # TODO [SW-216127]: it's not in high priority, so we can implement it later.
-    @pytest.mark.skipif(is_hpex_available(), reason="These tests are not supported on HPU for now.")
+    @pytest.mark.skipif(is_hpu_available(), reason="These tests are not supported on HPU for now.")
    def test_conv1d(self):
        from transformers import GPT2Model, GPT2Tokenizer
test/3x/torch/quantization/weight_only/test_load.py

+3 -3

@@ -7,7 +7,7 @@
 import transformers
 
 from neural_compressor.torch.quantization import load
-from neural_compressor.torch.utils import SaveLoadFormat, accelerator, is_hpex_available
+from neural_compressor.torch.utils import SaveLoadFormat, accelerator, is_hpu_available
 
 device = accelerator.current_device_name()
 
@@ -55,7 +55,7 @@ def test_load_hf_woq_model_cpu(self):
        output = qmodel(self.example_inputs.to("cpu"))[0]
        assert len(output) > 0, "Not loading the model correctly"
 
-    @pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.")
+    @pytest.mark.skipif(not is_hpu_available(), reason="no hpex in environment here.")
    def test_load_hf_woq_model_hpu(self):
        # use huggingface model_id (format=huggingface, device="hpu")
        # first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save hpu_model.safetensors to local cache dir
@@ -88,7 +88,7 @@ def test_load_hf_woq_model_hpu(self):
            output1, output2
        ), "The model loaded the second time is different from the model loaded the first time"
 
-    @pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.")
+    @pytest.mark.skipif(not is_hpu_available(), reason="no hpex in environment here.")
    def test_load_hf_woq_model_hpu_special_case(self):
        # this model contains tensors sharing memory
        model = load(

test/3x/torch/quantization/weight_only/test_rtn.py

+2 -2

@@ -13,7 +13,7 @@
    prepare,
    quantize,
 )
-from neural_compressor.torch.utils import accelerator, is_hpex_available
+from neural_compressor.torch.utils import accelerator, is_hpu_available
 
 device = accelerator.name()
 
@@ -310,7 +310,7 @@ def test_rtn_with_quantize_API(self):
 
    # TODO: (4, True, 32, 0), group_dim=0, format not supported
    # TODO [SW-216127]: it's not in high priority, so we can implement it later.
-    @pytest.mark.skipif(is_hpex_available(), reason="These tests are not supported on HPU for now.")
+    @pytest.mark.skipif(is_hpu_available(), reason="These tests are not supported on HPU for now.")
    @pytest.mark.parametrize(
        "bits, use_sym, group_size, group_dim",
        [

test/3x/torch/requirements.txt

+1

@@ -9,3 +9,4 @@ psutil
 pytest
 torchvision
 transformers
+datasets
