Commit a626f6e

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent f69f38e commit a626f6e

14 files changed, 129 insertions(+), 89 deletions(-)

14 files changed

+129
-89
lines changed

neural_compressor/torch/algorithms/layer_wise/utils.py

+26 -27

@@ -20,17 +20,15 @@
 import json
 import os

-
-
 import torch
 from accelerate.utils import set_module_tensor_to_device
 from safetensors import safe_open
+from safetensors.torch import save_file

 from neural_compressor.common import options
 from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear
-from neural_compressor.torch.utils.utility import dowload_hf_model
 from neural_compressor.torch.utils import is_hpex_available
-from safetensors.torch import save_file
+from neural_compressor.torch.utils.utility import dowload_hf_model

 if is_hpex_available():
     import habana_frameworks
@@ -224,9 +222,9 @@ def load_value(model, param_name, path, device="cpu"):
     files = os.listdir(path)
     safetensors_files = [filename for filename in files if filename.endswith(".safetensors")]

-    if device == torch.device('hpu'):
-        device = 'hpu'
-
+    if device == torch.device("hpu"):
+        device = "hpu"
+
     if len(safetensors_files) == 1:
         value = load_tensor_from_safetensors(
             os.path.join(path, "model.safetensors"), param_name, prefix=prefix, device=device
@@ -255,17 +253,19 @@ def load_module(model, module_name, path, device="cpu"):
         value = load_value(model, param_name, path, device)
         set_module_tensor_to_device(model, param_name, device, value)

+
 def load_first_layer_only(user_model, model_name):
-    """load first layer only.
+    """Load first layer only.

     Args:
         user_model (torch.nn.Module): input model
         model_name (str): model name or path
     """
     for name, m in user_model.named_modules():
-        if ('layers' not in name or 'layers.0' in name) and len(name) > 0 and len(list(m.named_children())) == 0:
+        if ("layers" not in name or "layers.0" in name) and len(name) > 0 and len(list(m.named_children())) == 0:
             load_module(user_model, name, get_path(model_name), device="hpu" if is_hpex_available() else "cpu")

+
 def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None, indicated_layers=None):
     """Register weight hooks for model.

@@ -363,7 +363,7 @@ def clean_module_weight(module):
         if hpu_available:
             if param_cls == habana_frameworks.torch.core.weight_sharing.HabanaParameterWrapper:
                 try:
-                    kwargs.pop('change_device_placement')
+                    kwargs.pop("change_device_placement")
                 except KeyError:
                     pass

@@ -372,14 +372,13 @@ def clean_module_weight(module):
            submodule._parameters[n] = new_value
    # gc.collect()

+
 def save_layers_in_shards_iteratively(checkpoint_dir, output_dir, layers_per_shard=10):
-    """
-    Save model layers iteratively in shards, each shard containing a fixed number of layers using safetensors.
-    """
+    """Save model layers iteratively in shards, each shard containing a fixed number of layers using safetensors."""
     os.makedirs(output_dir, exist_ok=True)

     # Get list of checkpoint files in the checkpoint_dir
-    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.endswith('.pt')]
+    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")]
     checkpoint_files.sort()

     bin_index = {}
@@ -390,9 +389,9 @@ def save_layers_in_shards_iteratively(checkpoint_dir, output_dir, layers_per_sha
     for checkpoint_file in checkpoint_files:
         layer_path = os.path.join(checkpoint_dir, checkpoint_file)
         print(f"Loading layer from {layer_path}")
-
+
         # Load the layer checkpoint
-        checkpoint = torch.load(layer_path, map_location='cpu')
+        checkpoint = torch.load(layer_path, map_location="cpu")
         layer_state_dict = checkpoint

         # Add the layer's state dict to the buffer
@@ -406,7 +405,7 @@ def save_layers_in_shards_iteratively(checkpoint_dir, output_dir, layers_per_sha
             # Update the bin index for each layer
             for layer_name in layer_dict.keys():
                 bin_index[layer_name] = shard_idx
-
+
             # Save the shard to disk using safetensors
             shard_filename = f"model_shard-{str(shard_idx + 1).zfill(5)}-of-{str((len(checkpoint_files) // layers_per_shard) + 1).zfill(5)}.safetensors"
             shard_path = os.path.join(output_dir, shard_filename)
@@ -425,48 +424,48 @@ def save_layers_in_shards_iteratively(checkpoint_dir, output_dir, layers_per_sha
         # Update the bin index for each layer
         for layer_name in layer_dict.keys():
             bin_index[layer_name] = shard_idx
-
+
         # Save the final shard
         shard_filename = f"model_shard-{str(shard_idx + 1).zfill(5)}-of-{str((len(checkpoint_files) // layers_per_shard) + 1).zfill(5)}.safetensors"
         shard_path = os.path.join(output_dir, shard_filename)
         save_file(shard_state_dict, shard_path)  # Save using safetensors
         print(f"Saved final shard {shard_idx + 1} of {len(checkpoint_files) // layers_per_shard + 1} at {shard_path}")

     # Save bin index to a JSON file
-    bin_index_file = os.path.join(output_dir, 'model_bin_index.json')
-    with open(bin_index_file, 'w') as f:
+    bin_index_file = os.path.join(output_dir, "model_bin_index.json")
+    with open(bin_index_file, "w") as f:
         json.dump(bin_index, f, indent=4)

     print(f"Model bin index saved to {bin_index_file}")

+
 from safetensors.torch import load_file  # Safetensors load function


 def load_model_from_shards_with_safetensors(shard_dir, bin_index_file):
-    """
-    Load the model from its shards and the bin index using safetensors.
-
+    """Load the model from its shards and the bin index using safetensors.
+
     Args:
         shard_dir (str): Directory containing the model shard files.
         bin_index_file (str): Path to the bin index JSON file.
-
+
     Returns:
         torch.nn.Module: The reconstructed model with the layers.
     """
     # Load bin index to get the layer -> shard mapping
-    with open(bin_index_file, 'r') as f:
+    with open(bin_index_file, "r") as f:
         bin_index = json.load(f)

     full_state_dict = {}

     # Sort and load the shard files
-    shard_files = [f for f in os.listdir(shard_dir) if f.endswith('.safetensors')]
+    shard_files = [f for f in os.listdir(shard_dir) if f.endswith(".safetensors")]
     shard_files.sort()

     for shard_file in shard_files:
         shard_path = os.path.join(shard_dir, shard_file)
         print(f"Loading shard from {shard_path}")
         shard_state_dict = load_file(shard_path, device="hpu" if is_hpex_available() else "cpu")
         full_state_dict.update(shard_state_dict)
-
+
     return full_state_dict

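The two helpers touched above form a save/load pair for layer-wise checkpoints. A minimal usage sketch, assuming the functions are imported straight from this module and that ./lwq_workspace holds per-layer .pt files; the paths and layer count are illustrative, not part of this commit:

import os

from neural_compressor.torch.algorithms.layer_wise.utils import (
    load_model_from_shards_with_safetensors,
    save_layers_in_shards_iteratively,
)

checkpoint_dir = "./lwq_workspace"  # hypothetical directory of per-layer .pt checkpoints
shard_dir = "./model_shards"        # destination for the .safetensors shards

# Pack every 10 per-layer checkpoints into one safetensors shard and write
# model_bin_index.json, which maps each parameter name to its shard index.
save_layers_in_shards_iteratively(checkpoint_dir, shard_dir, layers_per_shard=10)

# Later, merge all shards back into a single flat state dict (loaded on HPU if available, else CPU).
state_dict = load_model_from_shards_with_safetensors(
    shard_dir, os.path.join(shard_dir, "model_bin_index.json")
)
print(f"Reloaded {len(state_dict)} tensors")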
neural_compressor/torch/algorithms/mixed_low_precision/modules.py

+13 -9

@@ -12,28 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import functools
 import math
 from abc import abstractmethod
-import functools
+
 import numpy as np
 import torch
 from torch.autograd import Function
 from torch.nn import functional as F

+from neural_compressor.torch.utils import accelerator, logger, set_module
+
 from ..weight_only.modules import HPUWeightOnlyLinear
-from neural_compressor.torch.utils import accelerator, logger
-from neural_compressor.torch.utils import logger, set_module


 class HPUMixedPrecisionLinear(HPUWeightOnlyLinear):
     """Weight and Activations quant (W4A8 gptq) Linear for HPU device."""

     def __init__(
-        self, in_features, out_features,
+        self,
+        in_features,
+        out_features,
         **kwargs,
     ):
-        """Init the HPUMixedPrecisionLinear object.
-        """
+        """Init the HPUMixedPrecisionLinear object."""
         super(HPUMixedPrecisionLinear, self).__init__(in_features, out_features)

     def forward(self, input):
@@ -43,7 +45,9 @@ def forward(self, input):
         scales = self.scales
         qweight = self.qweight
         zeros = self.qzeros
-        weight = torch.ops.hpu.convert_from_uint4(qweight, scales/self.matmul_internal.scale_other, zeros, torch.float8_e4m3fn) # todo: div scales in init
+        weight = torch.ops.hpu.convert_from_uint4(
+            qweight, scales / self.matmul_internal.scale_other, zeros, torch.float8_e4m3fn
+        )  # todo: div scales in init
         output = self.matmul_internal(input, weight)
         output = output.to(dtype=input_dtype).reshape(
             output_shape
@@ -77,13 +81,13 @@ def convert_from_weight_only(obj):
         new_self = HPUMixedPrecisionLinear(obj.in_features, obj.out_features)
         for attr, value in vars(obj).items():
             setattr(new_self, attr, value)
-        new_self.matmul_internal.no_input_quant = True # flag for 8bit input, which shouldn't be quantized in matmul
+        new_self.matmul_internal.no_input_quant = True  # flag for 8bit input, which shouldn't be quantized in matmul
         return new_self

     def post_process_for_inference(self):
         """Post process for inference."""
+        from neural_compressor.torch.algorithms.fp8_quant._core.quant_dequant import QuantDequantNone, QuantInput
        from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import PatchedMatmul
-        from neural_compressor.torch.algorithms.fp8_quant._core.quant_dequant import QuantInput, QuantDequantNone

         self = self.to("hpu")
         module = self

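The forward pass above folds the matmul's other-operand scale into the weight dequantization by passing scales / self.matmul_internal.scale_other to convert_from_uint4 (hence the "div scales in init" todo). A plain-torch sketch of that scale folding, with illustrative values; the real code uses torch.ops.hpu.convert_from_uint4 and runs only on HPU:

import torch

q = torch.randint(0, 16, (4, 8)).float()  # stand-in for unpacked uint4 codes
zero = 8.0                                # stand-in zero point
scale = torch.rand(4, 1) * 0.1            # per-row weight scales
scale_other = 0.5                         # matmul's other-operand scale (illustrative)

# Dequantize, then rescale for the fp8 matmul ...
w_two_step = (q - zero) * scale / scale_other
# ... is equivalent to dequantizing with a pre-divided scale, which is what
# passing `scales / self.matmul_internal.scale_other` achieves in one shot.
w_folded = (q - zero) * (scale / scale_other)

assert torch.allclose(w_two_step, w_folded)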
neural_compressor/torch/algorithms/mixed_low_precision/quantizer.py

+6 -3

@@ -16,6 +16,7 @@
 from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear
 from neural_compressor.torch.algorithms.weight_only.modules import HPUWeightOnlyLinear

+
 class HybridGPTQQuantizer(Quantizer):
     def __init__(self, quant_config):
         super().__init__(quant_config)
@@ -26,11 +27,12 @@ def __init__(self, quant_config):

     def prepare(self, model):
         return model
-
+
     def convert(self, model):
         _convert(model)
         return model

+
 def set_module(model, op_name, new_module):
     """Set module with a given op name.

@@ -51,11 +53,12 @@ def set_module(model, op_name, new_module):
            module = module
    setattr(module, name_list[-1], new_module)

+
 def _convert(model):
     for name, module in model.named_modules():
-        # replace `HPUWeightOnlyLinear`s forward func
+        # replace `HPUWeightOnlyLinear`s forward func
         if isinstance(module, HPUWeightOnlyLinear):
             module = HPUMixedPrecisionLinear.convert_from_weight_only(module)
             set_module(model, name, module)

-    return model
+    return model

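_convert() above walks model.named_modules() and swaps every HPUWeightOnlyLinear for an HPUMixedPrecisionLinear through set_module. A simplified sketch of that dotted-name replacement on a toy model; the helper below restates the idea and is not this file's exact implementation:

import torch


def set_module(model, op_name, new_module):
    # Walk the dotted name down to the parent module, then swap the leaf in place.
    name_list = op_name.split(".")
    parent = model
    for name in name_list[:-1]:
        parent = getattr(parent, name)
    setattr(parent, name_list[-1], new_module)


class Block(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(4, 4)


class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList([Block(), Block()])


model = Toy()
# Replace one leaf module by name, the same way _convert() swaps
# HPUWeightOnlyLinear instances for HPUMixedPrecisionLinear.
set_module(model, "layers.0.proj", torch.nn.Linear(4, 8))
print(model.layers[0].proj)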
neural_compressor/torch/algorithms/weight_only/autoround.py

+3 -3

@@ -16,7 +16,7 @@
 import json
 import time
 from functools import lru_cache
-from typing import Union, Optional
+from typing import Optional, Union

 import torch

@@ -206,10 +206,10 @@ def __init__(
         self.template = template
         self.truncation = truncation
         self.enable_w4afp8 = self._is_w4afp8()
-
+
     def _is_w4afp8(self):
         return any([v.get("data_type", None) == "fp8_to_int_sym" for v in self.quant_config.values()])
-
+
     def prepare(self, model: torch.nn.Module, *args, **kwargs):
         """Prepares a given model for quantization.


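The _is_w4afp8() helper touched above simply scans the quantization config for the "fp8_to_int_sym" data type to decide whether the W4A8 path is enabled. A small illustration of the same check with a made-up config dict; the layer names and other fields are hypothetical:

quant_config = {
    "model.layers.0.self_attn.q_proj": {"bits": 4, "data_type": "fp8_to_int_sym"},
    "model.layers.0.mlp.down_proj": {"bits": 4, "data_type": "int_sym"},
}

# Mirrors the expression in _is_w4afp8(): one matching layer enables W4A8.
enable_w4afp8 = any(v.get("data_type", None) == "fp8_to_int_sym" for v in quant_config.values())
print(enable_w4afp8)  # True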
neural_compressor/torch/algorithms/weight_only/gptq.py

+18 -14

@@ -26,16 +26,16 @@
 import torch.nn as nn
 from tqdm import tqdm

+from neural_compressor.torch.algorithms.layer_wise import get_path, load_value, set_module_tensor_to_device
 from neural_compressor.torch.utils import (
     get_accelerator,
     get_model_device,
+    get_used_cpu_mem_MB,
     is_transformers_imported,
     logger,
     set_module,
 )
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
-from neural_compressor.torch.algorithms.layer_wise import load_value, set_module_tensor_to_device, get_path
-from neural_compressor.torch.utils import get_used_cpu_mem_MB


 from .modules import INCWeightOnlyLinear
@@ -127,8 +127,9 @@ def trace_gptq_target_blocks(module, module_types=[torch.nn.ModuleList, torch.nn
             continue
     return gptq_related_blocks

+
 def find_all_layers(module, name=""):
-    """Get all layers"""
+    """Get all layers."""
     if len(list(module.named_children())) == 0:
         return {name: module}
     res = {}
@@ -574,7 +575,7 @@ def execute_quantization(self, means=None, stds=None):
         true_sequential_map = self.analyze_true_sequential(self.gptq_related_blocks["transformers"][0])
         logger.info(f"Sequential Name: {true_sequential_map}")
         tblock_length = len(self.gptq_related_blocks["transformers"])
-        for param in self.model.parameters():
+        for param in self.model.parameters():
             param.requires_grad = False

         cpu_mem_0 = get_used_cpu_mem_MB()
@@ -583,8 +584,8 @@ def execute_quantization(self, means=None, stds=None):
            start_iter = time.time()
            logger.debug(f"Memory usage increase CPU: {get_used_cpu_mem_MB() - cpu_mem_0}")
            logger.info(f"Quantizing layer {block_idx + 1} / {tblock_length}..")
-            transformer_block = self.gptq_related_blocks["transformers"][block_idx]
-
+            transformer_block = self.gptq_related_blocks["transformers"][block_idx]
+
            # Step2.1: obtain all layers (Linear, Conv2d, etc) in the block which can be quantized.
            # device = 'cpu'

@@ -600,12 +601,12 @@ def find_all_layer_names(module, name=""):
            # block weights are meta tensors, load them from disk
            if self.use_block_wise:
                for n in find_all_layer_names(transformer_block):
-                    param_name = f"model.layers.{block_idx}." + n + '.weight'
+                    param_name = f"model.layers.{block_idx}." + n + ".weight"
                    try:
-                        value = load_value(self.model, param_name, self.model_path, 'cpu')
-                        set_module_tensor_to_device(transformer_block.get_submodule(n), 'weight', 'cpu', value)
+                        value = load_value(self.model, param_name, self.model_path, "cpu")
+                        set_module_tensor_to_device(transformer_block.get_submodule(n), "weight", "cpu", value)
                    except:
-                        pass # only load w
+                        pass  # only load w

            if not self.use_layer_wise:  # pragma: no cover
                # if we do not apply layer-wise feature, we still place the entire block on the GPU
@@ -647,7 +648,7 @@ def find_all_layer_names(module, name=""):
                full_layer_name = self.get_full_layer_name(layer_name, block_idx)
                weight_config_this_layer = self.get_layer_config(full_layer_name)
                if self.use_layer_wise:  # pragma: no cover
-                    W = load_value(self.model, full_layer_name + ".weight", self.model_path, self.device)
+                    W = load_value(self.model, full_layer_name + ".weight", self.model_path, self.device)
                else:
                    if "hpu" in str(self.device):  # pragma: no cover
                        # [SW-206677] memory is not release when module is moved out of HPU
@@ -827,10 +828,13 @@ def tmp(_, inp, out):
                    LWQ_WORKSPACE,
                    clean_module_weight,
                )
+
                block = self.gptq_related_blocks["transformers"][block_idx]
                full_block_name = self.gptq_related_blocks["transformers_name"] + "." + str(block_idx)

-                modified_state_dict = {f"{full_block_name}.{key}": value for key, value in block.state_dict().items()}
+                modified_state_dict = {
+                    f"{full_block_name}.{key}": value for key, value in block.state_dict().items()
+                }
                torch.save(modified_state_dict, LWQ_WORKSPACE + f"/{full_block_name}.pt")
                logger.info(f"Saving block to {LWQ_WORKSPACE + f'/{full_block_name}.pt'}")
                for n, l in find_all_layers(transformer_block).items():
@@ -843,10 +847,10 @@ def tmp(_, inp, out):

                for key, value in state_dict.items():
                    # Filter out tensors that are on the 'meta' device
-                    if value.device.type != 'meta':
+                    if value.device.type != "meta":
                        aux_state_dict[key] = value

-                torch.save(aux_state_dict, LWQ_WORKSPACE + f"/auxilaries.pt")
+                torch.save(aux_state_dict, LWQ_WORKSPACE + "/auxiliaries.pt")

                del gptq_for_this_block
                accelerator.synchronize()

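In the block-wise path above, a transformer block's weights start as meta tensors and are materialized from the on-disk checkpoint with load_value and set_module_tensor_to_device before quantization, then the quantized block is saved back to LWQ_WORKSPACE. A rough sketch of that per-parameter load, using the same imports the diff adds; the helper name and the surrounding loop are illustrative, not this file's exact code:

from neural_compressor.torch.algorithms.layer_wise import load_value, set_module_tensor_to_device


def materialize_block_weights(model, transformer_block, block_idx, model_path):
    # For every leaf submodule that has a weight, pull the tensor from the
    # checkpoint and attach it on CPU, mirroring the `if self.use_block_wise:` loop.
    for name, sub in transformer_block.named_modules():
        if name == "" or len(list(sub.named_children())) > 0 or not hasattr(sub, "weight"):
            continue
        param_name = f"model.layers.{block_idx}.{name}.weight"
        try:
            value = load_value(model, param_name, model_path, "cpu")
            set_module_tensor_to_device(sub, "weight", "cpu", value)
        except Exception:
            pass  # only weights are loaded here; other parameters stay on meta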