From fb60ef34d57bb6dc2a7e8296c1bdb7fc9b7adbab Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 31 Jan 2025 20:35:17 +0800 Subject: [PATCH] format (#1189) * format * ignore format * format ignore E402 * install uv * remove uv --- examples/benchmark/generation_speed.py | 6 ++-- examples/benchmark/ipex.py | 2 -- examples/benchmark/perplexity.py | 4 +-- .../evaluation/run_language_modeling_task.py | 4 +-- .../run_sequence_classification_task.py | 4 +-- .../evaluation/run_text_summarization_task.py | 4 +-- examples/inference/run_transformers.py | 1 - .../inference/run_with_different_backends.py | 4 +-- examples/quantization/basic_usage.py | 4 +-- .../quantization/basic_usage_autoround.py | 4 +-- .../quantization/basic_usage_wikitext2.py | 4 +-- examples/quantization/transformers_usage.py | 1 - format/ruff.toml | 2 +- gptqmodel/models/_const.py | 1 - gptqmodel/models/auto.py | 7 ++--- gptqmodel/models/base.py | 30 ++++--------------- gptqmodel/models/definitions/gemma2.py | 1 - gptqmodel/models/loader.py | 26 ++++------------ gptqmodel/models/writer.py | 30 ++++--------------- gptqmodel/nn_modules/qlinear/__init__.py | 2 +- gptqmodel/nn_modules/qlinear/bitblas.py | 4 +-- .../qlinear/bitblas_target_detector.py | 1 - gptqmodel/nn_modules/qlinear/dynamic_cuda.py | 1 - gptqmodel/nn_modules/qlinear/exllama.py | 4 +-- gptqmodel/nn_modules/qlinear/exllamav2.py | 2 -- gptqmodel/nn_modules/qlinear/ipex.py | 2 -- gptqmodel/nn_modules/qlinear/marlin.py | 4 +-- gptqmodel/nn_modules/qlinear/torch.py | 4 +-- gptqmodel/nn_modules/qlinear/utils.py | 2 +- .../triton_utils/custom_autotune.py | 1 - gptqmodel/nn_modules/triton_utils/kernels.py | 1 - gptqmodel/quantization/__init__.py | 13 ++------ gptqmodel/quantization/config.py | 5 ++-- gptqmodel/quantization/gptq.py | 1 - gptqmodel/quantization/quantizer.py | 1 - gptqmodel/utils/bitblas.py | 4 +-- gptqmodel/utils/device.py | 1 - gptqmodel/utils/importer.py | 3 +- gptqmodel/utils/logger.py | 1 - gptqmodel/utils/marlin.py | 3 +- gptqmodel/utils/mlx.py | 1 - gptqmodel/utils/model.py | 12 ++------ gptqmodel/utils/openai_server.py | 1 - gptqmodel/utils/perplexity.py | 1 - gptqmodel/utils/rocm.py | 1 - gptqmodel/utils/safetensor.py | 8 ++--- gptqmodel/utils/sglang.py | 1 - gptqmodel/utils/torch.py | 1 - gptqmodel/utils/vllm.py | 1 - setup.py | 2 -- tests/benchmark/benchmark.py | 3 +- tests/benchmark/benchmark_test.py | 4 +-- tests/inference_speed.py | 3 +- tests/models/model_test.py | 10 ++----- tests/models/test_gptbigcode.py | 1 - tests/models/test_opt.py | 3 +- tests/models/test_qwen2_vl.py | 3 +- tests/tasks/mmlu/_generate_configs.py | 1 - tests/test_asym_gptq_v1.py | 4 +-- tests/test_bits.py | 7 ++--- tests/test_dynamic.py | 11 +++---- tests/test_estimate_vram.py | 1 - tests/test_eval.py | 6 ++-- tests/test_evalplus.py | 1 - tests/test_flash_attention.py | 4 +-- tests/test_group_size.py | 7 ++--- tests/test_inference_speed.py | 11 +++---- tests/test_inference_speed_ipex.py | 6 ++-- tests/test_ipex_xpu.py | 4 +-- tests/test_lm_eval.py | 5 +--- tests/test_lm_head.py | 6 ++-- tests/test_mlx.py | 4 +-- tests/test_mlx_generate.py | 5 +--- tests/test_openai_server.py | 2 -- tests/test_packing.py | 3 -- tests/test_packing_speed.py | 3 -- tests/test_parameter_count.py | 10 +++---- tests/test_perplexity.py | 6 ++-- tests/test_q4_bitblas.py | 4 +-- tests/test_q4_cuda.py | 4 +-- tests/test_q4_exllama_v1.py | 9 ++---- tests/test_q4_exllama_v2.py | 7 ++--- tests/test_q4_ipex.py | 4 +-- tests/test_q4_marlin.py | 6 ++-- tests/test_q4_torch.py | 4 +-- 
tests/test_q4_torch_apple.py | 3 +- tests/test_q4_triton.py | 6 ++-- tests/test_quant_batch.py | 5 +--- tests/test_quant_formats.py | 15 +++------- tests/test_quant_formats_auto_round.py | 14 +++------ tests/test_quant_time.py | 3 -- tests/test_quant_trust_remote.py | 9 ++---- tests/test_save_loaded_quantized_model.py | 5 +--- tests/test_serialization.py | 1 - tests/test_sglang.py | 4 +-- tests/test_sharded.py | 4 +-- tests/test_tgi.py | 1 - tests/test_transformers_integration.py | 4 +-- tests/test_triton.py | 5 +--- tests/test_triton_xpu.py | 4 +-- tests/test_verify_hash.py | 1 - tests/test_vllm.py | 7 ++--- 102 files changed, 125 insertions(+), 366 deletions(-) diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py index 1018d641c..643acd861 100644 --- a/examples/benchmark/generation_speed.py +++ b/examples/benchmark/generation_speed.py @@ -22,12 +22,10 @@ import torch from datasets import Dataset, load_dataset -from transformers import AutoTokenizer, GenerationConfig -from transformers.generation.logits_process import LogitsProcessor - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.utils.progress import ProgressBar - +from transformers import AutoTokenizer, GenerationConfig +from transformers.generation.logits_process import LogitsProcessor logger = logging.getLogger(__name__) diff --git a/examples/benchmark/ipex.py b/examples/benchmark/ipex.py index b221d9bcf..50541ee55 100644 --- a/examples/benchmark/ipex.py +++ b/examples/benchmark/ipex.py @@ -19,7 +19,6 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - try: from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf bind_cores_for_best_perf() @@ -29,7 +28,6 @@ import argparse - parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.") parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.") parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.") diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py index cf3e4e3bb..6326d9b35 100644 --- a/examples/benchmark/perplexity.py +++ b/examples/benchmark/perplexity.py @@ -17,10 +17,8 @@ import os import torch -from transformers import AutoTokenizer - from gptqmodel.utils import Perplexity - +from transformers import AutoTokenizer os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/examples/evaluation/run_language_modeling_task.py b/examples/evaluation/run_language_modeling_task.py index 59b51405b..5e9e98df4 100644 --- a/examples/evaluation/run_language_modeling_task.py +++ b/examples/evaluation/run_language_modeling_task.py @@ -17,12 +17,10 @@ import datasets import torch -from transformers import AutoTokenizer - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import LanguageModelingTask from gptqmodel.utils.torch import torch_empty_cache - +from transformers import AutoTokenizer DATASET = "tatsu-lab/alpaca" WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n" diff --git a/examples/evaluation/run_sequence_classification_task.py b/examples/evaluation/run_sequence_classification_task.py index 94e9a43b8..7a7658048 100644 --- a/examples/evaluation/run_sequence_classification_task.py +++ b/examples/evaluation/run_sequence_classification_task.py @@ -18,12 +18,10 @@ import datasets import torch -from transformers import AutoTokenizer - from gptqmodel import BACKEND, 
GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import SequenceClassificationTask from gptqmodel.utils.torch import torch_empty_cache - +from transformers import AutoTokenizer DATASET = "cardiffnlp/tweet_sentiment_multilingual" TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:" diff --git a/examples/evaluation/run_text_summarization_task.py b/examples/evaluation/run_text_summarization_task.py index 2c3b16808..31c745acb 100644 --- a/examples/evaluation/run_text_summarization_task.py +++ b/examples/evaluation/run_text_summarization_task.py @@ -18,12 +18,10 @@ import datasets import torch -from transformers import AutoTokenizer, GenerationConfig - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import TextSummarizationTask from gptqmodel.utils.torch import torch_empty_cache - +from transformers import AutoTokenizer, GenerationConfig os.system("pip install py7zr") diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py index b53e8487c..23a511ba3 100644 --- a/examples/inference/run_transformers.py +++ b/examples/inference/run_transformers.py @@ -15,7 +15,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0])) diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py index e0837b2ee..ef993f949 100644 --- a/examples/inference/run_with_different_backends.py +++ b/examples/inference/run_with_different_backends.py @@ -18,10 +18,8 @@ import sys from argparse import ArgumentParser -from transformers import AutoTokenizer - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device - +from transformers import AutoTokenizer os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py index ab1dd2f45..129f33a83 100644 --- a/examples/quantization/basic_usage.py +++ b/examples/quantization/basic_usage.py @@ -15,10 +15,8 @@ import os -from transformers import AutoTokenizer - from gptqmodel import GPTQModel, QuantizeConfig, get_best_device - +from transformers import AutoTokenizer os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/examples/quantization/basic_usage_autoround.py b/examples/quantization/basic_usage_autoround.py index c3420fae6..1b071641f 100644 --- a/examples/quantization/basic_usage_autoround.py +++ b/examples/quantization/basic_usage_autoround.py @@ -14,11 +14,9 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer - from gptqmodel import GPTQModel from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402 - +from transformers import AutoTokenizer pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage_wikitext2.py b/examples/quantization/basic_usage_wikitext2.py index ce8e88d7d..43237e145 100644 --- a/examples/quantization/basic_usage_wikitext2.py +++ b/examples/quantization/basic_usage_wikitext2.py @@ -15,10 +15,8 @@ import torch from datasets import load_dataset -from transformers import AutoTokenizer - from gptqmodel import GPTQModel, QuantizeConfig - +from transformers import AutoTokenizer pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py index a984a9a8d..98ad1855f 100755 --- a/examples/quantization/transformers_usage.py +++ b/examples/quantization/transformers_usage.py @@ -15,7 +15,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig - model_id = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_id) dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] diff --git a/format/ruff.toml b/format/ruff.toml index 4f769ee6b..e2cb2afb9 100644 --- a/format/ruff.toml +++ b/format/ruff.toml @@ -1,5 +1,5 @@ # Never enforce `E501` (line length violations). -lint.ignore = ["C901", "E501", "E741", "W605"] +lint.ignore = ["C901", "E501", "E741", "W605", "E402"] lint.select = ["C", "E", "F", "I", "W"] line-length = 119 diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py index 17448c192..67874d040 100644 --- a/gptqmodel/models/_const.py +++ b/gptqmodel/models/_const.py @@ -24,7 +24,6 @@ from ..utils.rocm import IS_ROCM from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU - CPU = device("cpu") CUDA = device("cuda") CUDA_0 = device("cuda:0") diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 3fc68e32c..882a0a66f 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -17,7 +17,6 @@ import os - if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True' print("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.") @@ -28,7 +27,6 @@ import sys # noqa: E402 - # TODO: waiting for pytorch implementgation of aten ops for MPS if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" @@ -96,7 +94,6 @@ from .definitions.xverse import XverseGPTQ # noqa: E402 from .definitions.yi import YiGPTQ # noqa: E402 - logger = setup_logger() MODEL_MAP = { @@ -300,11 +297,10 @@ def eval( if task not in EVAL.get_task_enums(): raise ValueError(f"lm_eval support tasks: {EVAL.get_all_tasks_string()}") + from gptqmodel.utils.eval import lm_eval from lm_eval.utils import make_table from transformers import AutoTokenizer - from gptqmodel.utils.eval import lm_eval - tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code) model_name = 'hf' if backend == 'gptqmodel' else backend @@ -372,6 +368,7 @@ def export(model_id_or_path: str, target_path: str, format: str, trust_remote_co if 
format == "mlx": try: from mlx_lm.utils import save_config, save_weights + from ..utils.mlx import convert_gptq_to_mlx_weights except ImportError: raise ValueError("MLX not installed. Please install via `pip install gptqmodel[mlx] --no-build-isolation`.") diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 7c9fe9d73..9f1fe1406 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -36,33 +36,15 @@ from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory from ..utils.importer import select_quant_linear from ..utils.logger import setup_logger -from ..utils.model import ( - MODALITY, - check_to_quantized, - find_layers, - get_device, - get_module, - get_module_by_name_prefix, - get_moe_layer_modules, - move_to, - nested_move_to, - normalize_tokenizer, - pack_model, -) +from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device, + get_module, get_module_by_name_prefix, get_moe_layer_modules, + move_to, nested_move_to, normalize_tokenizer, pack_model) from ..utils.progress import ProgressBar -from ..utils.safetensor import untie_weights from ..utils.torch import torch_empty_cache from ._const import CPU, DEVICE, SUPPORTS_MODULE_TYPES from .loader import ModelLoader -from .writer import ( - QUANT_LOG_DAMP, - QUANT_LOG_FWD_TIME, - QUANT_LOG_LAYER, - QUANT_LOG_LOSS, - QUANT_LOG_MODULE, - QUANT_LOG_TIME, - ModelWriter, -) +from .writer import (QUANT_LOG_DAMP, QUANT_LOG_FWD_TIME, QUANT_LOG_LAYER, + QUANT_LOG_LOSS, QUANT_LOG_MODULE, QUANT_LOG_TIME, ModelWriter) # pytorch 2.6.0 fixes many compilation errors PYTORCH_MIN_VERFSION_WITH_COMPILE = Version("2.6.0") @@ -931,7 +913,7 @@ def compile(self, backend="inductor", mode="reduce-overhead"): try: self.model = torch.compile(self.model, fullgraph=True, backend=backend, mode=mode) - except Exception as e: + except Exception: logger.info("Compiling model again with `fullgraph=False`") self.model = torch.compile(self.model, fullgraph=False, backend=backend, mode=mode) return self diff --git a/gptqmodel/models/definitions/gemma2.py b/gptqmodel/models/definitions/gemma2.py index d7be41c58..08ec1aadc 100644 --- a/gptqmodel/models/definitions/gemma2.py +++ b/gptqmodel/models/definitions/gemma2.py @@ -17,7 +17,6 @@ from ...utils.logger import setup_logger from ..base import BaseGPTQModel - logger = setup_logger() SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ." 
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 68581778e..ec72cb62b 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -36,28 +36,14 @@ from ..utils.backend import BACKEND from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear from ..utils.logger import setup_logger -from ..utils.marlin import ( - _validate_marlin_compatibility, - _validate_marlin_device_support, - prepare_model_for_marlin_load, -) -from ..utils.model import ( - auto_dtype, - convert_gptq_v1_to_v2_format, - find_layers, - get_checkpoints, - get_moe_layer_modules, - gptqmodel_post_init, - load_checkpoint_in_model_then_tie_weights, - make_quant, - normalize_tokenizer, - simple_dispatch_model, - verify_model_hash, - verify_sharded_model_hashes, -) +from ..utils.marlin import (_validate_marlin_compatibility, + _validate_marlin_device_support, prepare_model_for_marlin_load) +from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_layers, get_checkpoints, + get_moe_layer_modules, gptqmodel_post_init, + load_checkpoint_in_model_then_tie_weights, make_quant, normalize_tokenizer, + simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes) from ._const import DEVICE, SUPPORTED_MODELS, normalize_device - logger = setup_logger() ATTN_IMPLEMENTATION = "attn_implementation" diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index a56e51fdf..385709509 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -33,36 +33,18 @@ from transformers.models.auto.tokenization_auto import get_tokenizer_config from transformers.utils.generic import ContextManagers -from ..quantization.config import ( - FORMAT, - META_FIELD_DAMP_AUTO_INCREMENT, - META_FIELD_DAMP_PERCENT, - META_FIELD_MSE, - META_FIELD_QUANTIZER, - META_FIELD_STATIC_GROUPS, - META_FIELD_TRUE_SEQUENTIAL, - META_FIELD_URI, - META_QUANTIZER_GPTQMODEL, - META_VALUE_URI, - MIN_VERSION_WITH_V2, -) +from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE, + META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL, + META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2) from ..utils.backend import BACKEND from ..utils.logger import setup_logger -from ..utils.model import ( - convert_gptq_v2_to_v1_format, - copy_py_files, - find_layers, - get_model_files_size, - get_moe_layer_modules, - get_state_dict_for_save, - load_checkpoint_in_model_then_tie_weights, - make_quant, -) +from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_layers, + get_model_files_size, get_moe_layer_modules, get_state_dict_for_save, + load_checkpoint_in_model_then_tie_weights, make_quant) from ..utils.torch import torch_empty_cache from ..version import __version__ from ._const import CPU - logger = setup_logger() QUANT_LOG_LAYER = "layer" diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 248fd5a9d..5b072f179 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -17,7 +17,7 @@ from typing import List, Optional, Tuple import numpy as np -import torch as t # conflict with torch.py +import torch as t # conflict with torch.py import torch.nn as nn import transformers diff --git a/gptqmodel/nn_modules/qlinear/bitblas.py b/gptqmodel/nn_modules/qlinear/bitblas.py index 024709aca..02fca774f 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas.py 
+++ b/gptqmodel/nn_modules/qlinear/bitblas.py @@ -22,13 +22,11 @@ import numpy as np import torch import torch.nn as nn - from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger - logger = setup_logger() BITBLAS_TARGET = None @@ -389,4 +387,4 @@ def forward(self, A): return C -__all__ = ["BitBLASQuantLinear"] \ No newline at end of file +__all__ = ["BitBLASQuantLinear"] diff --git a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py index ecd65915f..89c2b3d94 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py +++ b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py @@ -22,7 +22,6 @@ from ...utils.logger import setup_logger - logger = setup_logger() TARGET_MISSING_ERROR = ( diff --git a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py index 8a323f1ef..04950deb5 100644 --- a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py +++ b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py @@ -16,7 +16,6 @@ from typing import Optional, Tuple import torch - from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear from gptqmodel.utils.logger import setup_logger diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py index f17beef9d..de171b44a 100644 --- a/gptqmodel/nn_modules/qlinear/exllama.py +++ b/gptqmodel/nn_modules/qlinear/exllama.py @@ -21,12 +21,10 @@ import torch import torch.nn.functional as F - -from gptqmodel.nn_modules.qlinear import BaseQuantLinear, PackableQuantLinear +from gptqmodel.nn_modules.qlinear import PackableQuantLinear from ...models._const import DEVICE, PLATFORM - exllama_import_exception = None try: from gptqmodel_exllama_kernels import make_q4, q4_matmul diff --git a/gptqmodel/nn_modules/qlinear/exllamav2.py b/gptqmodel/nn_modules/qlinear/exllamav2.py index 18e42fd5e..e73b1ec30 100644 --- a/gptqmodel/nn_modules/qlinear/exllamav2.py +++ b/gptqmodel/nn_modules/qlinear/exllamav2.py @@ -20,13 +20,11 @@ import torch import torch.nn.functional as F - from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger - exllama_v2_import_exception = None try: from gptqmodel_exllamav2_kernels import gemm_half_q_half, make_q_matrix diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 206f908fc..a6aacd306 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -20,14 +20,12 @@ import torch import torch.nn as nn import transformers - from gptqmodel.models._const import DEVICE, PLATFORM from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...utils.logger import setup_logger from ...utils.torch import HAS_XPU - logger = setup_logger() BITS_DTYPE_MAPPING = { diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py index bd08569a9..61f617d27 100644 --- a/gptqmodel/nn_modules/qlinear/marlin.py +++ b/gptqmodel/nn_modules/qlinear/marlin.py @@ -20,14 +20,12 @@ import numpy as np import torch -from torch.nn.parameter import Parameter - from gptqmodel.nn_modules.qlinear import BaseQuantLinear +from torch.nn.parameter import Parameter from ...models._const import DEVICE, PLATFORM from ...utils.rocm import IS_ROCM - marlin_import_exception = None try: import gptqmodel_marlin_kernels diff --git a/gptqmodel/nn_modules/qlinear/torch.py 
b/gptqmodel/nn_modules/qlinear/torch.py index d43443158..3be532339 100644 --- a/gptqmodel/nn_modules/qlinear/torch.py +++ b/gptqmodel/nn_modules/qlinear/torch.py @@ -18,13 +18,11 @@ import torch import torch.nn as nn import torch.nn.functional as F - from gptqmodel.nn_modules.qlinear import BaseQuantLinear, PackableQuantLinear from gptqmodel.utils.logger import setup_logger from ...models._const import DEVICE, PLATFORM - logger = setup_logger() class TorchQuantLinear(PackableQuantLinear): @@ -224,7 +222,7 @@ def dequantize_model(model: nn.Module): module_name = name setattr(parent, module_name, new_module) - + del model.config.quantization_config return model diff --git a/gptqmodel/nn_modules/qlinear/utils.py b/gptqmodel/nn_modules/qlinear/utils.py index 2c52aac67..2d1b01309 100644 --- a/gptqmodel/nn_modules/qlinear/utils.py +++ b/gptqmodel/nn_modules/qlinear/utils.py @@ -34,4 +34,4 @@ def dequantize_4bits_weight(layer): scales = scales.repeat_interleave(group_size, dim=0) unpacked_qzeros = unpacked_qzeros.repeat_interleave(group_size, dim=0) unpacked_qweight = (unpacked_qweight - unpacked_qzeros) * scales - return unpacked_qweight.T, unpacked_qzeros \ No newline at end of file + return unpacked_qweight.T, unpacked_qzeros diff --git a/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/gptqmodel/nn_modules/triton_utils/custom_autotune.py index d16efe460..63d1d06db 100644 --- a/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -20,7 +20,6 @@ import triton - # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/gptqmodel/nn_modules/triton_utils/kernels.py b/gptqmodel/nn_modules/triton_utils/kernels.py index f3616eb23..0873a022d 100644 --- a/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/gptqmodel/nn_modules/triton_utils/kernels.py @@ -21,7 +21,6 @@ from ...utils.logger import setup_logger from . import custom_autotune - logger = setup_logger() diff --git a/gptqmodel/quantization/__init__.py b/gptqmodel/quantization/__init__.py index 4f5a009f8..6a4f212df 100644 --- a/gptqmodel/quantization/__init__.py +++ b/gptqmodel/quantization/__init__.py @@ -13,16 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .config import ( - FORMAT, - FORMAT_FIELD_CODE, - FORMAT_FIELD_COMPAT_MARLIN, - FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, - QUANT_METHOD, - QUANT_METHOD_FIELD, - BaseQuantizeConfig, - QuantizeConfig, -) +from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_COMPAT_MARLIN, FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index 11c4a4120..0245b67de 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -27,7 +27,6 @@ from ..utils.logger import setup_logger - logger = setup_logger() FORMAT_FIELD_CODE = "format" @@ -190,11 +189,11 @@ def __post_init__(self): else: if isinstance(self.pack_dtype, str): self.pack_dtype = self.pack_dtype.lower() - if not self.pack_dtype in ["int64", "int32", "int16", "int8"]: + if self.pack_dtype not in ["int64", "int32", "int16", "int8"]: raise ValueError(f"Unsupported pack_dtype: {self.pack_dtype}") self.pack_dtype = getattr(torch, self.pack_dtype) elif isinstance(self.pack_dtype, torch.dtype): - if not self.pack_dtype in [torch.int64, torch.int32, torch.int16, torch.int8]: + if self.pack_dtype not in [torch.int64, torch.int32, torch.int16, torch.int8]: raise ValueError(f"Unsupported pack_dtype: {self.pack_dtype}") else: raise ValueError(f"Unsupported pack_dtype: {self.pack_dtype}") diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index 1ae102b89..4606f0dba 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -28,7 +28,6 @@ from ..utils.torch import torch_empty_cache, torch_sync from .quantizer import Quantizer - logger = setup_logger() # TODO do we really need max precision? diff --git a/gptqmodel/quantization/quantizer.py b/gptqmodel/quantization/quantizer.py index 96f9e4350..0460f2523 100644 --- a/gptqmodel/quantization/quantizer.py +++ b/gptqmodel/quantization/quantizer.py @@ -20,7 +20,6 @@ from ..utils.logger import setup_logger - logger = setup_logger() diff --git a/gptqmodel/utils/bitblas.py b/gptqmodel/utils/bitblas.py index e257aabce..b80096f61 100644 --- a/gptqmodel/utils/bitblas.py +++ b/gptqmodel/utils/bitblas.py @@ -17,16 +17,14 @@ import threadpoolctl as tctl import torch -from bitblas.quantization import general_compress from ..nn_modules.qlinear.bitblas import BitBLASQuantLinear from ..quantization import FORMAT, QuantizeConfig from ..utils.logger import setup_logger -from .model import load_checkpoint_in_model_then_tie_weights, recurse_getattr, recurse_setattr +from .model import load_checkpoint_in_model_then_tie_weights from .progress import ProgressBar from .torch import torch_empty_cache - logger = setup_logger() def prepare_model_for_bitblas_load( diff --git a/gptqmodel/utils/device.py b/gptqmodel/utils/device.py index f8fa06251..7f8eeaeb0 100644 --- a/gptqmodel/utils/device.py +++ b/gptqmodel/utils/device.py @@ -14,7 +14,6 @@ # limitations under the License. 
from device_smi import Device - from gptqmodel.models._const import CPU, CUDA_0 diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index d96e29bba..2d95c9fa3 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -15,7 +15,7 @@ import os from collections import OrderedDict -from typing import Dict, Optional, Type, Union, List +from typing import Dict, List, Optional, Type, Union import torch @@ -35,7 +35,6 @@ from .rocm import IS_ROCM from .torch import HAS_CUDA, HAS_MPS, HAS_XPU - message_logged = False logger = setup_logger() diff --git a/gptqmodel/utils/logger.py b/gptqmodel/utils/logger.py index 0366ef5bd..6054d3b94 100644 --- a/gptqmodel/utils/logger.py +++ b/gptqmodel/utils/logger.py @@ -15,7 +15,6 @@ import logging - # global static/shared logger instance logger = None diff --git a/gptqmodel/utils/marlin.py b/gptqmodel/utils/marlin.py index 1f773d4f9..3093de8dd 100644 --- a/gptqmodel/utils/marlin.py +++ b/gptqmodel/utils/marlin.py @@ -18,12 +18,11 @@ from ..nn_modules.qlinear.marlin import MarlinQuantLinear, _get_perms from ..quantization import FORMAT, QuantizeConfig from ..utils.logger import setup_logger -from .model import load_checkpoint_in_model_then_tie_weights, recurse_getattr, recurse_setattr +from .model import load_checkpoint_in_model_then_tie_weights from .progress import ProgressBar from .rocm import IS_ROCM from .torch import torch_empty_cache - logger = setup_logger() diff --git a/gptqmodel/utils/mlx.py b/gptqmodel/utils/mlx.py index dadbae4d5..9fa642917 100644 --- a/gptqmodel/utils/mlx.py +++ b/gptqmodel/utils/mlx.py @@ -10,7 +10,6 @@ from .progress import ProgressBar from .torch import torch_empty_cache - try: import mlx.core as mx from mlx_lm import generate diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 52e2ced29..fb5d6e7d1 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -38,19 +38,12 @@ from transformers.pytorch_utils import id_tensor_storage from transformers.utils.hub import cached_file -from ..models._const import ( - CPU, - DEVICE, - EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, - EXPERT_INDEX_PLACEHOLDER, - SUPPORTED_MODELS, - SUPPORTS_MODULE_TYPES, -) +from ..models._const import (CPU, DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, + EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS, SUPPORTS_MODULE_TYPES) from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.exllama import ExllamaQuantLinear from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear from ..nn_modules.qlinear.ipex import IPEXQuantLinear -from ..nn_modules.qlinear.torch import TorchQuantLinear from ..quantization import FORMAT, QuantizeConfig from ..quantization.config import dynamic_get from .backend import BACKEND @@ -59,7 +52,6 @@ from .progress import ProgressBar from .torch import torch_empty_cache - logger = setup_logger() diff --git a/gptqmodel/utils/openai_server.py b/gptqmodel/utils/openai_server.py index 83ecda18b..1a03c195b 100644 --- a/gptqmodel/utils/openai_server.py +++ b/gptqmodel/utils/openai_server.py @@ -19,7 +19,6 @@ import torch - try: import uvicorn from fastapi import FastAPI, HTTPException diff --git a/gptqmodel/utils/perplexity.py b/gptqmodel/utils/perplexity.py index b7e02c90d..057f42ff9 100644 --- a/gptqmodel/utils/perplexity.py +++ b/gptqmodel/utils/perplexity.py @@ -18,7 +18,6 @@ import numpy as np import torch from datasets import load_dataset, load_from_disk - from gptqmodel.utils.progress import ProgressBar diff --git a/gptqmodel/utils/rocm.py b/gptqmodel/utils/rocm.py 
index 1284dd6ef..cdfae3396 100644 --- a/gptqmodel/utils/rocm.py +++ b/gptqmodel/utils/rocm.py @@ -15,5 +15,4 @@ import torch - IS_ROCM = torch.version.hip is not None diff --git a/gptqmodel/utils/safetensor.py b/gptqmodel/utils/safetensor.py index df0caf320..ab906f9cb 100644 --- a/gptqmodel/utils/safetensor.py +++ b/gptqmodel/utils/safetensor.py @@ -1,11 +1,9 @@ import os -import torch +import torch from accelerate.utils import find_tied_parameters -from safetensors import safe_open - - from gptqmodel.utils.model import recurse_getattr, recurse_setattr +from safetensors import safe_open # debug print all safetensor files in a directory and print its properties @@ -41,4 +39,4 @@ def untie_weights(model): model, param_name, recurse_getattr(model, param_name).clone(), - ) \ No newline at end of file + ) diff --git a/gptqmodel/utils/sglang.py b/gptqmodel/utils/sglang.py index fde427114..656281d70 100644 --- a/gptqmodel/utils/sglang.py +++ b/gptqmodel/utils/sglang.py @@ -18,7 +18,6 @@ import torch from transformers import AutoConfig - try: import sglang as sgl SGLANG_AVAILABLE = True diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py index 7fcc7205f..9ab6eb293 100644 --- a/gptqmodel/utils/torch.py +++ b/gptqmodel/utils/torch.py @@ -17,7 +17,6 @@ import torch - HAS_CUDA = False HAS_XPU = False HAS_MPS = False diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index d7d362ece..3c1c0cba2 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -17,7 +17,6 @@ import torch - try: from vllm import LLM, SamplingParams diff --git a/setup.py b/setup.py index bf9310d33..13ffbe3fd 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,6 @@ from setuptools import find_packages, setup - try: from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel except BaseException: @@ -126,7 +125,6 @@ def get_version_tag() -> str: import torch # noqa: E402 - if TORCH_CUDA_ARCH_LIST is None: HAS_CUDA_V8 = any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count())) diff --git a/tests/benchmark/benchmark.py b/tests/benchmark/benchmark.py index 622a35d95..35f1ab919 100644 --- a/tests/benchmark/benchmark.py +++ b/tests/benchmark/benchmark.py @@ -14,9 +14,8 @@ # limitations under the License. 
from benchmark_test import BenchmarkTest -from parameterized import parameterized # noqa: E402 - from gptqmodel import BACKEND +from parameterized import parameterized # noqa: E402 class TestInference(BenchmarkTest): diff --git a/tests/benchmark/benchmark_test.py b/tests/benchmark/benchmark_test.py index 05824af41..5f7baf644 100644 --- a/tests/benchmark/benchmark_test.py +++ b/tests/benchmark/benchmark_test.py @@ -16,15 +16,13 @@ import os import time - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.utils.progress import ProgressBar # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 class BenchmarkTest(unittest.TestCase): diff --git a/tests/inference_speed.py b/tests/inference_speed.py index ecb10e85e..8bb2a122e 100644 --- a/tests/inference_speed.py +++ b/tests/inference_speed.py @@ -16,15 +16,14 @@ import os import time - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest -from transformers import AutoTokenizer from gptqmodel import GPTQModel from gptqmodel.utils.progress import ProgressBar +from transformers import AutoTokenizer class InferenceSpeed(unittest.TestCase): diff --git a/tests/models/model_test.py b/tests/models/model_test.py index 499e36ec2..138d07b2a 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -17,14 +17,12 @@ import os import sys - if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch from pathlib import Path # noqa: E402 - sys.path.insert(0, f"{str(Path(__file__).resolve().parent.parent)}/models") # noqa: E402 import contextlib # noqa: E402 import shutil # noqa: E402 @@ -34,10 +32,6 @@ import torch.cuda # noqa: E402 import transformers # noqa: E402 from datasets import load_dataset # noqa: E402 -from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 -from packaging.version import Version # noqa: E402 -from transformers import AutoProcessor, AutoTokenizer # noqa: E402 - from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 @@ -45,7 +39,9 @@ from gptqmodel.utils.eval import lm_eval # noqa: E402 from gptqmodel.utils.model import MODALITY # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 - +from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 +from packaging.version import Version # noqa: E402 +from transformers import AutoProcessor, AutoTokenizer # noqa: E402 RAND_SEED = 898 diff --git a/tests/models/test_gptbigcode.py b/tests/models/test_gptbigcode.py index d003f7a87..015473d9b 100644 --- a/tests/models/test_gptbigcode.py +++ b/tests/models/test_gptbigcode.py @@ -16,7 +16,6 @@ import importlib.util import os - # TODO: find how ipex registered it jit interpreter # if intel_extension_for_pytorch was installed, @torch.jit.script in transformers/models/gpt_bigcode/modeling_gpt_bigcode.py will try to use ipex as torchScript interpreter. 
# However, in quantization, tensor were on gpu, which will throw RuntimeError: itensor_view_from_dense expects CPU tensor input diff --git a/tests/models/test_opt.py b/tests/models/test_opt.py index 6861f5276..6bb2d4965 100644 --- a/tests/models/test_opt.py +++ b/tests/models/test_opt.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from model_test import ModelTest - from gptqmodel import BACKEND from gptqmodel.utils.importer import backend_dict +from model_test import ModelTest class TestOpt(ModelTest): diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index 41dca1c68..63d2c46c7 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from model_test import ModelTest - from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ +from model_test import ModelTest class TestQwen2_VL(ModelTest): diff --git a/tests/tasks/mmlu/_generate_configs.py b/tests/tasks/mmlu/_generate_configs.py index 28b94616d..f613f7cd4 100644 --- a/tests/tasks/mmlu/_generate_configs.py +++ b/tests/tasks/mmlu/_generate_configs.py @@ -9,7 +9,6 @@ import yaml from tqdm import tqdm - eval_logger = logging.getLogger("lm-eval") diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py index fba17f04e..ddeede3bd 100644 --- a/tests/test_asym_gptq_v1.py +++ b/tests/test_asym_gptq_v1.py @@ -16,13 +16,11 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +from gptqmodel.quantization import FORMAT # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 -from gptqmodel.quantization import FORMAT # noqa: E402 - class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" diff --git a/tests/test_bits.py b/tests/test_bits.py index ce58cca55..b70163907 100644 --- a/tests/test_bits.py +++ b/tests/test_bits.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -24,9 +23,6 @@ import traceback # noqa: E402 import unittest # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -37,7 +33,8 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import lm_eval # noqa: E402 - +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 logger = logging.getLogger(__name__) diff --git a/tests/test_dynamic.py b/tests/test_dynamic.py index dddae591b..fa3827d81 100644 --- a/tests/test_dynamic.py +++ b/tests/test_dynamic.py @@ -17,18 +17,12 @@ import os from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear -from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -import tempfile # noqa: E402 import json - -from datasets import load_dataset 
# noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 +import tempfile # noqa: E402 from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 @@ -36,6 +30,9 @@ from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity, safetensor # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 class TestDynamic(ModelTest): diff --git a/tests/test_estimate_vram.py b/tests/test_estimate_vram.py index b505dc253..70f20c03c 100644 --- a/tests/test_estimate_vram.py +++ b/tests/test_estimate_vram.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 diff --git a/tests/test_eval.py b/tests/test_eval.py index d910bea33..ecdee8c05 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -18,12 +18,10 @@ import unittest from typing import Union -from lm_eval.tasks import TaskManager -from parameterized import parameterized - from gptqmodel import GPTQModel from gptqmodel.utils.eval import EVAL - +from lm_eval.tasks import TaskManager +from parameterized import parameterized os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/tests/test_evalplus.py b/tests/test_evalplus.py index 1e381d68d..40703b0bc 100644 --- a/tests/test_evalplus.py +++ b/tests/test_evalplus.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_flash_attention.py b/tests/test_flash_attention.py index df676d97e..0dc985599 100644 --- a/tests/test_flash_attention.py +++ b/tests/test_flash_attention.py @@ -16,16 +16,14 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch +from gptqmodel import GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 - class Test(ModelTest): diff --git a/tests/test_group_size.py b/tests/test_group_size.py index 2dc4d66bb..b6452952a 100644 --- a/tests/test_group_size.py +++ b/tests/test_group_size.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -24,9 +23,6 @@ import traceback # noqa: E402 import unittest # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -37,7 +33,8 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import lm_eval # noqa: E402 - +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 logger = logging.getLogger(__name__) diff --git a/tests/test_inference_speed.py b/tests/test_inference_speed.py index b34eb094e..c372e9259 100644 --- 
a/tests/test_inference_speed.py +++ b/tests/test_inference_speed.py @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +# -- do not touch import os -from parameterized import parameterized - -from gptqmodel.utils import BACKEND -from inference_speed import InferenceSpeed os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +from gptqmodel.utils import BACKEND # noqa: E402 +# -- end do not touch +from inference_speed import InferenceSpeed # noqa: E402 +from parameterized import parameterized # noqa: E402 ''' NATIVE_MODEL_ID = /monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortext-v1 @@ -54,4 +55,4 @@ def test_inference_speed(self, model_path, backend, tokens_per_second): # (there is a cache when running bitblas for the second time), # so only the results of the second run of bitblas are asserted. # The first run of bitblas only prints relevant information - self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second, compile=True, warmup_runs=1) \ No newline at end of file + self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second, compile=True, warmup_runs=1) diff --git a/tests/test_inference_speed_ipex.py b/tests/test_inference_speed_ipex.py index bcb64e11c..24b132aa8 100644 --- a/tests/test_inference_speed_ipex.py +++ b/tests/test_inference_speed_ipex.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +# -- do not touch import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch from gptqmodel.utils import BACKEND -from parameterized import parameterized from inference_speed import InferenceSpeed +from parameterized import parameterized class TestInferenceSpeedIpex(InferenceSpeed): @@ -29,4 +31,4 @@ class TestInferenceSpeedIpex(InferenceSpeed): ] ) def test_inference_speed_ipex(self, model_path, backend, tokens_per_second): - self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second) \ No newline at end of file + self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second) diff --git a/tests/test_ipex_xpu.py b/tests/test_ipex_xpu.py index 4fe06517b..f0c1bff66 100644 --- a/tests/test_ipex_xpu.py +++ b/tests/test_ipex_xpu.py @@ -16,16 +16,14 @@ # -- do not touch import os - os.environ["CUDA_VISIBLE_DEVICES"] = "" # -- end do not touch import tempfile # noqa: E402 -from models.model_test import ModelTest # noqa: E402 - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.models._const import DEVICE # noqa: E402 +from models.model_test import ModelTest # noqa: E402 class TestsIPEX(ModelTest): diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py index ce18d9c48..e7d352667 100644 --- a/tests/test_lm_eval.py +++ b/tests/test_lm_eval.py @@ -15,15 +15,12 @@ # -- do not touch import os - # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 - from gptqmodel.utils.eval import lm_eval # noqa: E402 - +from lm_eval.utils import make_table # noqa: E402 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/tests/test_lm_head.py b/tests/test_lm_head.py index 3b141e4b6..29b36bcb7 100644 --- a/tests/test_lm_head.py +++ b/tests/test_lm_head.py @@ -19,13 +19,11 @@ from datasets import load_dataset - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -# -- end do not touch -from models.model_test 
import ModelTest # noqa: E402 - from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 +# -- end do not touch +from models.model_test import ModelTest # noqa: E402 class TestLmHeadLoad(ModelTest): diff --git a/tests/test_mlx.py b/tests/test_mlx.py index d3fa1137b..32ca4125f 100644 --- a/tests/test_mlx.py +++ b/tests/test_mlx.py @@ -1,7 +1,6 @@ import os import sys - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if sys.platform == "darwin": @@ -9,12 +8,11 @@ import tempfile # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 from mlx_lm import generate, load # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 - class TestExport(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct/gptq_4bits_01-07_14-18-11_maxlen1024_ns1024_descFalse_damp0.1/" diff --git a/tests/test_mlx_generate.py b/tests/test_mlx_generate.py index f8581101b..f3484bfe1 100644 --- a/tests/test_mlx_generate.py +++ b/tests/test_mlx_generate.py @@ -1,16 +1,13 @@ import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import sys # noqa: E402 - if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" -from models.model_test import ModelTest # noqa: E402 - from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from models.model_test import ModelTest # noqa: E402 class TestMlxGenerate(ModelTest): diff --git a/tests/test_openai_server.py b/tests/test_openai_server.py index c5c4cb0f4..19f3e33aa 100644 --- a/tests/test_openai_server.py +++ b/tests/test_openai_server.py @@ -17,10 +17,8 @@ import unittest import openai - from gptqmodel import GPTQModel - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" class TestOpeniServer(unittest.TestCase): diff --git a/tests/test_packing.py b/tests/test_packing.py index 4abaf0b7a..484bff883 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -19,11 +19,8 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -import time # noqa: E402 import unittest # noqa: E402 -from parameterized import parameterized # noqa: E402 - # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py index a685cbeeb..4b843117c 100644 --- a/tests/test_packing_speed.py +++ b/tests/test_packing_speed.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,7 +24,6 @@ from parameterized import parameterized # noqa: E402 - # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 @@ -33,7 +31,6 @@ from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 -from gptqmodel.nn_modules.qlinear.utils import dequantize_4bits_weight # noqa: E402 def gen_quant4(k, n, groupsize=-1): diff --git a/tests/test_parameter_count.py b/tests/test_parameter_count.py index 260ac2541..599c5823a 100644 --- a/tests/test_parameter_count.py +++ b/tests/test_parameter_count.py @@ -2,11 +2,10 @@ import tempfile import torch.cuda -from models.model_test import ModelTest -from safetensors.torch import load_file - from gptqmodel import GPTQModel, QuantizeConfig from gptqmodel.utils.tensor import tensor_parameters +from models.model_test import 
ModelTest
+from safetensors.torch import load_file


 class TestsParameterCount(ModelTest):
@@ -20,11 +19,10 @@ class TestsParameterCount(ModelTest):
     def test_parameter_count(self):
         import os.path

-        from huggingface_hub import hf_hub_download
-        from safetensors.torch import load_file
-
         from gptqmodel import QuantizeConfig
         from gptqmodel.utils.tensor import tensor_parameters
+        from huggingface_hub import hf_hub_download
+        from safetensors.torch import load_file

         model_id = "/monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1"
         if os.path.isdir(model_id):
diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py
index dbef77856..08e826e6f 100644
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -24,13 +23,12 @@
 import unittest # noqa: E402

 from datasets import load_dataset # noqa: E402
-from parameterized import parameterized # noqa: E402
-from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402
-
 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization.config import FORMAT, QUANT_METHOD, AutoRoundQuantizeConfig, QuantizeConfig # noqa: E402
 from gptqmodel.utils import Perplexity # noqa: E402
 from gptqmodel.utils.rocm import IS_ROCM # noqa: E402
+from parameterized import parameterized # noqa: E402
+from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402


 class TestPerplexity(unittest.TestCase):
diff --git a/tests/test_q4_bitblas.py b/tests/test_q4_bitblas.py
index 9911f31b5..0f89eb3ca 100644
--- a/tests/test_q4_bitblas.py
+++ b/tests/test_q4_bitblas.py
@@ -16,17 +16,15 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import unittest # noqa: E402

 import torch # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQ4BitBLAS(unittest.TestCase):
diff --git a/tests/test_q4_cuda.py b/tests/test_q4_cuda.py
index f8ada0d99..c9ee5ad15 100644
--- a/tests/test_q4_cuda.py
+++ b/tests/test_q4_cuda.py
@@ -16,18 +16,16 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-

 class TestsQ4CUDA(ModelTest):
diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py
index 913f57249..14fbd4b47 100644
--- a/tests/test_q4_exllama_v1.py
+++ b/tests/test_q4_exllama_v1.py
@@ -16,23 +16,20 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
-from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length # noqa: E402
 from gptqmodel.models._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402
 from gptqmodel.quantization import FORMAT # noqa: E402
 from gptqmodel.utils.importer import select_quant_linear # noqa: E402
 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402
-
+from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402
+from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 REFERENCE = torch.Tensor(
     [
diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py
index 3e650930e..e1a239cf5 100644
--- a/tests/test_q4_exllama_v2.py
+++ b/tests/test_q4_exllama_v2.py
@@ -16,22 +16,19 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import unittest # noqa: E402

 import torch # noqa: E402
-from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402
 from gptqmodel.quantization import FORMAT # noqa: E402
 from gptqmodel.utils.importer import select_quant_linear # noqa: E402
 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402
-
+from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 GENERATE_EVAL_SIZE = 100
diff --git a/tests/test_q4_ipex.py b/tests/test_q4_ipex.py
index 9693a1ace..654c3eacf 100644
--- a/tests/test_q4_ipex.py
+++ b/tests/test_q4_ipex.py
@@ -17,14 +17,12 @@
 import os
 import sys
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-
 from gptqmodel import BACKEND # noqa: E402
+from models.model_test import ModelTest # noqa: E402


 class TestsIPEX(ModelTest):
diff --git a/tests/test_q4_marlin.py b/tests/test_q4_marlin.py
index 51f8e95cb..478598923 100644
--- a/tests/test_q4_marlin.py
+++ b/tests/test_q4_marlin.py
@@ -16,18 +16,16 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
+from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402
-

 class TestQ4Marlin(ModelTest):
diff --git a/tests/test_q4_torch.py b/tests/test_q4_torch.py
index f71516dec..e9547da3b 100644
--- a/tests/test_q4_torch.py
+++ b/tests/test_q4_torch.py
@@ -16,16 +16,14 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-

 class TestsQ4Torch(ModelTest):
     GENERATE_EVAL_SIZE_MIN = 5
diff --git a/tests/test_q4_torch_apple.py b/tests/test_q4_torch_apple.py
index 65faeee43..00d85c08d 100644
--- a/tests/test_q4_torch_apple.py
+++ b/tests/test_q4_torch_apple.py
@@ -16,12 +16,11 @@
 import sys # noqa: E402

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-

 class TestsQ4Torch(ModelTest):
     GENERATE_EVAL_SIZE_MIN = 5
diff --git a/tests/test_q4_triton.py b/tests/test_q4_triton.py
index 1b3c53e6f..4084903f2 100644
--- a/tests/test_q4_triton.py
+++ b/tests/test_q4_triton.py
@@ -16,16 +16,14 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402
+from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestsQ4Triton(ModelTest):
diff --git a/tests/test_quant_batch.py b/tests/test_quant_batch.py
index 6c672961b..a80bb257e 100644
--- a/tests/test_quant_batch.py
+++ b/tests/test_quant_batch.py
@@ -16,19 +16,16 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import tempfile # noqa: E402
-import unittest # noqa: E402
-
-from transformers import AutoTokenizer # noqa: E402

 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization import QuantizeConfig # noqa: E402
 from gptqmodel.utils import Perplexity # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQuantBatch(ModelTest):
diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py
index 18fb037d5..c02dd3078 100644
--- a/tests/test_quant_formats.py
+++ b/tests/test_quant_formats.py
@@ -16,29 +16,22 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import json # noqa: E402
 import logging # noqa: E402
 import tempfile # noqa: E402
-import unittest # noqa: E402

 from datasets import load_dataset # noqa: E402
-from parameterized import parameterized # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402
 from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402
-from gptqmodel.quantization.config import ( # noqa: E402
-    META_FIELD_QUANTIZER,
-    META_QUANTIZER_GPTQMODEL,
-    AutoRoundQuantizeConfig,
-    QuantizeConfig,
-)
+from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402
+                                           AutoRoundQuantizeConfig, QuantizeConfig)
 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from parameterized import parameterized # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQuantization(ModelTest):
diff --git a/tests/test_quant_formats_auto_round.py b/tests/test_quant_formats_auto_round.py
index 9906ef39a..55607644e 100644
--- a/tests/test_quant_formats_auto_round.py
+++ b/tests/test_quant_formats_auto_round.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -25,19 +24,14 @@
 import tempfile # noqa: E402

 from datasets import load_dataset # noqa: E402
-from parameterized import parameterized # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402
 from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402
-from gptqmodel.quantization.config import ( # noqa: E402
-    META_FIELD_QUANTIZER,
-    META_QUANTIZER_GPTQMODEL,
-    AutoRoundQuantizeConfig,
-    QuantizeConfig,
-)
+from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402
+                                           AutoRoundQuantizeConfig, QuantizeConfig)
 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from parameterized import parameterized # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQuantization(ModelTest):
diff --git a/tests/test_quant_time.py b/tests/test_quant_time.py
index ae9688d4b..b65559699 100644
--- a/tests/test_quant_time.py
+++ b/tests/test_quant_time.py
@@ -15,13 +15,10 @@
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

-import unittest # noqa: E402
 import time # noqa: E402

-from datasets import load_dataset # noqa: E402
 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization.config import QuantizeConfig # noqa: E402
 from models.model_test import ModelTest # noqa: E402
diff --git a/tests/test_quant_trust_remote.py b/tests/test_quant_trust_remote.py
index 24232223a..4a45500d6 100644
--- a/tests/test_quant_trust_remote.py
+++ b/tests/test_quant_trust_remote.py
@@ -16,21 +16,18 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import tempfile # noqa: E402
-import unittest # noqa: E402

 import transformers # noqa: E402
-from datasets import load_dataset # noqa: E402
-from packaging.version import Version # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from packaging.version import Version # noqa: E402
+from transformers import AutoTokenizer # noqa: E402
+

 class TestQuantWithTrustRemoteTrue(ModelTest):
     @classmethod
diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py
index e8a313392..f3a54f911 100644
--- a/tests/test_save_loaded_quantized_model.py
+++ b/tests/test_save_loaded_quantized_model.py
@@ -16,18 +16,15 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import tempfile # noqa: E402
 import unittest # noqa: E402

+from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402
-
-
 MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

 class TestSave(unittest.TestCase):
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index c1ee6a2eb..ca1213303 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
diff --git a/tests/test_sglang.py b/tests/test_sglang.py
index dd34a5d29..a47a0b65e 100644
--- a/tests/test_sglang.py
+++ b/tests/test_sglang.py
@@ -15,7 +15,6 @@
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -24,9 +23,8 @@
 import sys # noqa: E402

 import torch # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
+from models.model_test import ModelTest # noqa: E402


 class TestLoadSglang(ModelTest):
diff --git a/tests/test_sharded.py b/tests/test_sharded.py
index f21fb128f..02b013ead 100644
--- a/tests/test_sharded.py
+++ b/tests/test_sharded.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -25,9 +24,8 @@
 import unittest # noqa: E402

 import torch # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import GPTQModel # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestSharded(unittest.TestCase):
diff --git a/tests/test_tgi.py b/tests/test_tgi.py
index e0c9178c0..28d9a6135 100644
--- a/tests/test_tgi.py
+++ b/tests/test_tgi.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import json # noqa: E402
diff --git a/tests/test_transformers_integration.py b/tests/test_transformers_integration.py
index 8a86bfd57..de2ce3585 100644
--- a/tests/test_transformers_integration.py
+++ b/tests/test_transformers_integration.py
@@ -14,15 +14,13 @@
 # limitations under the License.

 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

 import tempfile # noqa: E402

+from gptqmodel.integration import integration # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402

-from gptqmodel.integration import integration # noqa: E402
-

 class TestTransformersIntegration(ModelTest):
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 216bd41ef..880cc632b 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -25,10 +24,8 @@
 import torch # noqa: E402
 import torch.utils.benchmark as benchmark # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
-
+from transformers import AutoTokenizer # noqa: E402

 MODEL_ID = "/monster/data/model/Llama-7B-GPTQ"
 DATASET_ID = "timdettmers/openassistant-guanaco"
diff --git a/tests/test_triton_xpu.py b/tests/test_triton_xpu.py
index cd2afa2d5..3f971a2e4 100644
--- a/tests/test_triton_xpu.py
+++ b/tests/test_triton_xpu.py
@@ -16,16 +16,14 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 # -- end do not touch

 import tempfile # noqa: E402

-from models.model_test import ModelTest # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402
 from gptqmodel.models._const import DEVICE # noqa: E402
+from models.model_test import ModelTest # noqa: E402


 class TestTritonXPU(ModelTest):
diff --git a/tests/test_verify_hash.py b/tests/test_verify_hash.py
index ae60766b8..ff2d444ff 100644
--- a/tests/test_verify_hash.py
+++ b/tests/test_verify_hash.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
diff --git a/tests/test_vllm.py b/tests/test_vllm.py
index 2e74e428b..671b3ccf3 100644
--- a/tests/test_vllm.py
+++ b/tests/test_vllm.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -26,13 +25,11 @@
 import tempfile # noqa: E402

 import torch # noqa: E402
-from datasets import load_dataset # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402
 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402
 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
+from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestLoadVLLM(ModelTest):