From fb60ef34d57bb6dc2a7e8296c1bdb7fc9b7adbab Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 31 Jan 2025 20:35:17 +0800 Subject: [PATCH] format (#1189) * format * ignore format * format ignore E402 * install uv * remove uv --- examples/benchmark/generation_speed.py | 6 ++-- examples/benchmark/ipex.py | 2 -- examples/benchmark/perplexity.py | 4 +-- .../evaluation/run_language_modeling_task.py | 4 +-- .../run_sequence_classification_task.py | 4 +-- .../evaluation/run_text_summarization_task.py | 4 +-- examples/inference/run_transformers.py | 1 - .../inference/run_with_different_backends.py | 4 +-- examples/quantization/basic_usage.py | 4 +-- .../quantization/basic_usage_autoround.py | 4 +-- .../quantization/basic_usage_wikitext2.py | 4 +-- examples/quantization/transformers_usage.py | 1 - format/ruff.toml | 2 +- gptqmodel/models/_const.py | 1 - gptqmodel/models/auto.py | 7 ++--- gptqmodel/models/base.py | 30 ++++--------------- gptqmodel/models/definitions/gemma2.py | 1 - gptqmodel/models/loader.py | 26 ++++------------ gptqmodel/models/writer.py | 30 ++++--------------- gptqmodel/nn_modules/qlinear/__init__.py | 2 +- gptqmodel/nn_modules/qlinear/bitblas.py | 4 +-- .../qlinear/bitblas_target_detector.py | 1 - gptqmodel/nn_modules/qlinear/dynamic_cuda.py | 1 - gptqmodel/nn_modules/qlinear/exllama.py | 4 +-- gptqmodel/nn_modules/qlinear/exllamav2.py | 2 -- gptqmodel/nn_modules/qlinear/ipex.py | 2 -- gptqmodel/nn_modules/qlinear/marlin.py | 4 +-- gptqmodel/nn_modules/qlinear/torch.py | 4 +-- gptqmodel/nn_modules/qlinear/utils.py | 2 +- .../triton_utils/custom_autotune.py | 1 - gptqmodel/nn_modules/triton_utils/kernels.py | 1 - gptqmodel/quantization/__init__.py | 13 ++------ gptqmodel/quantization/config.py | 5 ++-- gptqmodel/quantization/gptq.py | 1 - gptqmodel/quantization/quantizer.py | 1 - gptqmodel/utils/bitblas.py | 4 +-- gptqmodel/utils/device.py | 1 - gptqmodel/utils/importer.py | 3 +- gptqmodel/utils/logger.py | 1 - gptqmodel/utils/marlin.py | 3 +- gptqmodel/utils/mlx.py | 1 - gptqmodel/utils/model.py | 12 ++------ gptqmodel/utils/openai_server.py | 1 - gptqmodel/utils/perplexity.py | 1 - gptqmodel/utils/rocm.py | 1 - gptqmodel/utils/safetensor.py | 8 ++--- gptqmodel/utils/sglang.py | 1 - gptqmodel/utils/torch.py | 1 - gptqmodel/utils/vllm.py | 1 - setup.py | 2 -- tests/benchmark/benchmark.py | 3 +- tests/benchmark/benchmark_test.py | 4 +-- tests/inference_speed.py | 3 +- tests/models/model_test.py | 10 ++----- tests/models/test_gptbigcode.py | 1 - tests/models/test_opt.py | 3 +- tests/models/test_qwen2_vl.py | 3 +- tests/tasks/mmlu/_generate_configs.py | 1 - tests/test_asym_gptq_v1.py | 4 +-- tests/test_bits.py | 7 ++--- tests/test_dynamic.py | 11 +++---- tests/test_estimate_vram.py | 1 - tests/test_eval.py | 6 ++-- tests/test_evalplus.py | 1 - tests/test_flash_attention.py | 4 +-- tests/test_group_size.py | 7 ++--- tests/test_inference_speed.py | 11 +++---- tests/test_inference_speed_ipex.py | 6 ++-- tests/test_ipex_xpu.py | 4 +-- tests/test_lm_eval.py | 5 +--- tests/test_lm_head.py | 6 ++-- tests/test_mlx.py | 4 +-- tests/test_mlx_generate.py | 5 +--- tests/test_openai_server.py | 2 -- tests/test_packing.py | 3 -- tests/test_packing_speed.py | 3 -- tests/test_parameter_count.py | 10 +++---- tests/test_perplexity.py | 6 ++-- tests/test_q4_bitblas.py | 4 +-- tests/test_q4_cuda.py | 4 +-- tests/test_q4_exllama_v1.py | 9 ++---- tests/test_q4_exllama_v2.py | 7 ++--- tests/test_q4_ipex.py | 4 +-- tests/test_q4_marlin.py | 6 ++-- tests/test_q4_torch.py | 4 +-- 
tests/test_q4_torch_apple.py | 3 +- tests/test_q4_triton.py | 6 ++-- tests/test_quant_batch.py | 5 +--- tests/test_quant_formats.py | 15 +++------- tests/test_quant_formats_auto_round.py | 14 +++------ tests/test_quant_time.py | 3 -- tests/test_quant_trust_remote.py | 9 ++---- tests/test_save_loaded_quantized_model.py | 5 +--- tests/test_serialization.py | 1 - tests/test_sglang.py | 4 +-- tests/test_sharded.py | 4 +-- tests/test_tgi.py | 1 - tests/test_transformers_integration.py | 4 +-- tests/test_triton.py | 5 +--- tests/test_triton_xpu.py | 4 +-- tests/test_verify_hash.py | 1 - tests/test_vllm.py | 7 ++--- 102 files changed, 125 insertions(+), 366 deletions(-) diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py index 1018d641c..643acd861 100644 --- a/examples/benchmark/generation_speed.py +++ b/examples/benchmark/generation_speed.py @@ -22,12 +22,10 @@ import torch from datasets import Dataset, load_dataset -from transformers import AutoTokenizer, GenerationConfig -from transformers.generation.logits_process import LogitsProcessor - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.utils.progress import ProgressBar - +from transformers import AutoTokenizer, GenerationConfig +from transformers.generation.logits_process import LogitsProcessor logger = logging.getLogger(__name__) diff --git a/examples/benchmark/ipex.py b/examples/benchmark/ipex.py index b221d9bcf..50541ee55 100644 --- a/examples/benchmark/ipex.py +++ b/examples/benchmark/ipex.py @@ -19,7 +19,6 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - try: from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf bind_cores_for_best_perf() @@ -29,7 +28,6 @@ import argparse - parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.") parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.") parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.") diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py index cf3e4e3bb..6326d9b35 100644 --- a/examples/benchmark/perplexity.py +++ b/examples/benchmark/perplexity.py @@ -17,10 +17,8 @@ import os import torch -from transformers import AutoTokenizer - from gptqmodel.utils import Perplexity - +from transformers import AutoTokenizer os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/examples/evaluation/run_language_modeling_task.py b/examples/evaluation/run_language_modeling_task.py index 59b51405b..5e9e98df4 100644 --- a/examples/evaluation/run_language_modeling_task.py +++ b/examples/evaluation/run_language_modeling_task.py @@ -17,12 +17,10 @@ import datasets import torch -from transformers import AutoTokenizer - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import LanguageModelingTask from gptqmodel.utils.torch import torch_empty_cache - +from transformers import AutoTokenizer DATASET = "tatsu-lab/alpaca" WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n" diff --git a/examples/evaluation/run_sequence_classification_task.py b/examples/evaluation/run_sequence_classification_task.py index 94e9a43b8..7a7658048 100644 --- a/examples/evaluation/run_sequence_classification_task.py +++ b/examples/evaluation/run_sequence_classification_task.py @@ -18,12 +18,10 @@ import datasets import torch -from transformers import AutoTokenizer - from gptqmodel import BACKEND, 
GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import SequenceClassificationTask from gptqmodel.utils.torch import torch_empty_cache - +from transformers import AutoTokenizer DATASET = "cardiffnlp/tweet_sentiment_multilingual" TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:" diff --git a/examples/evaluation/run_text_summarization_task.py b/examples/evaluation/run_text_summarization_task.py index 2c3b16808..31c745acb 100644 --- a/examples/evaluation/run_text_summarization_task.py +++ b/examples/evaluation/run_text_summarization_task.py @@ -18,12 +18,10 @@ import datasets import torch -from transformers import AutoTokenizer, GenerationConfig - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import TextSummarizationTask from gptqmodel.utils.torch import torch_empty_cache - +from transformers import AutoTokenizer, GenerationConfig os.system("pip install py7zr") diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py index b53e8487c..23a511ba3 100644 --- a/examples/inference/run_transformers.py +++ b/examples/inference/run_transformers.py @@ -15,7 +15,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0])) diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py index e0837b2ee..ef993f949 100644 --- a/examples/inference/run_with_different_backends.py +++ b/examples/inference/run_with_different_backends.py @@ -18,10 +18,8 @@ import sys from argparse import ArgumentParser -from transformers import AutoTokenizer - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device - +from transformers import AutoTokenizer os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py index ab1dd2f45..129f33a83 100644 --- a/examples/quantization/basic_usage.py +++ b/examples/quantization/basic_usage.py @@ -15,10 +15,8 @@ import os -from transformers import AutoTokenizer - from gptqmodel import GPTQModel, QuantizeConfig, get_best_device - +from transformers import AutoTokenizer os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/examples/quantization/basic_usage_autoround.py b/examples/quantization/basic_usage_autoround.py index c3420fae6..1b071641f 100644 --- a/examples/quantization/basic_usage_autoround.py +++ b/examples/quantization/basic_usage_autoround.py @@ -14,11 +14,9 @@ # limitations under the License. 
import torch -from transformers import AutoTokenizer - from gptqmodel import GPTQModel from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402 - +from transformers import AutoTokenizer pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage_wikitext2.py b/examples/quantization/basic_usage_wikitext2.py index ce8e88d7d..43237e145 100644 --- a/examples/quantization/basic_usage_wikitext2.py +++ b/examples/quantization/basic_usage_wikitext2.py @@ -15,10 +15,8 @@ import torch from datasets import load_dataset -from transformers import AutoTokenizer - from gptqmodel import GPTQModel, QuantizeConfig - +from transformers import AutoTokenizer pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py index a984a9a8d..98ad1855f 100755 --- a/examples/quantization/transformers_usage.py +++ b/examples/quantization/transformers_usage.py @@ -15,7 +15,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig - model_id = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_id) dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] diff --git a/format/ruff.toml b/format/ruff.toml index 4f769ee6b..e2cb2afb9 100644 --- a/format/ruff.toml +++ b/format/ruff.toml @@ -1,5 +1,5 @@ # Never enforce `E501` (line length violations). -lint.ignore = ["C901", "E501", "E741", "W605"] +lint.ignore = ["C901", "E501", "E741", "W605", "E402"] lint.select = ["C", "E", "F", "I", "W"] line-length = 119 diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py index 17448c192..67874d040 100644 --- a/gptqmodel/models/_const.py +++ b/gptqmodel/models/_const.py @@ -24,7 +24,6 @@ from ..utils.rocm import IS_ROCM from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU - CPU = device("cpu") CUDA = device("cuda") CUDA_0 = device("cuda:0") diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 3fc68e32c..882a0a66f 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -17,7 +17,6 @@ import os - if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True' print("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.") @@ -28,7 +27,6 @@ import sys # noqa: E402 - # TODO: waiting for pytorch implementgation of aten ops for MPS if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" @@ -96,7 +94,6 @@ from .definitions.xverse import XverseGPTQ # noqa: E402 from .definitions.yi import YiGPTQ # noqa: E402 - logger = setup_logger() MODEL_MAP = { @@ -300,11 +297,10 @@ def eval( if task not in EVAL.get_task_enums(): raise ValueError(f"lm_eval support tasks: {EVAL.get_all_tasks_string()}") + from gptqmodel.utils.eval import lm_eval from lm_eval.utils import make_table from transformers import AutoTokenizer - from gptqmodel.utils.eval import lm_eval - tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code) model_name = 'hf' if backend == 'gptqmodel' else backend @@ -372,6 +368,7 @@ def export(model_id_or_path: str, target_path: str, format: str, trust_remote_co if 
format == "mlx": try: from mlx_lm.utils import save_config, save_weights + from ..utils.mlx import convert_gptq_to_mlx_weights except ImportError: raise ValueError("MLX not installed. Please install via `pip install gptqmodel[mlx] --no-build-isolation`.") diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 7c9fe9d73..9f1fe1406 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -36,33 +36,15 @@ from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory from ..utils.importer import select_quant_linear from ..utils.logger import setup_logger -from ..utils.model import ( - MODALITY, - check_to_quantized, - find_layers, - get_device, - get_module, - get_module_by_name_prefix, - get_moe_layer_modules, - move_to, - nested_move_to, - normalize_tokenizer, - pack_model, -) +from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device, + get_module, get_module_by_name_prefix, get_moe_layer_modules, + move_to, nested_move_to, normalize_tokenizer, pack_model) from ..utils.progress import ProgressBar -from ..utils.safetensor import untie_weights from ..utils.torch import torch_empty_cache from ._const import CPU, DEVICE, SUPPORTS_MODULE_TYPES from .loader import ModelLoader -from .writer import ( - QUANT_LOG_DAMP, - QUANT_LOG_FWD_TIME, - QUANT_LOG_LAYER, - QUANT_LOG_LOSS, - QUANT_LOG_MODULE, - QUANT_LOG_TIME, - ModelWriter, -) +from .writer import (QUANT_LOG_DAMP, QUANT_LOG_FWD_TIME, QUANT_LOG_LAYER, + QUANT_LOG_LOSS, QUANT_LOG_MODULE, QUANT_LOG_TIME, ModelWriter) # pytorch 2.6.0 fixes many compilation errors PYTORCH_MIN_VERFSION_WITH_COMPILE = Version("2.6.0") @@ -931,7 +913,7 @@ def compile(self, backend="inductor", mode="reduce-overhead"): try: self.model = torch.compile(self.model, fullgraph=True, backend=backend, mode=mode) - except Exception as e: + except Exception: logger.info("Compiling model again with `fullgraph=False`") self.model = torch.compile(self.model, fullgraph=False, backend=backend, mode=mode) return self diff --git a/gptqmodel/models/definitions/gemma2.py b/gptqmodel/models/definitions/gemma2.py index d7be41c58..08ec1aadc 100644 --- a/gptqmodel/models/definitions/gemma2.py +++ b/gptqmodel/models/definitions/gemma2.py @@ -17,7 +17,6 @@ from ...utils.logger import setup_logger from ..base import BaseGPTQModel - logger = setup_logger() SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ." 
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 68581778e..ec72cb62b 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -36,28 +36,14 @@ from ..utils.backend import BACKEND from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear from ..utils.logger import setup_logger -from ..utils.marlin import ( - _validate_marlin_compatibility, - _validate_marlin_device_support, - prepare_model_for_marlin_load, -) -from ..utils.model import ( - auto_dtype, - convert_gptq_v1_to_v2_format, - find_layers, - get_checkpoints, - get_moe_layer_modules, - gptqmodel_post_init, - load_checkpoint_in_model_then_tie_weights, - make_quant, - normalize_tokenizer, - simple_dispatch_model, - verify_model_hash, - verify_sharded_model_hashes, -) +from ..utils.marlin import (_validate_marlin_compatibility, + _validate_marlin_device_support, prepare_model_for_marlin_load) +from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_layers, get_checkpoints, + get_moe_layer_modules, gptqmodel_post_init, + load_checkpoint_in_model_then_tie_weights, make_quant, normalize_tokenizer, + simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes) from ._const import DEVICE, SUPPORTED_MODELS, normalize_device - logger = setup_logger() ATTN_IMPLEMENTATION = "attn_implementation" diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index a56e51fdf..385709509 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -33,36 +33,18 @@ from transformers.models.auto.tokenization_auto import get_tokenizer_config from transformers.utils.generic import ContextManagers -from ..quantization.config import ( - FORMAT, - META_FIELD_DAMP_AUTO_INCREMENT, - META_FIELD_DAMP_PERCENT, - META_FIELD_MSE, - META_FIELD_QUANTIZER, - META_FIELD_STATIC_GROUPS, - META_FIELD_TRUE_SEQUENTIAL, - META_FIELD_URI, - META_QUANTIZER_GPTQMODEL, - META_VALUE_URI, - MIN_VERSION_WITH_V2, -) +from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE, + META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL, + META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2) from ..utils.backend import BACKEND from ..utils.logger import setup_logger -from ..utils.model import ( - convert_gptq_v2_to_v1_format, - copy_py_files, - find_layers, - get_model_files_size, - get_moe_layer_modules, - get_state_dict_for_save, - load_checkpoint_in_model_then_tie_weights, - make_quant, -) +from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_layers, + get_model_files_size, get_moe_layer_modules, get_state_dict_for_save, + load_checkpoint_in_model_then_tie_weights, make_quant) from ..utils.torch import torch_empty_cache from ..version import __version__ from ._const import CPU - logger = setup_logger() QUANT_LOG_LAYER = "layer" diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 248fd5a9d..5b072f179 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -17,7 +17,7 @@ from typing import List, Optional, Tuple import numpy as np -import torch as t # conflict with torch.py +import torch as t # conflict with torch.py import torch.nn as nn import transformers diff --git a/gptqmodel/nn_modules/qlinear/bitblas.py b/gptqmodel/nn_modules/qlinear/bitblas.py index 024709aca..02fca774f 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas.py 
+++ b/gptqmodel/nn_modules/qlinear/bitblas.py @@ -22,13 +22,11 @@ import numpy as np import torch import torch.nn as nn - from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger - logger = setup_logger() BITBLAS_TARGET = None @@ -389,4 +387,4 @@ def forward(self, A): return C -__all__ = ["BitBLASQuantLinear"] \ No newline at end of file +__all__ = ["BitBLASQuantLinear"] diff --git a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py index ecd65915f..89c2b3d94 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py +++ b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py @@ -22,7 +22,6 @@ from ...utils.logger import setup_logger - logger = setup_logger() TARGET_MISSING_ERROR = ( diff --git a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py index 8a323f1ef..04950deb5 100644 --- a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py +++ b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py @@ -16,7 +16,6 @@ from typing import Optional, Tuple import torch - from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear from gptqmodel.utils.logger import setup_logger diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py index f17beef9d..de171b44a 100644 --- a/gptqmodel/nn_modules/qlinear/exllama.py +++ b/gptqmodel/nn_modules/qlinear/exllama.py @@ -21,12 +21,10 @@ import torch import torch.nn.functional as F - -from gptqmodel.nn_modules.qlinear import BaseQuantLinear, PackableQuantLinear +from gptqmodel.nn_modules.qlinear import PackableQuantLinear from ...models._const import DEVICE, PLATFORM - exllama_import_exception = None try: from gptqmodel_exllama_kernels import make_q4, q4_matmul diff --git a/gptqmodel/nn_modules/qlinear/exllamav2.py b/gptqmodel/nn_modules/qlinear/exllamav2.py index 18e42fd5e..e73b1ec30 100644 --- a/gptqmodel/nn_modules/qlinear/exllamav2.py +++ b/gptqmodel/nn_modules/qlinear/exllamav2.py @@ -20,13 +20,11 @@ import torch import torch.nn.functional as F - from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger - exllama_v2_import_exception = None try: from gptqmodel_exllamav2_kernels import gemm_half_q_half, make_q_matrix diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 206f908fc..a6aacd306 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -20,14 +20,12 @@ import torch import torch.nn as nn import transformers - from gptqmodel.models._const import DEVICE, PLATFORM from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...utils.logger import setup_logger from ...utils.torch import HAS_XPU - logger = setup_logger() BITS_DTYPE_MAPPING = { diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py index bd08569a9..61f617d27 100644 --- a/gptqmodel/nn_modules/qlinear/marlin.py +++ b/gptqmodel/nn_modules/qlinear/marlin.py @@ -20,14 +20,12 @@ import numpy as np import torch -from torch.nn.parameter import Parameter - from gptqmodel.nn_modules.qlinear import BaseQuantLinear +from torch.nn.parameter import Parameter from ...models._const import DEVICE, PLATFORM from ...utils.rocm import IS_ROCM - marlin_import_exception = None try: import gptqmodel_marlin_kernels diff --git a/gptqmodel/nn_modules/qlinear/torch.py 
b/gptqmodel/nn_modules/qlinear/torch.py index d43443158..3be532339 100644 --- a/gptqmodel/nn_modules/qlinear/torch.py +++ b/gptqmodel/nn_modules/qlinear/torch.py @@ -18,13 +18,11 @@ import torch import torch.nn as nn import torch.nn.functional as F - from gptqmodel.nn_modules.qlinear import BaseQuantLinear, PackableQuantLinear from gptqmodel.utils.logger import setup_logger from ...models._const import DEVICE, PLATFORM - logger = setup_logger() class TorchQuantLinear(PackableQuantLinear): @@ -224,7 +222,7 @@ def dequantize_model(model: nn.Module): module_name = name setattr(parent, module_name, new_module) - + del model.config.quantization_config return model diff --git a/gptqmodel/nn_modules/qlinear/utils.py b/gptqmodel/nn_modules/qlinear/utils.py index 2c52aac67..2d1b01309 100644 --- a/gptqmodel/nn_modules/qlinear/utils.py +++ b/gptqmodel/nn_modules/qlinear/utils.py @@ -34,4 +34,4 @@ def dequantize_4bits_weight(layer): scales = scales.repeat_interleave(group_size, dim=0) unpacked_qzeros = unpacked_qzeros.repeat_interleave(group_size, dim=0) unpacked_qweight = (unpacked_qweight - unpacked_qzeros) * scales - return unpacked_qweight.T, unpacked_qzeros \ No newline at end of file + return unpacked_qweight.T, unpacked_qzeros diff --git a/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/gptqmodel/nn_modules/triton_utils/custom_autotune.py index d16efe460..63d1d06db 100644 --- a/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -20,7 +20,6 @@ import triton - # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/gptqmodel/nn_modules/triton_utils/kernels.py b/gptqmodel/nn_modules/triton_utils/kernels.py index f3616eb23..0873a022d 100644 --- a/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/gptqmodel/nn_modules/triton_utils/kernels.py @@ -21,7 +21,6 @@ from ...utils.logger import setup_logger from . import custom_autotune - logger = setup_logger() diff --git a/gptqmodel/quantization/__init__.py b/gptqmodel/quantization/__init__.py index 4f5a009f8..6a4f212df 100644 --- a/gptqmodel/quantization/__init__.py +++ b/gptqmodel/quantization/__init__.py @@ -13,16 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .config import ( - FORMAT, - FORMAT_FIELD_CODE, - FORMAT_FIELD_COMPAT_MARLIN, - FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, - QUANT_METHOD, - QUANT_METHOD_FIELD, - BaseQuantizeConfig, - QuantizeConfig, -) +from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_COMPAT_MARLIN, FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index 11c4a4120..0245b67de 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -27,7 +27,6 @@ from ..utils.logger import setup_logger - logger = setup_logger() FORMAT_FIELD_CODE = "format" @@ -190,11 +189,11 @@ def __post_init__(self): else: if isinstance(self.pack_dtype, str): self.pack_dtype = self.pack_dtype.lower() - if not self.pack_dtype in ["int64", "int32", "int16", "int8"]: + if self.pack_dtype not in ["int64", "int32", "int16", "int8"]: raise ValueError(f"Unsupported pack_dtype: {self.pack_dtype}") self.pack_dtype = getattr(torch, self.pack_dtype) elif isinstance(self.pack_dtype, torch.dtype): - if not self.pack_dtype in [torch.int64, torch.int32, torch.int16, torch.int8]: + if self.pack_dtype not in [torch.int64, torch.int32, torch.int16, torch.int8]: raise ValueError(f"Unsupported pack_dtype: {self.pack_dtype}") else: raise ValueError(f"Unsupported pack_dtype: {self.pack_dtype}") diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index 1ae102b89..4606f0dba 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -28,7 +28,6 @@ from ..utils.torch import torch_empty_cache, torch_sync from .quantizer import Quantizer - logger = setup_logger() # TODO do we really need max precision? diff --git a/gptqmodel/quantization/quantizer.py b/gptqmodel/quantization/quantizer.py index 96f9e4350..0460f2523 100644 --- a/gptqmodel/quantization/quantizer.py +++ b/gptqmodel/quantization/quantizer.py @@ -20,7 +20,6 @@ from ..utils.logger import setup_logger - logger = setup_logger() diff --git a/gptqmodel/utils/bitblas.py b/gptqmodel/utils/bitblas.py index e257aabce..b80096f61 100644 --- a/gptqmodel/utils/bitblas.py +++ b/gptqmodel/utils/bitblas.py @@ -17,16 +17,14 @@ import threadpoolctl as tctl import torch -from bitblas.quantization import general_compress from ..nn_modules.qlinear.bitblas import BitBLASQuantLinear from ..quantization import FORMAT, QuantizeConfig from ..utils.logger import setup_logger -from .model import load_checkpoint_in_model_then_tie_weights, recurse_getattr, recurse_setattr +from .model import load_checkpoint_in_model_then_tie_weights from .progress import ProgressBar from .torch import torch_empty_cache - logger = setup_logger() def prepare_model_for_bitblas_load( diff --git a/gptqmodel/utils/device.py b/gptqmodel/utils/device.py index f8fa06251..7f8eeaeb0 100644 --- a/gptqmodel/utils/device.py +++ b/gptqmodel/utils/device.py @@ -14,7 +14,6 @@ # limitations under the License. 
from device_smi import Device - from gptqmodel.models._const import CPU, CUDA_0 diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index d96e29bba..2d95c9fa3 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -15,7 +15,7 @@ import os from collections import OrderedDict -from typing import Dict, Optional, Type, Union, List +from typing import Dict, List, Optional, Type, Union import torch @@ -35,7 +35,6 @@ from .rocm import IS_ROCM from .torch import HAS_CUDA, HAS_MPS, HAS_XPU - message_logged = False logger = setup_logger() diff --git a/gptqmodel/utils/logger.py b/gptqmodel/utils/logger.py index 0366ef5bd..6054d3b94 100644 --- a/gptqmodel/utils/logger.py +++ b/gptqmodel/utils/logger.py @@ -15,7 +15,6 @@ import logging - # global static/shared logger instance logger = None diff --git a/gptqmodel/utils/marlin.py b/gptqmodel/utils/marlin.py index 1f773d4f9..3093de8dd 100644 --- a/gptqmodel/utils/marlin.py +++ b/gptqmodel/utils/marlin.py @@ -18,12 +18,11 @@ from ..nn_modules.qlinear.marlin import MarlinQuantLinear, _get_perms from ..quantization import FORMAT, QuantizeConfig from ..utils.logger import setup_logger -from .model import load_checkpoint_in_model_then_tie_weights, recurse_getattr, recurse_setattr +from .model import load_checkpoint_in_model_then_tie_weights from .progress import ProgressBar from .rocm import IS_ROCM from .torch import torch_empty_cache - logger = setup_logger() diff --git a/gptqmodel/utils/mlx.py b/gptqmodel/utils/mlx.py index dadbae4d5..9fa642917 100644 --- a/gptqmodel/utils/mlx.py +++ b/gptqmodel/utils/mlx.py @@ -10,7 +10,6 @@ from .progress import ProgressBar from .torch import torch_empty_cache - try: import mlx.core as mx from mlx_lm import generate diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 52e2ced29..fb5d6e7d1 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -38,19 +38,12 @@ from transformers.pytorch_utils import id_tensor_storage from transformers.utils.hub import cached_file -from ..models._const import ( - CPU, - DEVICE, - EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, - EXPERT_INDEX_PLACEHOLDER, - SUPPORTED_MODELS, - SUPPORTS_MODULE_TYPES, -) +from ..models._const import (CPU, DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, + EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS, SUPPORTS_MODULE_TYPES) from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.exllama import ExllamaQuantLinear from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear from ..nn_modules.qlinear.ipex import IPEXQuantLinear -from ..nn_modules.qlinear.torch import TorchQuantLinear from ..quantization import FORMAT, QuantizeConfig from ..quantization.config import dynamic_get from .backend import BACKEND @@ -59,7 +52,6 @@ from .progress import ProgressBar from .torch import torch_empty_cache - logger = setup_logger() diff --git a/gptqmodel/utils/openai_server.py b/gptqmodel/utils/openai_server.py index 83ecda18b..1a03c195b 100644 --- a/gptqmodel/utils/openai_server.py +++ b/gptqmodel/utils/openai_server.py @@ -19,7 +19,6 @@ import torch - try: import uvicorn from fastapi import FastAPI, HTTPException diff --git a/gptqmodel/utils/perplexity.py b/gptqmodel/utils/perplexity.py index b7e02c90d..057f42ff9 100644 --- a/gptqmodel/utils/perplexity.py +++ b/gptqmodel/utils/perplexity.py @@ -18,7 +18,6 @@ import numpy as np import torch from datasets import load_dataset, load_from_disk - from gptqmodel.utils.progress import ProgressBar diff --git a/gptqmodel/utils/rocm.py b/gptqmodel/utils/rocm.py 
index 1284dd6ef..cdfae3396 100644 --- a/gptqmodel/utils/rocm.py +++ b/gptqmodel/utils/rocm.py @@ -15,5 +15,4 @@ import torch - IS_ROCM = torch.version.hip is not None diff --git a/gptqmodel/utils/safetensor.py b/gptqmodel/utils/safetensor.py index df0caf320..ab906f9cb 100644 --- a/gptqmodel/utils/safetensor.py +++ b/gptqmodel/utils/safetensor.py @@ -1,11 +1,9 @@ import os -import torch +import torch from accelerate.utils import find_tied_parameters -from safetensors import safe_open - - from gptqmodel.utils.model import recurse_getattr, recurse_setattr +from safetensors import safe_open # debug print all safetensor files in a directory and print its properties @@ -41,4 +39,4 @@ def untie_weights(model): model, param_name, recurse_getattr(model, param_name).clone(), - ) \ No newline at end of file + ) diff --git a/gptqmodel/utils/sglang.py b/gptqmodel/utils/sglang.py index fde427114..656281d70 100644 --- a/gptqmodel/utils/sglang.py +++ b/gptqmodel/utils/sglang.py @@ -18,7 +18,6 @@ import torch from transformers import AutoConfig - try: import sglang as sgl SGLANG_AVAILABLE = True diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py index 7fcc7205f..9ab6eb293 100644 --- a/gptqmodel/utils/torch.py +++ b/gptqmodel/utils/torch.py @@ -17,7 +17,6 @@ import torch - HAS_CUDA = False HAS_XPU = False HAS_MPS = False diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index d7d362ece..3c1c0cba2 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -17,7 +17,6 @@ import torch - try: from vllm import LLM, SamplingParams diff --git a/setup.py b/setup.py index bf9310d33..13ffbe3fd 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,6 @@ from setuptools import find_packages, setup - try: from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel except BaseException: @@ -126,7 +125,6 @@ def get_version_tag() -> str: import torch # noqa: E402 - if TORCH_CUDA_ARCH_LIST is None: HAS_CUDA_V8 = any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count())) diff --git a/tests/benchmark/benchmark.py b/tests/benchmark/benchmark.py index 622a35d95..35f1ab919 100644 --- a/tests/benchmark/benchmark.py +++ b/tests/benchmark/benchmark.py @@ -14,9 +14,8 @@ # limitations under the License. 
from benchmark_test import BenchmarkTest -from parameterized import parameterized # noqa: E402 - from gptqmodel import BACKEND +from parameterized import parameterized # noqa: E402 class TestInference(BenchmarkTest): diff --git a/tests/benchmark/benchmark_test.py b/tests/benchmark/benchmark_test.py index 05824af41..5f7baf644 100644 --- a/tests/benchmark/benchmark_test.py +++ b/tests/benchmark/benchmark_test.py @@ -16,15 +16,13 @@ import os import time - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.utils.progress import ProgressBar # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 class BenchmarkTest(unittest.TestCase): diff --git a/tests/inference_speed.py b/tests/inference_speed.py index ecb10e85e..8bb2a122e 100644 --- a/tests/inference_speed.py +++ b/tests/inference_speed.py @@ -16,15 +16,14 @@ import os import time - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest -from transformers import AutoTokenizer from gptqmodel import GPTQModel from gptqmodel.utils.progress import ProgressBar +from transformers import AutoTokenizer class InferenceSpeed(unittest.TestCase): diff --git a/tests/models/model_test.py b/tests/models/model_test.py index 499e36ec2..138d07b2a 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -17,14 +17,12 @@ import os import sys - if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch from pathlib import Path # noqa: E402 - sys.path.insert(0, f"{str(Path(__file__).resolve().parent.parent)}/models") # noqa: E402 import contextlib # noqa: E402 import shutil # noqa: E402 @@ -34,10 +32,6 @@ import torch.cuda # noqa: E402 import transformers # noqa: E402 from datasets import load_dataset # noqa: E402 -from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 -from packaging.version import Version # noqa: E402 -from transformers import AutoProcessor, AutoTokenizer # noqa: E402 - from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 @@ -45,7 +39,9 @@ from gptqmodel.utils.eval import lm_eval # noqa: E402 from gptqmodel.utils.model import MODALITY # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 - +from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 +from packaging.version import Version # noqa: E402 +from transformers import AutoProcessor, AutoTokenizer # noqa: E402 RAND_SEED = 898 diff --git a/tests/models/test_gptbigcode.py b/tests/models/test_gptbigcode.py index d003f7a87..015473d9b 100644 --- a/tests/models/test_gptbigcode.py +++ b/tests/models/test_gptbigcode.py @@ -16,7 +16,6 @@ import importlib.util import os - # TODO: find how ipex registered it jit interpreter # if intel_extension_for_pytorch was installed, @torch.jit.script in transformers/models/gpt_bigcode/modeling_gpt_bigcode.py will try to use ipex as torchScript interpreter. 
# However, in quantization, tensor were on gpu, which will throw RuntimeError: itensor_view_from_dense expects CPU tensor input diff --git a/tests/models/test_opt.py b/tests/models/test_opt.py index 6861f5276..6bb2d4965 100644 --- a/tests/models/test_opt.py +++ b/tests/models/test_opt.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from model_test import ModelTest - from gptqmodel import BACKEND from gptqmodel.utils.importer import backend_dict +from model_test import ModelTest class TestOpt(ModelTest): diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index 41dca1c68..63d2c46c7 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from model_test import ModelTest - from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ +from model_test import ModelTest class TestQwen2_VL(ModelTest): diff --git a/tests/tasks/mmlu/_generate_configs.py b/tests/tasks/mmlu/_generate_configs.py index 28b94616d..f613f7cd4 100644 --- a/tests/tasks/mmlu/_generate_configs.py +++ b/tests/tasks/mmlu/_generate_configs.py @@ -9,7 +9,6 @@ import yaml from tqdm import tqdm - eval_logger = logging.getLogger("lm-eval") diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py index fba17f04e..ddeede3bd 100644 --- a/tests/test_asym_gptq_v1.py +++ b/tests/test_asym_gptq_v1.py @@ -16,13 +16,11 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +from gptqmodel.quantization import FORMAT # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 -from gptqmodel.quantization import FORMAT # noqa: E402 - class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" diff --git a/tests/test_bits.py b/tests/test_bits.py index ce58cca55..b70163907 100644 --- a/tests/test_bits.py +++ b/tests/test_bits.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -24,9 +23,6 @@ import traceback # noqa: E402 import unittest # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -37,7 +33,8 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import lm_eval # noqa: E402 - +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 logger = logging.getLogger(__name__) diff --git a/tests/test_dynamic.py b/tests/test_dynamic.py index dddae591b..fa3827d81 100644 --- a/tests/test_dynamic.py +++ b/tests/test_dynamic.py @@ -17,18 +17,12 @@ import os from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear -from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -import tempfile # noqa: E402 import json - -from datasets import load_dataset 
# noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 +import tempfile # noqa: E402 from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 @@ -36,6 +30,9 @@ from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity, safetensor # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 class TestDynamic(ModelTest): diff --git a/tests/test_estimate_vram.py b/tests/test_estimate_vram.py index b505dc253..70f20c03c 100644 --- a/tests/test_estimate_vram.py +++ b/tests/test_estimate_vram.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 diff --git a/tests/test_eval.py b/tests/test_eval.py index d910bea33..ecdee8c05 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -18,12 +18,10 @@ import unittest from typing import Union -from lm_eval.tasks import TaskManager -from parameterized import parameterized - from gptqmodel import GPTQModel from gptqmodel.utils.eval import EVAL - +from lm_eval.tasks import TaskManager +from parameterized import parameterized os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/tests/test_evalplus.py b/tests/test_evalplus.py index 1e381d68d..40703b0bc 100644 --- a/tests/test_evalplus.py +++ b/tests/test_evalplus.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_flash_attention.py b/tests/test_flash_attention.py index df676d97e..0dc985599 100644 --- a/tests/test_flash_attention.py +++ b/tests/test_flash_attention.py @@ -16,16 +16,14 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch +from gptqmodel import GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 - class Test(ModelTest): diff --git a/tests/test_group_size.py b/tests/test_group_size.py index 2dc4d66bb..b6452952a 100644 --- a/tests/test_group_size.py +++ b/tests/test_group_size.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -24,9 +23,6 @@ import traceback # noqa: E402 import unittest # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -37,7 +33,8 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import lm_eval # noqa: E402 - +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 logger = logging.getLogger(__name__) diff --git a/tests/test_inference_speed.py b/tests/test_inference_speed.py index b34eb094e..c372e9259 100644 --- 
a/tests/test_inference_speed.py +++ b/tests/test_inference_speed.py @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +# -- do not touch import os -from parameterized import parameterized - -from gptqmodel.utils import BACKEND -from inference_speed import InferenceSpeed os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +from gptqmodel.utils import BACKEND # noqa: E402 +# -- end do not touch +from inference_speed import InferenceSpeed # noqa: E402 +from parameterized import parameterized # noqa: E402 ''' NATIVE_MODEL_ID = /monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortext-v1 @@ -54,4 +55,4 @@ def test_inference_speed(self, model_path, backend, tokens_per_second): # (there is a cache when running bitblas for the second time), # so only the results of the second run of bitblas are asserted. # The first run of bitblas only prints relevant information - self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second, compile=True, warmup_runs=1) \ No newline at end of file + self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second, compile=True, warmup_runs=1) diff --git a/tests/test_inference_speed_ipex.py b/tests/test_inference_speed_ipex.py index bcb64e11c..24b132aa8 100644 --- a/tests/test_inference_speed_ipex.py +++ b/tests/test_inference_speed_ipex.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +# -- do not touch import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch from gptqmodel.utils import BACKEND -from parameterized import parameterized from inference_speed import InferenceSpeed +from parameterized import parameterized class TestInferenceSpeedIpex(InferenceSpeed): @@ -29,4 +31,4 @@ class TestInferenceSpeedIpex(InferenceSpeed): ] ) def test_inference_speed_ipex(self, model_path, backend, tokens_per_second): - self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second) \ No newline at end of file + self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second) diff --git a/tests/test_ipex_xpu.py b/tests/test_ipex_xpu.py index 4fe06517b..f0c1bff66 100644 --- a/tests/test_ipex_xpu.py +++ b/tests/test_ipex_xpu.py @@ -16,16 +16,14 @@ # -- do not touch import os - os.environ["CUDA_VISIBLE_DEVICES"] = "" # -- end do not touch import tempfile # noqa: E402 -from models.model_test import ModelTest # noqa: E402 - from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.models._const import DEVICE # noqa: E402 +from models.model_test import ModelTest # noqa: E402 class TestsIPEX(ModelTest): diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py index ce18d9c48..e7d352667 100644 --- a/tests/test_lm_eval.py +++ b/tests/test_lm_eval.py @@ -15,15 +15,12 @@ # -- do not touch import os - # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 - from gptqmodel.utils.eval import lm_eval # noqa: E402 - +from lm_eval.utils import make_table # noqa: E402 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/tests/test_lm_head.py b/tests/test_lm_head.py index 3b141e4b6..29b36bcb7 100644 --- a/tests/test_lm_head.py +++ b/tests/test_lm_head.py @@ -19,13 +19,11 @@ from datasets import load_dataset - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -# -- end do not touch -from models.model_test 
import ModelTest # noqa: E402 - from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 +# -- end do not touch +from models.model_test import ModelTest # noqa: E402 class TestLmHeadLoad(ModelTest): diff --git a/tests/test_mlx.py b/tests/test_mlx.py index d3fa1137b..32ca4125f 100644 --- a/tests/test_mlx.py +++ b/tests/test_mlx.py @@ -1,7 +1,6 @@ import os import sys - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if sys.platform == "darwin": @@ -9,12 +8,11 @@ import tempfile # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 from mlx_lm import generate, load # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 - class TestExport(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct/gptq_4bits_01-07_14-18-11_maxlen1024_ns1024_descFalse_damp0.1/" diff --git a/tests/test_mlx_generate.py b/tests/test_mlx_generate.py index f8581101b..f3484bfe1 100644 --- a/tests/test_mlx_generate.py +++ b/tests/test_mlx_generate.py @@ -1,16 +1,13 @@ import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import sys # noqa: E402 - if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" -from models.model_test import ModelTest # noqa: E402 - from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from models.model_test import ModelTest # noqa: E402 class TestMlxGenerate(ModelTest): diff --git a/tests/test_openai_server.py b/tests/test_openai_server.py index c5c4cb0f4..19f3e33aa 100644 --- a/tests/test_openai_server.py +++ b/tests/test_openai_server.py @@ -17,10 +17,8 @@ import unittest import openai - from gptqmodel import GPTQModel - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" class TestOpeniServer(unittest.TestCase): diff --git a/tests/test_packing.py b/tests/test_packing.py index 4abaf0b7a..484bff883 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -19,11 +19,8 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -import time # noqa: E402 import unittest # noqa: E402 -from parameterized import parameterized # noqa: E402 - # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py index a685cbeeb..4b843117c 100644 --- a/tests/test_packing_speed.py +++ b/tests/test_packing_speed.py @@ -16,7 +16,6 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,7 +24,6 @@ from parameterized import parameterized # noqa: E402 - # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 @@ -33,7 +31,6 @@ from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 -from gptqmodel.nn_modules.qlinear.utils import dequantize_4bits_weight # noqa: E402 def gen_quant4(k, n, groupsize=-1): diff --git a/tests/test_parameter_count.py b/tests/test_parameter_count.py index 260ac2541..599c5823a 100644 --- a/tests/test_parameter_count.py +++ b/tests/test_parameter_count.py @@ -2,11 +2,10 @@ import tempfile import torch.cuda -from models.model_test import ModelTest -from safetensors.torch import load_file - from gptqmodel import GPTQModel, QuantizeConfig from gptqmodel.utils.tensor import tensor_parameters +from models.model_test import 
ModelTest
+from safetensors.torch import load_file


 class TestsParameterCount(ModelTest):
@@ -20,11 +19,10 @@ class TestsParameterCount(ModelTest):
     def test_parameter_count(self):
         import os.path

-        from huggingface_hub import hf_hub_download
-        from safetensors.torch import load_file
-
         from gptqmodel import QuantizeConfig
         from gptqmodel.utils.tensor import tensor_parameters
+        from huggingface_hub import hf_hub_download
+        from safetensors.torch import load_file

         model_id = "/monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1"
         if os.path.isdir(model_id):
diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py
index dbef77856..08e826e6f 100644
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -24,13 +23,12 @@
 import unittest # noqa: E402

 from datasets import load_dataset # noqa: E402
-from parameterized import parameterized # noqa: E402
-from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402
-
 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization.config import FORMAT, QUANT_METHOD, AutoRoundQuantizeConfig, QuantizeConfig # noqa: E402
 from gptqmodel.utils import Perplexity # noqa: E402
 from gptqmodel.utils.rocm import IS_ROCM # noqa: E402
+from parameterized import parameterized # noqa: E402
+from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402


 class TestPerplexity(unittest.TestCase):
diff --git a/tests/test_q4_bitblas.py b/tests/test_q4_bitblas.py
index 9911f31b5..0f89eb3ca 100644
--- a/tests/test_q4_bitblas.py
+++ b/tests/test_q4_bitblas.py
@@ -16,17 +16,15 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import unittest # noqa: E402

 import torch # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQ4BitBLAS(unittest.TestCase):
diff --git a/tests/test_q4_cuda.py b/tests/test_q4_cuda.py
index f8ada0d99..c9ee5ad15 100644
--- a/tests/test_q4_cuda.py
+++ b/tests/test_q4_cuda.py
@@ -16,18 +16,16 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-

 class TestsQ4CUDA(ModelTest):
diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py
index 913f57249..14fbd4b47 100644
--- a/tests/test_q4_exllama_v1.py
+++ b/tests/test_q4_exllama_v1.py
@@ -16,23 +16,20 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
-from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length # noqa: E402
 from gptqmodel.models._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402
 from gptqmodel.quantization import FORMAT # noqa: E402
 from gptqmodel.utils.importer import select_quant_linear # noqa: E402
 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402
-
+from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402
+from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 REFERENCE = torch.Tensor(
     [
diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py
index 3e650930e..e1a239cf5 100644
--- a/tests/test_q4_exllama_v2.py
+++ b/tests/test_q4_exllama_v2.py
@@ -16,22 +16,19 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import unittest # noqa: E402

 import torch # noqa: E402
-from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402
 from gptqmodel.quantization import FORMAT # noqa: E402
 from gptqmodel.utils.importer import select_quant_linear # noqa: E402
 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402
-
+from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 GENERATE_EVAL_SIZE = 100
diff --git a/tests/test_q4_ipex.py b/tests/test_q4_ipex.py
index 9693a1ace..654c3eacf 100644
--- a/tests/test_q4_ipex.py
+++ b/tests/test_q4_ipex.py
@@ -17,14 +17,12 @@
 import os
 import sys
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-
 from gptqmodel import BACKEND # noqa: E402
+from models.model_test import ModelTest # noqa: E402


 class TestsIPEX(ModelTest):
diff --git a/tests/test_q4_marlin.py b/tests/test_q4_marlin.py
index 51f8e95cb..478598923 100644
--- a/tests/test_q4_marlin.py
+++ b/tests/test_q4_marlin.py
@@ -16,18 +16,16 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
+from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402
-

 class TestQ4Marlin(ModelTest):
diff --git a/tests/test_q4_torch.py b/tests/test_q4_torch.py
index f71516dec..e9547da3b 100644
--- a/tests/test_q4_torch.py
+++ b/tests/test_q4_torch.py
@@ -16,16 +16,14 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-

 class TestsQ4Torch(ModelTest):
     GENERATE_EVAL_SIZE_MIN = 5
diff --git a/tests/test_q4_torch_apple.py b/tests/test_q4_torch_apple.py
index 65faeee43..00d85c08d 100644
--- a/tests/test_q4_torch_apple.py
+++ b/tests/test_q4_torch_apple.py
@@ -16,12 +16,11 @@
 import sys # noqa: E402

 import torch # noqa: E402
+from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel # noqa: E402
-

 class TestsQ4Torch(ModelTest):
     GENERATE_EVAL_SIZE_MIN = 5
diff --git a/tests/test_q4_triton.py b/tests/test_q4_triton.py
index 1b3c53e6f..4084903f2 100644
--- a/tests/test_q4_triton.py
+++ b/tests/test_q4_triton.py
@@ -16,16 +16,14 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import torch # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402
+from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestsQ4Triton(ModelTest):
diff --git a/tests/test_quant_batch.py b/tests/test_quant_batch.py
index 6c672961b..a80bb257e 100644
--- a/tests/test_quant_batch.py
+++ b/tests/test_quant_batch.py
@@ -16,19 +16,16 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import tempfile # noqa: E402
-import unittest # noqa: E402
-
-from transformers import AutoTokenizer # noqa: E402

 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization import QuantizeConfig # noqa: E402
 from gptqmodel.utils import Perplexity # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQuantBatch(ModelTest):
diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py
index 18fb037d5..c02dd3078 100644
--- a/tests/test_quant_formats.py
+++ b/tests/test_quant_formats.py
@@ -16,29 +16,22 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import json # noqa: E402
 import logging # noqa: E402
 import tempfile # noqa: E402
-import unittest # noqa: E402

 from datasets import load_dataset # noqa: E402
-from parameterized import parameterized # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402
 from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402
-from gptqmodel.quantization.config import ( # noqa: E402
-    META_FIELD_QUANTIZER,
-    META_QUANTIZER_GPTQMODEL,
-    AutoRoundQuantizeConfig,
-    QuantizeConfig,
-)
+from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402
+                                           AutoRoundQuantizeConfig, QuantizeConfig)
 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from parameterized import parameterized # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQuantization(ModelTest):
diff --git a/tests/test_quant_formats_auto_round.py b/tests/test_quant_formats_auto_round.py
index 9906ef39a..55607644e 100644
--- a/tests/test_quant_formats_auto_round.py
+++ b/tests/test_quant_formats_auto_round.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -25,19 +24,14 @@
 import tempfile # noqa: E402

 from datasets import load_dataset # noqa: E402
-from parameterized import parameterized # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402
 from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402
-from gptqmodel.quantization.config import ( # noqa: E402
-    META_FIELD_QUANTIZER,
-    META_QUANTIZER_GPTQMODEL,
-    AutoRoundQuantizeConfig,
-    QuantizeConfig,
-)
+from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402
+                                           AutoRoundQuantizeConfig, QuantizeConfig)
 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from parameterized import parameterized # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestQuantization(ModelTest):
diff --git a/tests/test_quant_time.py b/tests/test_quant_time.py
index ae9688d4b..b65559699 100644
--- a/tests/test_quant_time.py
+++ b/tests/test_quant_time.py
@@ -15,13 +15,10 @@
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

-import unittest # noqa: E402
 import time # noqa: E402

-from datasets import load_dataset # noqa: E402
 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization.config import QuantizeConfig # noqa: E402
 from models.model_test import ModelTest # noqa: E402
diff --git a/tests/test_quant_trust_remote.py b/tests/test_quant_trust_remote.py
index 24232223a..4a45500d6 100644
--- a/tests/test_quant_trust_remote.py
+++ b/tests/test_quant_trust_remote.py
@@ -16,21 +16,18 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import tempfile # noqa: E402
-import unittest # noqa: E402

 import transformers # noqa: E402
-from datasets import load_dataset # noqa: E402
-from packaging.version import Version # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402
 from models.model_test import ModelTest # noqa: E402
+from packaging.version import Version # noqa: E402
+from transformers import AutoTokenizer # noqa: E402
+

 class TestQuantWithTrustRemoteTrue(ModelTest):
     @classmethod
diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py
index e8a313392..f3a54f911 100644
--- a/tests/test_save_loaded_quantized_model.py
+++ b/tests/test_save_loaded_quantized_model.py
@@ -16,18 +16,15 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import tempfile # noqa: E402
 import unittest # noqa: E402

+from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402
 from parameterized import parameterized # noqa: E402
 from transformers import AutoTokenizer # noqa: E402

-from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402
-
-
 MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

 class TestSave(unittest.TestCase):
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index c1ee6a2eb..ca1213303 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
diff --git a/tests/test_sglang.py b/tests/test_sglang.py
index dd34a5d29..a47a0b65e 100644
--- a/tests/test_sglang.py
+++ b/tests/test_sglang.py
@@ -15,7 +15,6 @@
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -24,9 +23,8 @@
 import sys # noqa: E402

 import torch # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
+from models.model_test import ModelTest # noqa: E402


 class TestLoadSglang(ModelTest):
diff --git a/tests/test_sharded.py b/tests/test_sharded.py
index f21fb128f..02b013ead 100644
--- a/tests/test_sharded.py
+++ b/tests/test_sharded.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -25,9 +24,8 @@
 import unittest # noqa: E402

 import torch # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import GPTQModel # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestSharded(unittest.TestCase):
diff --git a/tests/test_tgi.py b/tests/test_tgi.py
index e0c9178c0..28d9a6135 100644
--- a/tests/test_tgi.py
+++ b/tests/test_tgi.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch

 import json # noqa: E402
diff --git a/tests/test_transformers_integration.py b/tests/test_transformers_integration.py
index 8a86bfd57..de2ce3585 100644
--- a/tests/test_transformers_integration.py
+++ b/tests/test_transformers_integration.py
@@ -14,15 +14,13 @@
 # limitations under the License.

 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

 import tempfile # noqa: E402

+from gptqmodel.integration import integration # noqa: E402
 from models.model_test import ModelTest # noqa: E402
 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402

-from gptqmodel.integration import integration # noqa: E402
-

 class TestTransformersIntegration(ModelTest):
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 216bd41ef..880cc632b 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -25,10 +24,8 @@
 import torch # noqa: E402
 import torch.utils.benchmark as benchmark # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel # noqa: E402
-
+from transformers import AutoTokenizer # noqa: E402

 MODEL_ID = "/monster/data/model/Llama-7B-GPTQ"
 DATASET_ID = "timdettmers/openassistant-guanaco"
diff --git a/tests/test_triton_xpu.py b/tests/test_triton_xpu.py
index cd2afa2d5..3f971a2e4 100644
--- a/tests/test_triton_xpu.py
+++ b/tests/test_triton_xpu.py
@@ -16,16 +16,14 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 # -- end do not touch

 import tempfile # noqa: E402

-from models.model_test import ModelTest # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402
 from gptqmodel.models._const import DEVICE # noqa: E402
+from models.model_test import ModelTest # noqa: E402


 class TestTritonXPU(ModelTest):
diff --git a/tests/test_verify_hash.py b/tests/test_verify_hash.py
index ae60766b8..ff2d444ff 100644
--- a/tests/test_verify_hash.py
+++ b/tests/test_verify_hash.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
diff --git a/tests/test_vllm.py b/tests/test_vllm.py
index 2e74e428b..671b3ccf3 100644
--- a/tests/test_vllm.py
+++ b/tests/test_vllm.py
@@ -16,7 +16,6 @@
 # -- do not touch
 import os
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -26,13 +25,11 @@
 import tempfile # noqa: E402

 import torch # noqa: E402
-from datasets import load_dataset # noqa: E402
-from models.model_test import ModelTest # noqa: E402
-from transformers import AutoTokenizer # noqa: E402
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402
 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402
 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
+from models.model_test import ModelTest # noqa: E402
+from transformers import AutoTokenizer # noqa: E402


 class TestLoadVLLM(ModelTest):