format (#1189)
* format

* ignore format

* format ignore E402

* install uv

* remove uv
Qubitium authored Jan 31, 2025
1 parent a4cb9bc commit fb60ef3
Showing 102 changed files with 125 additions and 366 deletions.
6 changes: 2 additions & 4 deletions examples/benchmark/generation_speed.py
@@ -22,12 +22,10 @@

import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.utils.progress import ProgressBar

from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

logger = logging.getLogger(__name__)

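Note: the reordering in this hunk, repeated across the example files below, follows the import grouping enforced by Ruff's isort rules (the "I" selection in format/ruff.toml): standard-library modules first, then third-party packages, then the repository's own gptqmodel package in its own first-party group, with each group separated by a blank line. A minimal sketch of the resulting layout for this file is shown below; the import logging line comes from the file's elided header and is assumed here.

    # Sketch of the final import layout in generation_speed.py (import logging assumed).
    import logging

    # Third-party packages.
    import torch
    from datasets import Dataset, load_dataset
    from transformers import AutoTokenizer, GenerationConfig
    from transformers.generation.logits_process import LogitsProcessor

    # First-party package (this repository).
    from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
    from gptqmodel.utils.progress import ProgressBar

    logger = logging.getLogger(__name__)
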
2 changes: 0 additions & 2 deletions examples/benchmark/ipex.py
@@ -19,7 +19,6 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


try:
from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
bind_cores_for_best_perf()
@@ -29,7 +28,6 @@

import argparse


parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")
4 changes: 1 addition & 3 deletions examples/benchmark/perplexity.py
@@ -17,10 +17,8 @@
import os

import torch
from transformers import AutoTokenizer

from gptqmodel.utils import Perplexity

from transformers import AutoTokenizer

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

4 changes: 1 addition & 3 deletions examples/evaluation/run_language_modeling_task.py
@@ -17,12 +17,10 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import LanguageModelingTask
from gptqmodel.utils.torch import torch_empty_cache

from transformers import AutoTokenizer

DATASET = "tatsu-lab/alpaca"
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
4 changes: 1 addition & 3 deletions examples/evaluation/run_sequence_classification_task.py
@@ -18,12 +18,10 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import SequenceClassificationTask
from gptqmodel.utils.torch import torch_empty_cache

from transformers import AutoTokenizer

DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
4 changes: 1 addition & 3 deletions examples/evaluation/run_text_summarization_task.py
@@ -18,12 +18,10 @@

import datasets
import torch
from transformers import AutoTokenizer, GenerationConfig

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import TextSummarizationTask
from gptqmodel.utils.torch import torch_empty_cache

from transformers import AutoTokenizer, GenerationConfig

os.system("pip install py7zr")

1 change: 0 additions & 1 deletion examples/inference/run_transformers.py
@@ -15,7 +15,6 @@

from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
4 changes: 1 addition & 3 deletions examples/inference/run_with_different_backends.py
@@ -18,10 +18,8 @@
import sys
from argparse import ArgumentParser

from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device

from transformers import AutoTokenizer

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
4 changes: 1 addition & 3 deletions examples/quantization/basic_usage.py
@@ -15,10 +15,8 @@

import os

from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device

from transformers import AutoTokenizer

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

4 changes: 1 addition & 3 deletions examples/quantization/basic_usage_autoround.py
@@ -14,11 +14,9 @@
# limitations under the License.

import torch
from transformers import AutoTokenizer

from gptqmodel import GPTQModel
from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402

from transformers import AutoTokenizer

pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 1 addition & 3 deletions examples/quantization/basic_usage_wikitext2.py
@@ -15,10 +15,8 @@

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig

from transformers import AutoTokenizer

pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"
1 change: 0 additions & 1 deletion examples/quantization/transformers_usage.py
@@ -15,7 +15,6 @@

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig


model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
2 changes: 1 addition & 1 deletion format/ruff.toml
@@ -1,5 +1,5 @@
# Never enforce `E501` (line length violations).
lint.ignore = ["C901", "E501", "E741", "W605"]
lint.ignore = ["C901", "E501", "E741", "W605", "E402"]
lint.select = ["C", "E", "F", "I", "W"]
line-length = 119
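
For context, E402 is the "module level import not at top of file" rule. Some files in this repository intentionally run setup code before their imports; gptqmodel/models/auto.py, for example, sets PYTORCH_CUDA_ALLOC_CONF near the top of the module before the heavy imports, which is exactly the pattern E402 flags. A minimal sketch of that pattern (an illustration of the rule, not code copied from this commit):

    import os

    # Configure the CUDA allocator through the environment before torch is
    # imported, mirroring what gptqmodel/models/auto.py does.
    if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None):
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch  # E402: module level import not at top of file, now ignored repo-wide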

1 change: 0 additions & 1 deletion gptqmodel/models/_const.py
@@ -24,7 +24,6 @@
from ..utils.rocm import IS_ROCM
from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU


CPU = device("cpu")
CUDA = device("cuda")
CUDA_0 = device("cuda:0")
7 changes: 2 additions & 5 deletions gptqmodel/models/auto.py
@@ -17,7 +17,6 @@

import os


if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None):
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True'
print("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.")
@@ -28,7 +27,6 @@

import sys # noqa: E402


# TODO: waiting for pytorch implementgation of aten ops for MPS
if sys.platform == "darwin":
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
@@ -96,7 +94,6 @@
from .definitions.xverse import XverseGPTQ # noqa: E402
from .definitions.yi import YiGPTQ # noqa: E402


logger = setup_logger()

MODEL_MAP = {
@@ -300,11 +297,10 @@ def eval(
if task not in EVAL.get_task_enums():
raise ValueError(f"lm_eval support tasks: {EVAL.get_all_tasks_string()}")

from gptqmodel.utils.eval import lm_eval
from lm_eval.utils import make_table
from transformers import AutoTokenizer

from gptqmodel.utils.eval import lm_eval

tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)

model_name = 'hf' if backend == 'gptqmodel' else backend
@@ -372,6 +368,7 @@ def export(model_id_or_path: str, target_path: str, format: str, trust_remote_co
if format == "mlx":
try:
from mlx_lm.utils import save_config, save_weights

from ..utils.mlx import convert_gptq_to_mlx_weights
except ImportError:
raise ValueError("MLX not installed. Please install via `pip install gptqmodel[mlx] --no-build-isolation`.")
30 changes: 6 additions & 24 deletions gptqmodel/models/base.py
@@ -36,33 +36,15 @@
from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory
from ..utils.importer import select_quant_linear
from ..utils.logger import setup_logger
from ..utils.model import (
MODALITY,
check_to_quantized,
find_layers,
get_device,
get_module,
get_module_by_name_prefix,
get_moe_layer_modules,
move_to,
nested_move_to,
normalize_tokenizer,
pack_model,
)
from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device,
get_module, get_module_by_name_prefix, get_moe_layer_modules,
move_to, nested_move_to, normalize_tokenizer, pack_model)
from ..utils.progress import ProgressBar
from ..utils.safetensor import untie_weights
from ..utils.torch import torch_empty_cache
from ._const import CPU, DEVICE, SUPPORTS_MODULE_TYPES
from .loader import ModelLoader
from .writer import (
QUANT_LOG_DAMP,
QUANT_LOG_FWD_TIME,
QUANT_LOG_LAYER,
QUANT_LOG_LOSS,
QUANT_LOG_MODULE,
QUANT_LOG_TIME,
ModelWriter,
)
from .writer import (QUANT_LOG_DAMP, QUANT_LOG_FWD_TIME, QUANT_LOG_LAYER,
QUANT_LOG_LOSS, QUANT_LOG_MODULE, QUANT_LOG_TIME, ModelWriter)

# pytorch 2.6.0 fixes many compilation errors
PYTORCH_MIN_VERFSION_WITH_COMPILE = Version("2.6.0")
@@ -931,7 +913,7 @@ def compile(self, backend="inductor", mode="reduce-overhead"):

try:
self.model = torch.compile(self.model, fullgraph=True, backend=backend, mode=mode)
except Exception as e:
except Exception:
logger.info("Compiling model again with `fullgraph=False`")
self.model = torch.compile(self.model, fullgraph=False, backend=backend, mode=mode)
return self
1 change: 0 additions & 1 deletion gptqmodel/models/definitions/gemma2.py
@@ -17,7 +17,6 @@
from ...utils.logger import setup_logger
from ..base import BaseGPTQModel


logger = setup_logger()

SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ."
26 changes: 6 additions & 20 deletions gptqmodel/models/loader.py
@@ -36,28 +36,14 @@
from ..utils.backend import BACKEND
from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear
from ..utils.logger import setup_logger
from ..utils.marlin import (
_validate_marlin_compatibility,
_validate_marlin_device_support,
prepare_model_for_marlin_load,
)
from ..utils.model import (
auto_dtype,
convert_gptq_v1_to_v2_format,
find_layers,
get_checkpoints,
get_moe_layer_modules,
gptqmodel_post_init,
load_checkpoint_in_model_then_tie_weights,
make_quant,
normalize_tokenizer,
simple_dispatch_model,
verify_model_hash,
verify_sharded_model_hashes,
)
from ..utils.marlin import (_validate_marlin_compatibility,
_validate_marlin_device_support, prepare_model_for_marlin_load)
from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_layers, get_checkpoints,
get_moe_layer_modules, gptqmodel_post_init,
load_checkpoint_in_model_then_tie_weights, make_quant, normalize_tokenizer,
simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes)
from ._const import DEVICE, SUPPORTED_MODELS, normalize_device


logger = setup_logger()

ATTN_IMPLEMENTATION = "attn_implementation"
30 changes: 6 additions & 24 deletions gptqmodel/models/writer.py
@@ -33,36 +33,18 @@
from transformers.models.auto.tokenization_auto import get_tokenizer_config
from transformers.utils.generic import ContextManagers

from ..quantization.config import (
FORMAT,
META_FIELD_DAMP_AUTO_INCREMENT,
META_FIELD_DAMP_PERCENT,
META_FIELD_MSE,
META_FIELD_QUANTIZER,
META_FIELD_STATIC_GROUPS,
META_FIELD_TRUE_SEQUENTIAL,
META_FIELD_URI,
META_QUANTIZER_GPTQMODEL,
META_VALUE_URI,
MIN_VERSION_WITH_V2,
)
from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE,
META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL,
META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2)
from ..utils.backend import BACKEND
from ..utils.logger import setup_logger
from ..utils.model import (
convert_gptq_v2_to_v1_format,
copy_py_files,
find_layers,
get_model_files_size,
get_moe_layer_modules,
get_state_dict_for_save,
load_checkpoint_in_model_then_tie_weights,
make_quant,
)
from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_layers,
get_model_files_size, get_moe_layer_modules, get_state_dict_for_save,
load_checkpoint_in_model_then_tie_weights, make_quant)
from ..utils.torch import torch_empty_cache
from ..version import __version__
from ._const import CPU


logger = setup_logger()

QUANT_LOG_LAYER = "layer"
2 changes: 1 addition & 1 deletion gptqmodel/nn_modules/qlinear/__init__.py
@@ -17,7 +17,7 @@
from typing import List, Optional, Tuple

import numpy as np
import torch as t # conflict with torch.py
import torch as t # conflict with torch.py
import torch.nn as nn
import transformers

4 changes: 1 addition & 3 deletions gptqmodel/nn_modules/qlinear/bitblas.py
@@ -22,13 +22,11 @@
import numpy as np
import torch
import torch.nn as nn

from gptqmodel.nn_modules.qlinear import BaseQuantLinear

from ...models._const import DEVICE, PLATFORM
from ...utils.logger import setup_logger


logger = setup_logger()

BITBLAS_TARGET = None
@@ -389,4 +387,4 @@ def forward(self, A):
return C


__all__ = ["BitBLASQuantLinear"]
__all__ = ["BitBLASQuantLinear"]
1 change: 0 additions & 1 deletion gptqmodel/nn_modules/qlinear/bitblas_target_detector.py
@@ -22,7 +22,6 @@

from ...utils.logger import setup_logger


logger = setup_logger()

TARGET_MISSING_ERROR = (
1 change: 0 additions & 1 deletion gptqmodel/nn_modules/qlinear/dynamic_cuda.py
@@ -16,7 +16,6 @@
from typing import Optional, Tuple

import torch

from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear
from gptqmodel.utils.logger import setup_logger

