From d77e518cf79e4d7322aeca580bf5a6e977506952 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 23 May 2024 14:11:23 +0000
Subject: [PATCH] Fix examples

---
 example.py                  |  4 +---
 example_dataset.py          | 21 ++++-----------------
 examples/example_mixtral.py | 24 ++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 20 deletions(-)
 create mode 100644 examples/example_mixtral.py

diff --git a/example.py b/example.py
index 4ee920e..832cf1e 100644
--- a/example.py
+++ b/example.py
@@ -9,12 +9,10 @@
 examples = ["auto_fp8 is an easy-to-use model quantization library"]
 examples = tokenizer(examples, return_tensors="pt").to("cuda")
 
-ignore_patterns = ["re:.*gate"]
-
 quantize_config = BaseQuantizeConfig(
     quant_method="fp8",
     activation_scheme="dynamic",  # or "static"
-    ignore_patterns=ignore_patterns,
+    ignore_patterns=["re:.*lm_head"],
 )
 
 model = AutoFP8ForCausalLM.from_pretrained(
diff --git a/example_dataset.py b/example_dataset.py
index d98c9f8..403dba8 100644
--- a/example_dataset.py
+++ b/example_dataset.py
@@ -7,24 +7,11 @@
 quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"
 
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
 
+ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
+examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
+examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")
-DATASET_ID = "mgoin/ultrachat_2k"
-DATASET_SPLIT = "train_sft"
-ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
-ds = ds.map(
-    lambda batch: {
-        "text": tokenizer.apply_chat_template(batch["messages"], tokenize=False)
-    }
-)
-examples = [sample["text"] for sample in ds]
-tokenizer.pad_token = tokenizer.eos_token
-examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to(
-    "cuda"
-)
-
-quantize_config = BaseQuantizeConfig(
-    quant_method="fp8", activation_scheme="static"
-)  # or "static"
+quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")
 
 model = AutoFP8ForCausalLM.from_pretrained(
     pretrained_model_dir, quantize_config=quantize_config
diff --git a/examples/example_mixtral.py b/examples/example_mixtral.py
new file mode 100644
index 0000000..e37a900
--- /dev/null
+++ b/examples/example_mixtral.py
@@ -0,0 +1,24 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
+
+pretrained_model_dir = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+quantized_model_dir = "Mixtral-8x7B-Instruct-v0.1-FP8"
+
+tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
+ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(10))
+examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
+examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")
+
+quantize_config = BaseQuantizeConfig(
+    quant_method="fp8",
+    activation_scheme="static",
+    ignore_patterns=["re:.*lm_head", "re:.*gate"],
+)
+
+model = AutoFP8ForCausalLM.from_pretrained(
+    pretrained_model_dir, quantize_config=quantize_config
+)
+model.quantize(examples)
+model.save_quantized(quantized_model_dir)
\ No newline at end of file
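
Note on the calibration batch (not part of the patch): the rewritten examples call the tokenizer with padding=True, but the Llama 3 and Mixtral tokenizers ship without a pad token, so a pad token usually has to be assigned before building a padded batch. Below is a minimal sketch of the data preparation these examples rely on, assuming the same dataset and model id as the patch; mapping pad_token to eos_token and the max_length cap are assumptions, not something the patch specifies.

# Sketch: calibration-data prep assumed by the examples above.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", use_fast=True
)
if tokenizer.pad_token is None:
    # Assumption: reuse EOS as the pad token, since these tokenizers define none.
    tokenizer.pad_token = tokenizer.eos_token

# Same dataset and sample count as example_dataset.py.
ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
texts = [tokenizer.apply_chat_template(s["messages"], tokenize=False) for s in ds]
batch = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=2048,  # assumption: bound sequence length to keep calibration memory in check
    return_tensors="pt",
).to("cuda")
# `batch` is what the examples then pass to model.quantize(...).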