From 29b580f1d0f112155e03b25abbd5dfa8549fea26 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Sat, 1 Feb 2025 22:17:12 +0800
Subject: [PATCH 01/15] Create quantization.md

---
 docs/backend/quantization.md | 102 +++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 docs/backend/quantization.md

diff --git a/docs/backend/quantization.md b/docs/backend/quantization.md
new file mode 100644
index 00000000000..2389368cc5a
--- /dev/null
+++ b/docs/backend/quantization.md
@@ -0,0 +1,102 @@
+# Quantization
+
+`SGLang` supports various quantization methods, including online dynamic quantization and offline quantization.
+
+Online quantization computes the weight scaling stats (max/min) dynamically at runtime, as exemplified by the delayed scaling used in NVIDIA FP8 training. For inference, this quantizes the model once at load time.
+
+Offline quantization saves pre-quantized model weights and loads them during inference. This is useful for methods that require pre-computed stats, such as AWQ, which collects activation stats from a calibration set.
+
+Please visit [here](https://huggingface.co/collections/neuralmagic) for some popular quantized LLMs on Hugging Face.
+
+## Online Quantization
+
+> Note: Although we support online quantization, we recommend that users load pre-quantized models.
+
+To enable online quantization, you can simply specify `--quantization` in the command line. For example, if you want to enable `FP8` quantization for the model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --quantization fp8 \
+    --port 30000 --host 0.0.0.0
+```
+
+Our team is working on supporting more quantization methods. We will soon support other quantization methods, including but not limited to `["awq", "gptq", "marlin", "gptq_marlin", "awq_marlin", "bitsandbytes", "gguf"]`.
+
+We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to enable this feature. For example, if you want to enable `int4wo-128` for the model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --torchao-config int4wo-128 \
+    --port 30000 --host 0.0.0.0
+```
+
+We support the following quantization methods based on torchao: `["int8dq", "int8wo", "fp8wo", "fp8dq-per_tensor", "fp8dq-per_row", "int4wo-32", "int4wo-64", "int4wo-128", "int4wo-256"]`.
+
+Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), the `"int8dq"` method currently has some bugs when used together with CUDA graph capture, so we suggest disabling CUDA graph capture when using `"int8dq"`. Namely, please use the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --torchao-config int8dq \
+    --disable-cuda-graph \
+    --port 30000 --host 0.0.0.0
+```
+
+
+## Offline Quantization
+
+To do offline quantization for your model, you first need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
+
+```bash
+pip install llmcompressor
+```
+
+Here, we take quantizing `meta-llama/Meta-Llama-3-8B-Instruct` to `FP8` as an example to elaborate on how to do offline quantization.
+
+```python
+from transformers import AutoTokenizer
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+# Step 1: Load the original model.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+model = SparseAutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Step 2: Perform offline quantization.
+# Step 2.1: Configure the simple PTQ quantization recipe.
+recipe = QuantizationModifier(
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+
+# Step 2.2: Apply the quantization algorithm.
+oneshot(model=model, recipe=recipe)
+
+# Step 3: Save the quantized model.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+Then, you can directly use the quantized model with `SGLang` by using the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path $PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic \
+    --port 30000 --host 0.0.0.0
+```
+
+**Note:** If the model has already been quantized offline, please **do not** add the `--quantization` argument when starting the engine.
+
+
+## Reference
+
+- [vLLM quantization documentation](https://docs.vllm.ai/en/latest/quantization/fp8.html)
+
+- [torchao](https://github.com/pytorch/ao)
+
+- [llm-compressor](https://github.com/vllm-project/llm-compressor/)

From 0fc00f8c02f6395fbdf2b7d39bc104e9b8ac128f Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Sat, 1 Feb 2025 22:18:52 +0800
Subject: [PATCH 02/15] Create quantization.ipynb

---
 docs/backend/quantization.ipynb | 281 ++++++++++++++++++++++++++++++++
 1 file changed, 281 insertions(+)
 create mode 100644 docs/backend/quantization.ipynb

diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb
new file mode 100644
index 00000000000..2f7c583e15b
--- /dev/null
+++ b/docs/backend/quantization.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Quantization\n",
+    "\n",
+    "`SGLang` support various quantization methods, including online dynamic quantization and offline quantization.\n",
+    "\n",
+    "Please visit [here](https://huggingface.co/collections/neuralmagic) for some popular quantized LLMs on huggingface.\n",
+    "\n",
+    "## Online Quantization\n",
+    "\n",
+    "> Note: Although we support online quantization, we recommend users to use quantized models. \n",
+    "\n",
+    "To enable online quantization, you can simply specify `--quantization` in the command line. For example, if you want to enable `FP8` quantization for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n",
+    "\n",
+    "```bash\n",
+    "python3 -m sglang.launch_server \\\n",
+    "    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "    --quantization fp8 \\\n",
+    "    --port 30000 --host 0.0.0.0\n",
+    "```\n",
+    "\n",
+    "which is equivalent to the following code block:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-27 05:23:35] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization='fp8', context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=559844691, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", + "[2024-12-27 05:23:44 TP0] Init torch distributed begin.\n", + "[2024-12-27 05:23:45 TP0] Load weight begin. avail mem=78.58 GB\n", + "[2024-12-27 05:23:46 TP0] Using model weights format ['*.safetensors']\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sglang.utils import (\n", + " execute_shell_command,\n", + " wait_for_server,\n", + " terminate_process,\n", + " print_highlight,\n", + ")\n", + "\n", + "server_process = execute_shell_command(\n", + " \"\"\"\n", + "python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --torchao-config int4wo-128 --port 30000 --host 0.0.0.0\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30000\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We support the following quantization methods based on torchao `[\"int8dq\", \"int8wo\", \"fp8wo\", \"fp8dq-per_tensor\", \"fp8dq-per_row\", \"int4wo-32\", \"int4wo-64\", \"int4wo-128\", \"int4wo-256\"]`\n", + "\n", + "Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `\"int8dq\"` method currently has some bugs when using together with cuda graph capture. So we suggest to disable cuda graph capture when using `\"int8dq\"` method. Namely, please use the following command:\n", + "\n", + "```bash\n", + "python3 -m sglang.launch_server \\\n", + " --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", + " --torchao-config int8dq \\\n", + " --disable-cuda-graph \\\n", + " --port 30000 --host 0.0.0.0\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Offline Quantization\n", + "\n", + "To do offline quantization for your model, firstly you need to install [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:\n", + "\n", + "```bash\n", + "pip install llmcompressor\n", + "```\n", + "\n", + "Here, we take quantize `meta-llama/Meta-Llama-3-8B-Instruct` to `FP8` as an example to elaborate on how to do offline quantization.\n", + "\n", + "```python\n", + "from transformers import AutoTokenizer\n", + "from llmcompressor.transformers import SparseAutoModelForCausalLM\n", + "from llmcompressor.transformers import oneshot\n", + "from llmcompressor.modifiers.quantization import QuantizationModifier\n", + "\n", + "# Step 1: Load the original model.\n", + "MODEL_ID = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n", + "\n", + "model = SparseAutoModelForCausalLM.from_pretrained(\n", + " MODEL_ID, device_map=\"auto\", torch_dtype=\"auto\")\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n", + "\n", + "# Step 2: Perform offline quantization.\n", + "# Step 2.1: Configure the simple PTQ quantization.\n", + "recipe = QuantizationModifier(\n", + " targets=\"Linear\", scheme=\"FP8_DYNAMIC\", ignore=[\"lm_head\"])\n", + "\n", + "# Step 2.2: Apply the quantization algorithm.\n", + "oneshot(model=model, recipe=recipe)\n", + "\n", + "# Step 3: Save the model.\n", + "SAVE_DIR = MODEL_ID.split(\"/\")[1] + \"-FP8-Dynamic\"\n", + "model.save_pretrained(SAVE_DIR)\n", + "tokenizer.save_pretrained(SAVE_DIR)\n", + "```\n", + "\n", + "Then, you can directly use the quantized model with `SGLang`, by using the following command:\n", + "\n", + "```bash\n", + "python3 -m sglang.launch_server \\\n", + " --model-path $PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic \\\n", + " --port 30000 --host 0.0.0.0\n", + "```\n", + "\n", + "Note: If the model has already quantized offline, please **do not** add `--quantization` argument when starting the engine.\n", + "\n", + "\n", + "## Reference\n", + "\n", + "- [quantization document of vllm](https://docs.vllm.ai/en/latest/quantization/fp8.html)\n", + "\n", 
+ "- [torchao](https://github.com/pytorch/ao)\n", + "\n", + "- [llm-compressor](https://github.com/vllm-project/llm-compressor/)\n" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 24ee864d16132f63e443355672b3e1c212d75245 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:25:14 +0800 Subject: [PATCH 03/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 2f7c583e15b..a09804c4074 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -6,13 +6,17 @@ "source": [ "# Quantization\n", "\n", - "`SGLang` support various quantization methods, including online dynamic quantization and offline quantization.\n", + "SGLang support various quantization methods, including offline quantization for weight and online dynamic quantization for activation only (we do not recommend online quantization for weights).\n", "\n", - "Please visit [here](https://huggingface.co/collections/neuralmagic) for some popular quantized LLMs on huggingface.\n", + For model quantization, you have three options: + + Use official quantized versions if available (recommended, e.g. official Llama quantized models) + Use third-party quantized versions (e.g. models from neuralmagic collection here) + Quantize the models yourself\n", "\n", "## Online Quantization\n", "\n", - "> Note: Although we support online quantization, we recommend users to use quantized models. \n", + "> Note: Although we support online quantization, users are advised to load offline quantized weights \n", "\n", "To enable online quantization, you can simply specify `--quantization` in the command line. For example, if you want to enable `FP8` quantization for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n", "\n", From 89c390b7ec6bded8d92e122bb30744a3fbe2a686 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:25:42 +0800 Subject: [PATCH 04/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index a09804c4074..31aba9b6895 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -8,7 +8,7 @@ "\n", "SGLang support various quantization methods, including offline quantization for weight and online dynamic quantization for activation only (we do not recommend online quantization for weights).\n", "\n", - For model quantization, you have three options: + "For model quantization, you have three options: Use official quantized versions if available (recommended, e.g. official Llama quantized models) Use third-party quantized versions (e.g. 
models from neuralmagic collection here) From 1412ec90b2ea583716fab95136cc7803c5920c94 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:26:54 +0800 Subject: [PATCH 05/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 31aba9b6895..ee4dac5c222 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -8,11 +8,10 @@ "\n", "SGLang support various quantization methods, including offline quantization for weight and online dynamic quantization for activation only (we do not recommend online quantization for weights).\n", "\n", - "For model quantization, you have three options: - - Use official quantized versions if available (recommended, e.g. official Llama quantized models) - Use third-party quantized versions (e.g. models from neuralmagic collection here) - Quantize the models yourself\n", + """For model quantization, you have three options:\n\n\ + 1. Use official quantized versions if available (recommended, e.g. official Llama quantized models)\n\ + 2. Use third-party quantized versions (e.g. models from neuralmagic collection here)\n\ + 3. Quantize the models yourself\n""" "\n", "## Online Quantization\n", "\n", From 97778064ef893cb3857091f195c4915280c5aec3 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:27:14 +0800 Subject: [PATCH 06/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index ee4dac5c222..ca40527ea63 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -11,7 +11,7 @@ """For model quantization, you have three options:\n\n\ 1. Use official quantized versions if available (recommended, e.g. official Llama quantized models)\n\ 2. Use third-party quantized versions (e.g. models from neuralmagic collection here)\n\ - 3. Quantize the models yourself\n""" + 3. Quantize the models yourself\n""", "\n", "## Online Quantization\n", "\n", From 020ef7a5d2c75f004da964254a5dad1d5212d08e Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:28:27 +0800 Subject: [PATCH 07/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index ca40527ea63..961d06c08e0 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -8,10 +8,11 @@ "\n", "SGLang support various quantization methods, including offline quantization for weight and online dynamic quantization for activation only (we do not recommend online quantization for weights).\n", "\n", - """For model quantization, you have three options:\n\n\ - 1. Use official quantized versions if available (recommended, e.g. official Llama quantized models)\n\ - 2. Use third-party quantized versions (e.g. models from neuralmagic collection here)\n\ - 3. Quantize the models yourself\n""", + "For model quantization, you have three options:\n", + "\n", + "1. Use official quantized versions if available (recommended, e.g. official Llama quantized models)\n", + "2. Use third-party quantized versions (e.g. models from neuralmagic collection here)\n", + "3. 
Quantize the models yourself\n", "\n", "## Online Quantization\n", "\n", From b2a713a426aa628fb667ae9a817548d72c8ae3f5 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:31:49 +0800 Subject: [PATCH 08/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 961d06c08e0..b31e5cc7a9b 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -105,6 +105,8 @@ "source": [ "Our team is working on supporting more quantization methods. We will soon support other quantization methods including but not limited to `[\"awq\", \"gptq\", \"marlin\", \"gptq_marlin\", \"awq_marlin\", \"bitsandbytes\", \"gguf\"]`\n", "\n", + "**Note that:** Some of these quantization methods are still under development and may not be fully stable yet.\n" + "\n" "We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n", "\n", "```bash\n", From 2a8c33bbc68f101f1d974f8facca8055e5309de2 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:32:04 +0800 Subject: [PATCH 09/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index b31e5cc7a9b..cb7002b7223 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -105,7 +105,7 @@ "source": [ "Our team is working on supporting more quantization methods. We will soon support other quantization methods including but not limited to `[\"awq\", \"gptq\", \"marlin\", \"gptq_marlin\", \"awq_marlin\", \"bitsandbytes\", \"gguf\"]`\n", "\n", - "**Note that:** Some of these quantization methods are still under development and may not be fully stable yet.\n" + "**Note that:** Some of these quantization methods are still under development and may not be fully stable yet.\n", "\n" "We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n", "\n", From 1e771e7d3f0cfe6543c733604a3850c1aed7619c Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:32:17 +0800 Subject: [PATCH 10/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index cb7002b7223..918a803c566 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -106,7 +106,7 @@ "Our team is working on supporting more quantization methods. 
We will soon support other quantization methods including but not limited to `[\"awq\", \"gptq\", \"marlin\", \"gptq_marlin\", \"awq_marlin\", \"bitsandbytes\", \"gguf\"]`\n", "\n", "**Note that:** Some of these quantization methods are still under development and may not be fully stable yet.\n", - "\n" + "\n", "We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n", "\n", "```bash\n", From e55d75e827ab905c473d0274744952f7b2fb0be9 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:42:18 +0800 Subject: [PATCH 11/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 918a803c566..0bccde81276 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -42,7 +42,13 @@ { "data": { "text/html": [ - "

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" + "
\n", + "\n", + " NOTE: Typically, the server runs in a separate terminal.
\n", + " In this notebook, we run the server and notebook code together, so their outputs are combined.
\n", + " To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
\n", + "
\n", + "
" ], "text/plain": [ "" From 3cc21e4fc68ca7b5e77786ad943b640ddb1bf649 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 22:54:47 +0800 Subject: [PATCH 12/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 0bccde81276..15096e2a9cc 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -109,20 +109,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our team is working on supporting more quantization methods. We will soon support other quantization methods including but not limited to `[\"awq\", \"gptq\", \"marlin\", \"gptq_marlin\", \"awq_marlin\", \"bitsandbytes\", \"gguf\"]`\n", + "Our team is working on supporting more quantization methods. We will soon support other quantization methods including but not limited to `[\"gptq\", \"marlin\", \"gptq_marlin\", \"awq_marlin\", \"bitsandbytes\", \"gguf\"]`\n", "\n", "**Note that:** Some of these quantization methods are still under development and may not be fully stable yet.\n", "\n", "We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n", "\n", - "```bash\n", - "python3 -m sglang.launch_server \\\n", - " --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - " --torchao-config int4wo-128 \\\n", - " --port 30000 --host 0.0.0.0\n", - "```\n", - "\n", - "which is equivalent to the following code block:" ] }, { @@ -180,6 +172,7 @@ } ], "source": [ + "\n", "from sglang.utils import (\n", " execute_shell_command,\n", " wait_for_server,\n", @@ -202,15 +195,8 @@ "source": [ "We support the following quantization methods based on torchao `[\"int8dq\", \"int8wo\", \"fp8wo\", \"fp8dq-per_tensor\", \"fp8dq-per_row\", \"int4wo-32\", \"int4wo-64\", \"int4wo-128\", \"int4wo-256\"]`\n", "\n", - "Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `\"int8dq\"` method currently has some bugs when using together with cuda graph capture. So we suggest to disable cuda graph capture when using `\"int8dq\"` method. Namely, please use the following command:\n", + "Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `\"int8dq\"` method currently has some bugs when using together with cuda graph capture. So we suggest to use `--disable-cuda-graph` capture when using `\"int8dq\"` method. 
\n", "\n", - "```bash\n", - "python3 -m sglang.launch_server \\\n", - " --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - " --torchao-config int8dq \\\n", - " --disable-cuda-graph \\\n", - " --port 30000 --host 0.0.0.0\n", - "```" ] }, { From 12453219fd6f6c2281ed396f60e91f74c5e09ecd Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 23:00:56 +0800 Subject: [PATCH 13/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 15096e2a9cc..6ab0aae4795 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -113,8 +113,8 @@ "\n", "**Note that:** Some of these quantization methods are still under development and may not be fully stable yet.\n", "\n", - "We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:\n", - "\n", + "We also support quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct` \n", + "\n" ] }, { @@ -196,7 +196,7 @@ "We support the following quantization methods based on torchao `[\"int8dq\", \"int8wo\", \"fp8wo\", \"fp8dq-per_tensor\", \"fp8dq-per_row\", \"int4wo-32\", \"int4wo-64\", \"int4wo-128\", \"int4wo-256\"]`\n", "\n", "Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `\"int8dq\"` method currently has some bugs when using together with cuda graph capture. So we suggest to use `--disable-cuda-graph` capture when using `\"int8dq\"` method. \n", - "\n", + "\n" ] }, { From 1fe89de49fd3051b6bed2ca9c6c51128fd32265b Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 23:02:14 +0800 Subject: [PATCH 14/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 6ab0aae4795..962e99e3e49 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -39,24 +39,6 @@ } }, "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " NOTE: Typically, the server runs in a separate terminal.
\n", - " In this notebook, we run the server and notebook code together, so their outputs are combined.
\n", - " To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", From e4f3253d6aeafe6a746a64df7a5d451ce52c89c1 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 1 Feb 2025 23:03:07 +0800 Subject: [PATCH 15/15] Update quantization.ipynb --- docs/backend/quantization.ipynb | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/docs/backend/quantization.ipynb b/docs/backend/quantization.ipynb index 962e99e3e49..da7896e5fed 100644 --- a/docs/backend/quantization.ipynb +++ b/docs/backend/quantization.ipynb @@ -39,6 +39,24 @@ } }, "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " NOTE: Typically, the server runs in a separate terminal.
\n", + " In this notebook, we run the server and notebook code together, so their outputs are combined.
\n", + " To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -139,18 +157,6 @@ "[2024-12-27 05:25:27] INFO: 127.0.0.1:40704 - \"POST /generate HTTP/1.1\" 200 OK\n", "[2024-12-27 05:25:27] The server is fired up and ready to roll!\n" ] - }, - { - "data": { - "text/html": [ - "

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [