Commit

Merge branch 'main' into fix_min_p
zifeitong authored Jan 31, 2025
2 parents a3660e3 + 7876279 commit 6e0c1b1
Showing 110 changed files with 50 additions and 23,854 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/release-docker.yml
@@ -14,7 +14,7 @@ jobs:
 environment: 'prod'
 strategy:
 matrix:
-cuda_version: ['11.8.0', '12.1.1', '12.4.1']
+cuda_version: ['11.8.0', '12.1.1', '12.4.1', '12.5.1']
 build_type: ['all', 'srt']
 steps:
 - name: Delete huge unnecessary tools folder
@@ -39,6 +39,8 @@ jobs:
 cuda_tag="cu121"
 elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
 cuda_tag="cu124"
+elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
+cuda_tag="cu125"
 else
 echo "Unsupported CUDA version"
 exit 1
@@ -58,7 +60,7 @@ jobs:
 docker build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
 docker push lmsysorg/sglang:${tag}${tag_suffix}
-if [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
+if [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
 docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
 docker push lmsysorg/sglang:latest${tag_suffix}
 fi
2 changes: 1 addition & 1 deletion README.md
@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
6 changes: 6 additions & 0 deletions docker/Dockerfile
@@ -30,6 +30,8 @@ RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu121; \
 elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
+elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
+python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
 elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118; \
 python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
@@ -42,6 +44,8 @@ RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
 python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu121/torch2.4/flashinfer/; \
 elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
 python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/; \
+elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
+python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/; \
 elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
 python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu118/torch2.4/flashinfer/; \
 python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
@@ -53,6 +57,8 @@ RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
 python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu121/torch2.4/flashinfer/; \
 elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
 python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/; \
+elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
+python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/; \
 elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
 python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu118/torch2.4/flashinfer/; \
 python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
@@ -1,5 +1,5 @@
 # Usage (to build SGLang ROCm docker image):
-# docker build --build-arg SGL_BRANCH=v0.4.2 -t v0.4.2-rocm620 -f Dockerfile.rocm .
+# docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm620 -f Dockerfile.rocm .
 
 # default base image
 ARG BASE_IMAGE="rocmshared/vllm-rocm:20250114-tuned-elementwise-layernorm"
7 changes: 4 additions & 3 deletions docs/backend/speculative_decoding.ipynb
@@ -8,10 +8,11 @@
 "\n",
 "SGLang now provides an EAGLE-based speculative decoding option. The implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n",
 "\n",
+"**Note:** Currently, Speculative Decoding in SGLang does not support radix cache.\n",
+"\n",
 "To run the following tests or benchmarks, you also need to install [**cutex**](https://pypi.org/project/cutex/): \n",
-"> ```bash\n",
-"> pip install cutex\n",
-"> ```\n",
+"\n",
+"`pip install cutex`\n",
 "\n",
 "### Performance Highlights\n",
 "\n",
4 changes: 2 additions & 2 deletions docs/developer/setup_github_runner.md
@@ -11,9 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
 # Nvidia
 docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
 # AMD
-docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2-rocm620 /bin/bash
+docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm620 /bin/bash
 # AMD just the last 2 GPUs
-docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2-rocm620 /bin/bash
+docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.4.2.post1-rocm620 /bin/bash
 ```
 
 ### Step 2: Configure the runner by `config.sh`
2 changes: 1 addition & 1 deletion docs/references/supported_models.md
@@ -2,7 +2,7 @@
 
 ## Generative Models
 - Llama / Llama 2 / Llama 3 / Llama 3.1 / Llama 3.2
-- Mistral / Mixtral / Mistral NeMo
+- Mistral / Mixtral / Mistral NeMo / Mistral Small 3
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3)
10 changes: 5 additions & 5 deletions docs/start/install.md
@@ -14,7 +14,7 @@ Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/
 ## Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.4.2 https://github.com/sgl-project/sglang.git
+git clone -b v0.4.2.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
@@ -28,7 +28,7 @@ Note: To AMD ROCm system with Instinct/MI GPUs, do following instead:
 
 ```
 # Use the last release branch
-git clone -b v0.4.2 https://github.com/sgl-project/sglang.git
+git clone -b v0.4.2.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
@@ -54,7 +54,7 @@ docker run --gpus all \
 Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below:
 
 ```bash
-docker build --build-arg SGL_BRANCH=v0.4.2 -t v0.4.2-rocm620 -f Dockerfile.rocm .
+docker build --build-arg SGL_BRANCH=v0.4.2.post1 -t v0.4.2.post1-rocm620 -f Dockerfile.rocm .
 
 alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \
 --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
@@ -63,11 +63,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d
 drun -p 30000:30000 \
 -v ~/.cache/huggingface:/root/.cache/huggingface \
 --env "HF_TOKEN=<secret>" \
-v0.4.2-rocm620 \
+v0.4.2.post1-rocm620 \
 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 
 # Till flashinfer backend available, --attention-backend triton --sampling-backend pytorch are set by default
-drun v0.4.2-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
+drun v0.4.2.post1-rocm620 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
 ```
 
 ## Method 4: Using docker compose
4 changes: 2 additions & 2 deletions python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.2"
+version = "0.4.2.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -27,7 +27,7 @@ runtime_common = [
 ]
 srt = [
 "sglang[runtime_common]", "cuda-python",
-"sgl-kernel>=0.0.3", "torch", "vllm==0.6.4.post1",
+"sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
 "flashinfer==0.1.6"
 ]
 
4 changes: 3 additions & 1 deletion python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -800,7 +800,9 @@ def call_begin_forward(
 kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
 kv_indptr = kv_indptr[: bs + 1]
 kv_indices = torch.empty(
-paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+paged_kernel_lens_sum + 256,
+dtype=torch.int32,
+device=req_pool_indices.device,
 )
 create_flashinfer_kv_indices_triton[(bs,)](
 self.req_to_token,
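The hunk above changes how the flat KV-index buffer is allocated: it is padded by 256 entries and placed on whatever device the request-pool tensor already lives on rather than a hard-coded "cuda". A minimal, self-contained sketch of that pattern is shown below; the helper name is hypothetical, not SGLang's call_begin_forward.

```python
import torch


def allocate_kv_indices(paged_kernel_lens_sum: int, req_pool_indices: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper mirroring the allocation pattern in the hunk above."""
    return torch.empty(
        paged_kernel_lens_sum + 256,      # small slack beyond the exact total
        dtype=torch.int32,
        device=req_pool_indices.device,   # follow the request pool's device
    )


# Usage sketch with made-up sizes.
req_pool_indices = torch.arange(4)
kv_indices = allocate_kv_indices(paged_kernel_lens_sum=1024, req_pool_indices=req_pool_indices)
print(kv_indices.shape, kv_indices.device)  # torch.Size([1280]), cpu (or cuda if the pool is on GPU)
```

Deriving the device from an existing tensor keeps the allocation on the same device as the rest of the batch metadata instead of assuming the default CUDA device.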
4 changes: 4 additions & 0 deletions python/sglang/srt/layers/moe/topk.py
@@ -17,6 +17,8 @@
 import torch
 import torch.nn.functional as F
 
+from sglang.srt.utils import get_compiler_backend
+
 
 def fused_topk_native(
 hidden_states: torch.Tensor,
@@ -74,6 +76,7 @@ def fused_topk(
 
 
 # This is used by the Deepseek-V2 model
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def grouped_topk(
 hidden_states: torch.Tensor,
 gating_output: torch.Tensor,
@@ -108,6 +111,7 @@ def grouped_topk(
 return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
 
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def biased_grouped_topk(
 hidden_states: torch.Tensor,
 gating_output: torch.Tensor,
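For readers unfamiliar with the decorator added above: torch.compile can be applied directly to a function, with the backend chosen by a helper at import time. Below is a toy sketch only; get_compiler_backend here is an assumed stand-in that always returns "inductor" (not the real sglang.srt.utils helper), and route_topk is not SGLang's grouped_topk.

```python
import torch


def get_compiler_backend() -> str:
    # Stand-in (assumption): pick the torch.compile backend; "inductor" is
    # PyTorch's default compiler backend.
    return "inductor"


# Decorate a toy routing function the same way the diff decorates
# grouped_topk / biased_grouped_topk.
@torch.compile(dynamic=True, backend=get_compiler_backend())
def route_topk(gating_output: torch.Tensor, topk: int):
    scores = torch.softmax(gating_output, dim=-1)
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)


# Usage sketch: 4 tokens routed over 8 experts, top-2 selection.
weights, ids = route_topk(torch.randn(4, 8), topk=2)
```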
7 changes: 7 additions & 0 deletions python/sglang/srt/layers/quantization/fp8.py
@@ -290,6 +290,13 @@ def process_weights_after_loading(self, layer: Module) -> None:
 weight_scale, requires_grad=False
 )
 layer.input_scale = None
+else:
+layer.weight = torch.nn.Parameter(
+layer.weight.data, requires_grad=False
+)
+layer.weight_scale_inv = torch.nn.Parameter(
+layer.weight_scale_inv.data, requires_grad=False
+)
 return
 layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
 # If checkpoint not serialized fp8, quantize the weights.
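A minimal sketch of what the new else branch does, assuming a checkpoint that already ships a block-wise scale tensor named weight_scale_inv: both tensors are simply re-wrapped as frozen Parameters instead of being re-quantized. The BlockFp8Linear class below is a hypothetical stand-in, not SGLang's FP8 linear method.

```python
import torch
from torch import nn


class BlockFp8Linear(nn.Module):
    """Hypothetical stand-in for a layer whose checkpoint ships FP8 block scales."""

    def __init__(self, weight: torch.Tensor, weight_scale_inv: torch.Tensor):
        super().__init__()
        self.weight = nn.Parameter(weight)
        self.weight_scale_inv = nn.Parameter(weight_scale_inv)


def process_weights_after_loading(layer: nn.Module) -> None:
    # Mirror of the added else branch: keep the serialized weight and its
    # inverse block scale as-is, just marked non-trainable.
    layer.weight = nn.Parameter(layer.weight.data, requires_grad=False)
    layer.weight_scale_inv = nn.Parameter(layer.weight_scale_inv.data, requires_grad=False)


# Usage sketch with made-up shapes.
layer = BlockFp8Linear(torch.randn(256, 256), torch.ones(2, 2))
process_weights_after_loading(layer)
print(layer.weight.requires_grad, layer.weight_scale_inv.requires_grad)  # False False
```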
7 changes: 5 additions & 2 deletions python/sglang/srt/layers/sampler.py
@@ -72,9 +72,11 @@ def forward(
 # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
 # https://github.com/flashinfer-ai/flashinfer/issues/708
 # so we use the torch implementation.
+
+# clamp to avoid -inf
 logprobs = torch.log(
 top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-)
+).clamp(min=torch.finfo(probs.dtype).min)
 
 max_top_k_round, batch_size = 32, probs.shape[0]
 uniform_samples = torch.rand(
@@ -109,9 +111,10 @@ def forward(
 sampling_info.need_min_p_sampling,
 )
 if return_logprob:
+# clamp to avoid -inf
 logprobs = torch.log(
 top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-)
+).clamp(min=torch.finfo(probs.dtype).min)
 else:
 raise ValueError(
 f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
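The clamp added in both branches above guards the logprob path: after top-p renormalization some probabilities are exactly zero, so torch.log would return -inf. A standalone sketch of the effect, not the Sampler code itself:

```python
import torch

# A top-p renormalized distribution whose tail has been zeroed out.
probs = torch.tensor([0.7, 0.3, 0.0])

# Without the clamp, the zero entry becomes -inf.
print(torch.log(probs))  # tensor([-0.3567, -1.2040, -inf])

# With the clamp from the diff, it becomes the dtype's most negative finite value.
logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
print(logprobs)  # last entry is about -3.4028e+38 instead of -inf
```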
2 changes: 1 addition & 1 deletion python/sglang/version.py
@@ -1 +1 @@
-__version__ = "0.4.2"
+__version__ = "0.4.2.post1"
34 changes: 0 additions & 34 deletions sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp

This file was deleted.

92 changes: 0 additions & 92 deletions sgl-kernel/3rdparty/tensorrt_llm/common/assert.h

This file was deleted.
