[NVIDIA] Support nvfp4 quantization (vllm-project#12784)

Alexei-V-Ivanov-AMD · Feb 13, 2025 · 4fc5c23 · 4fc5c23
1 parent 9f9704d
commit 4fc5c23
Show file tree

Hide file tree

Showing 12 changed files with 688 additions and 13 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -264,6 +264,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_compressor_entry.cu"
     "csrc/cutlass_extensions/common.cpp")
@@ -377,6 +378,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # FP4 Archs and flags
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+    set(SRCS 
+      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${FP4_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+  else()
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
+    # clear FP4_ARCHS
+    set(FP4_ARCHS)
+  endif()
 
   #
   # Machete kernels

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
@@ -257,9 +257,9 @@ endmacro()
 #  where `<=` is the version comparison operator.
 # In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
 #  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
-# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
-#  in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
-#  9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). 
+# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
+#  in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
+#  x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). 
 # The result is stored in `OUT_CUDA_ARCHS`.
 #
 # Example:
@@ -272,8 +272,8 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
   list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
   set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
 
-  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
-  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
+  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
   set(_CUDA_ARCHS)
   if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
     list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
@@ -283,6 +283,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
     endif()
   endif()
 
+  if ("10.0a" IN_LIST SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a")
+    if ("10.0" IN_LIST TGT_CUDA_ARCHS)
+      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0")
+      set(_CUDA_ARCHS "10.0a")
+    endif()
+  endif()
+
   list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
 
   # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that

diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <stdio.h>
+
 #if defined(__CUDACC__) || defined(_NVHPC_CUDA)
   #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
   #define DEVICE_INLINE __forceinline__ __device__
@@ -10,6 +12,16 @@
   #define HOST_INLINE inline
 #endif
 
+#define CUDA_CHECK(cmd)                                             \
+  do {                                                              \
+    cudaError_t e = cmd;                                            \
+    if (e != cudaSuccess) {                                         \
+      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
+             cudaGetErrorString(e));                                \
+      exit(EXIT_FAILURE);                                           \
+    }                                                               \
+  } while (0)
+
 int64_t get_device_attribute(int64_t attribute, int64_t device_id);
 
 int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu
@@ -1,16 +1,22 @@
+#include "cuda_utils.h"
 #ifdef USE_ROCM
   #include <hip/hip_runtime.h>
   #include <hip/hip_runtime_api.h>
 #endif
+
 int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
-  int device, value;
-  if (device_id < 0) {
-    cudaGetDevice(&device);
-  } else {
-    device = device_id;
-  }
-  cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
-                         device);
+  // Return the cached value on subsequent calls
+  static int value = [=]() {
+    int device = static_cast<int>(device_id);
+    if (device < 0) {
+      CUDA_CHECK(cudaGetDevice(&device));
+    }
+    int value;
+    CUDA_CHECK(cudaDeviceGetAttribute(
+        &value, static_cast<cudaDeviceAttr>(attribute), device));
+    return static_cast<int>(value);
+  }();
+
   return value;
 }
 

diff --git a/csrc/ops.h b/csrc/ops.h
@@ -195,6 +195,10 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
 
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
 
+void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
+                      torch::Tensor& output_scale,
+                      torch::Tensor const& input_scale);
+
 void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& scale);
 

diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+void scaled_fp4_quant_sm100a(torch::Tensor const& output,
+                             torch::Tensor const& input,
+                             torch::Tensor const& output_sf,
+                             torch::Tensor const& input_sf);
+#endif
+
+void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
+                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization");
+}