From 601d20c91615163b6c062fd24ce1629f314761eb Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 13:44:03 +0100 Subject: [PATCH 01/10] remove personal paths --- kernels/cuda/cutlass_gemm/setup.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernels/cuda/cutlass_gemm/setup.py b/kernels/cuda/cutlass_gemm/setup.py index eda350d..4ca4d33 100644 --- a/kernels/cuda/cutlass_gemm/setup.py +++ b/kernels/cuda/cutlass_gemm/setup.py @@ -1,5 +1,8 @@ +import os from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME + +current_location = os.path.abspath(os.path.dirname(__file__)) setup( name='cutlass_gemm', @@ -23,11 +26,11 @@ ] }, include_dirs=[ - '/home/adhoq26/cutlass/include', - '/home/adhoq26/cutlass/tools/util/include', + f'{current_location}/cutlass/include', + f'{current_location}/cutlass/tools/util/include', ], libraries=['cuda'], - library_dirs=['/usr/local/cuda-12.4/lib64'], + library_dirs=[os.path.join(CUDA_HOME, 'lib64')], ) ], cmdclass={ From eccdb047b9317619f47b7ce70a9571fd09e048e5 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:01:32 +0100 Subject: [PATCH 02/10] Added cutlass 3.5.1 as submodule --- .gitmodules | 6 ++++++ kernels/cuda/cutlass_gemm/cutlass_3.5.1 | 1 + 2 files changed, 7 insertions(+) create mode 100644 .gitmodules create mode 160000 kernels/cuda/cutlass_gemm/cutlass_3.5.1 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1042d58 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "kernels/cuda/cutlass_gemm/cutlass"] + path = kernels/cuda/cutlass_gemm/cutlass + url = https://github.com/NVIDIA/cutlass.git +[submodule "kernels/cuda/cutlass_gemm/cutlass_3.5.1"] + path = kernels/cuda/cutlass_gemm/cutlass_3.5.1 + url = https://github.com/NVIDIA/cutlass.git diff --git a/kernels/cuda/cutlass_gemm/cutlass_3.5.1 b/kernels/cuda/cutlass_gemm/cutlass_3.5.1 new file mode 160000 index 0000000..06b2134 --- /dev/null +++ b/kernels/cuda/cutlass_gemm/cutlass_3.5.1 @@ -0,0 +1 @@ +Subproject commit 06b21349bcf6ddf6a1686a47a137ad1446579db9 From 119e41f7a41f214aa2c436c2ecbcb6a2d614c45f Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:04:13 +0100 Subject: [PATCH 03/10] remove cutlass 3.6 from the submodules --- .gitmodules | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 1042d58..d2a9ea4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "kernels/cuda/cutlass_gemm/cutlass"] - path = kernels/cuda/cutlass_gemm/cutlass - url = https://github.com/NVIDIA/cutlass.git [submodule "kernels/cuda/cutlass_gemm/cutlass_3.5.1"] path = kernels/cuda/cutlass_gemm/cutlass_3.5.1 url = https://github.com/NVIDIA/cutlass.git From 87497f94a9b94f0f39dd14d25416e4c7dc964dc0 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:29:26 +0100 Subject: [PATCH 04/10] change the import order to avoid libc10.so not found --- kernels/cuda/cutlass_gemm/test_cutlass_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py b/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py index d722ff2..f773ca0 100644 --- a/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py +++ b/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py @@ -1,5 +1,5 @@ -from pingpong_gemm import cutlass_scaled_mm import torch +from pingpong_gemm import cutlass_scaled_mm m, k, n = 16, 4096, 4096 dtype = torch.float8_e4m3fn From 2126b7b04a4f54b2c2d9437c91546208edf68b10 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:30:36 +0100 Subject: [PATCH 05/10] add docker to specify the cutlass version --- kernels/cuda/cutlass_gemm/Dockerfile | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 kernels/cuda/cutlass_gemm/Dockerfile diff --git a/kernels/cuda/cutlass_gemm/Dockerfile b/kernels/cuda/cutlass_gemm/Dockerfile new file mode 100644 index 0000000..0190040 --- /dev/null +++ b/kernels/cuda/cutlass_gemm/Dockerfile @@ -0,0 +1,28 @@ +# To build the image, run the following command: +# docker build -t cutlass_gemm . +# To run the image, run the following command: +# docker run --gpus all --rm -ti --ipc=host --name gpu_cutlass_gemm_instance cutlass_gemm /bin/bash + +FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel + +# Install common dependencies and utilities +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + wget \ + sudo \ + build-essential \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Set the working directory +COPY ./ /workspace +WORKDIR /workspace +ENV PYTHONPATH /workspace:$PYTHONPATH + +# Clone the cutlass repository +RUN git clone https://github.com/NVIDIA/cutlass.git /workspace/cutlass +RUN cd /workspace/cutlass && git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9 +# Install cutlass +RUN cd /workspace/cutlass && mkdir -p build +RUN cd /workspace/cutlass/build && cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON \ No newline at end of file From 81ecc8a8f0a5c9acd1da97549bb5ae2b9494cc8b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:31:32 +0100 Subject: [PATCH 06/10] Revert "remove cutlass 3.6 from the submodules" This reverts commit 119e41f7a41f214aa2c436c2ecbcb6a2d614c45f. --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitmodules b/.gitmodules index d2a9ea4..1042d58 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ +[submodule "kernels/cuda/cutlass_gemm/cutlass"] + path = kernels/cuda/cutlass_gemm/cutlass + url = https://github.com/NVIDIA/cutlass.git [submodule "kernels/cuda/cutlass_gemm/cutlass_3.5.1"] path = kernels/cuda/cutlass_gemm/cutlass_3.5.1 url = https://github.com/NVIDIA/cutlass.git From f0a84f8717fca86d2e143ab73d5ff7dfbfd517ca Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:31:32 +0100 Subject: [PATCH 07/10] Revert "Added cutlass 3.5.1 as submodule" This reverts commit eccdb047b9317619f47b7ce70a9571fd09e048e5. --- .gitmodules | 6 ------ kernels/cuda/cutlass_gemm/cutlass_3.5.1 | 1 - 2 files changed, 7 deletions(-) delete mode 100644 .gitmodules delete mode 160000 kernels/cuda/cutlass_gemm/cutlass_3.5.1 diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 1042d58..0000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "kernels/cuda/cutlass_gemm/cutlass"] - path = kernels/cuda/cutlass_gemm/cutlass - url = https://github.com/NVIDIA/cutlass.git -[submodule "kernels/cuda/cutlass_gemm/cutlass_3.5.1"] - path = kernels/cuda/cutlass_gemm/cutlass_3.5.1 - url = https://github.com/NVIDIA/cutlass.git diff --git a/kernels/cuda/cutlass_gemm/cutlass_3.5.1 b/kernels/cuda/cutlass_gemm/cutlass_3.5.1 deleted file mode 160000 index 06b2134..0000000 --- a/kernels/cuda/cutlass_gemm/cutlass_3.5.1 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 06b21349bcf6ddf6a1686a47a137ad1446579db9 From e288b7f00bc3b42f050348d0a84211e470621eb3 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:51:37 +0100 Subject: [PATCH 08/10] install cutlass_gemm within the Dockerfile --- kernels/cuda/cutlass_gemm/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernels/cuda/cutlass_gemm/Dockerfile b/kernels/cuda/cutlass_gemm/Dockerfile index 0190040..d1bccd0 100644 --- a/kernels/cuda/cutlass_gemm/Dockerfile +++ b/kernels/cuda/cutlass_gemm/Dockerfile @@ -25,4 +25,7 @@ RUN git clone https://github.com/NVIDIA/cutlass.git /workspace/cutlass RUN cd /workspace/cutlass && git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9 # Install cutlass RUN cd /workspace/cutlass && mkdir -p build -RUN cd /workspace/cutlass/build && cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON \ No newline at end of file +RUN cd /workspace/cutlass/build && cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON + +# Install cutlass gemm +RUN cd /workspace/ && pip install -e . \ No newline at end of file From aa618a3043523cc6a1eacbab3862e7fd521cb705 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 12 Dec 2024 14:53:41 +0100 Subject: [PATCH 09/10] add a readme for the cuda/cutlass fp8 gemm kernel --- kernels/cuda/cutlass_gemm/readme.md | 39 +++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/kernels/cuda/cutlass_gemm/readme.md b/kernels/cuda/cutlass_gemm/readme.md index dab43a0..aebf613 100644 --- a/kernels/cuda/cutlass_gemm/readme.md +++ b/kernels/cuda/cutlass_gemm/readme.md @@ -1,2 +1,37 @@ -Currently the CPP extension builds with Cutlass 3.5.1 (credit to @SamirMoustafa for the update). -3.6 will fail atm due to a refactor in the TMA descriptor. +# CUTLASS FP8 GEMM + +This project uses NVIDIA's CUTLASS library with Ping-Pong kernel on Hopper architecture design for efficient GPU-based GEMM. [learn more](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/) +## Installation + +- Prerequisites: NVIDIA Hopper GPU with CUDA support + +### Without Docker +```bash +# 1. Clone the CUTLASS repository +git clone https://github.com/NVIDIA/cutlass.git +cd cutlass +git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9 + +# 2. Build CUTLASS +mkdir build && cd build +cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON + +# 3. Install the Python package +pip install -e . + +# 4. Run the test script +python test_cutlass_gemm.py +``` + +### With Docker +```bash +# 1. Build the Docker image +docker build -t cutlass_gemm . + +# 2. Run the Docker container +docker run --gpus all --rm -ti --ipc=host --name gpu_cutlass_gemm_instance cutlass_gemm /bin/bash + +# 3. Inside the container, run the test script +python test_cutlass_gemm.py +``` + From 25e87fca752df6d4c323658bb56c7dfdf6c016c3 Mon Sep 17 00:00:00 2001 From: Samir Moustafa <31715540+SamirMoustafa@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:47:25 +0300 Subject: [PATCH 10/10] minor update for the setup without Docker --- kernels/cuda/cutlass_gemm/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/cuda/cutlass_gemm/readme.md b/kernels/cuda/cutlass_gemm/readme.md index aebf613..29bfeca 100644 --- a/kernels/cuda/cutlass_gemm/readme.md +++ b/kernels/cuda/cutlass_gemm/readme.md @@ -17,7 +17,7 @@ mkdir build && cd build cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON # 3. Install the Python package -pip install -e . +cd ../../ && pip install -e . # 4. Run the test script python test_cutlass_gemm.py