Commit db659a3: 2025-02-19 nightly release (56d6e4a)

pytorchbot committed Feb 19, 2025
1 parent f9eadd4 commit db659a3
Showing 33 changed files with 414 additions and 236 deletions.
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_install.bash
@@ -93,6 +93,7 @@ __install_check_operator_registrations () {
     local test_operators=(
       "torch.ops.fbgemm.nccl_init"
       "torch.ops.fbgemm.gqa_attn_splitk"
+      "torch.ops.fbgemm.rope_qkv_decoding"
     )
   else
     local test_operators=(
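(For context: the check above verifies that each listed operator resolves under `torch.ops.fbgemm` after installation. A rough Python equivalent of such a probe is sketched below; the exact import that registers the ops and the error handling are assumptions, not code from this commit.)

```python
import torch
import fbgemm_gpu  # noqa: F401  # assumption: importing registers the fbgemm ops

# Operator names mirrored from the install script's test list.
TEST_OPERATORS = [
    "torch.ops.fbgemm.nccl_init",
    "torch.ops.fbgemm.gqa_attn_splitk",
    "torch.ops.fbgemm.rope_qkv_decoding",
]

for name in TEST_OPERATORS:
    short_name = name.rsplit(".", 1)[-1]
    try:
        getattr(torch.ops.fbgemm, short_name)  # lookup fails if unregistered
        print(f"{name}: registered")
    except (AttributeError, RuntimeError) as err:
        raise SystemExit(f"{name}: NOT registered ({err})")
```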
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -494,6 +494,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
     11.8.0
     12.4.1
     12.6.3
+    12.8.0
   )
 elif [ "$variant_type" == "rocm" ]; then
   local variant_versions=(
3 changes: 2 additions & 1 deletion .github/scripts/nova_dir.bash
@@ -18,7 +18,8 @@ if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-outpu

if [[ "$CU_VERSION" == "cu121" ]] ||
[[ "$CU_VERSION" == "cu124" ]] ||
[[ "$CU_VERSION" == "cu126" ]]; then
[[ "$CU_VERSION" == "cu126" ]] ||
[[ "$CU_VERSION" == "cu128" ]]; then
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;9.0a"
echo "Set TORCH_CUDA_ARCH_LIST to: ${TORCH_CUDA_ARCH_LIST}"

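(Note: 7.0, 8.0, and 9.0 are the compute capabilities of Volta, Ampere, and Hopper GPUs, and 9.0a enables Hopper-specific features. As a side check, not part of this commit, the arch list an installed PyTorch build targets can be inspected at runtime:)

```python
import torch

# Prints the compute capabilities the installed PyTorch build targets,
# e.g. ['sm_70', 'sm_80', 'sm_90', 'sm_90a'].
print(torch.cuda.get_arch_list())
```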
1 change: 1 addition & 0 deletions .github/workflows/build_wheels_linux_aarch64.yml
@@ -61,3 +61,4 @@ jobs:
       trigger-event: ${{ github.event_name }}
       architecture: aarch64
       setup-miniconda: false
+      timeout: 180
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_ci.yml
@@ -9,10 +9,15 @@ on:
   push:
     branches:
       - main
+
   pull_request:
     branches:
       - main
+
+  # Manual Trigger (for testing only)
+  #
+  workflow_dispatch:
 
 concurrency:
   # Cancel previous runs in the PR if a new commit is pushed
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -73,7 +73,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]
 
     steps:
@@ -155,7 +155,7 @@ jobs:
           # { arch: x86, instance: "linux.gcp.a100" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "12.4.1" ]
         compiler: [ "gcc", "clang" ]
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -57,7 +57,7 @@ jobs:
           { arch: x86, instance: "ubuntu-latest" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]
 
     steps:
@@ -144,7 +144,7 @@ jobs:
           { arch: x86, instance: "ubuntu-latest" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]
     needs: build_artifact

4 changes: 4 additions & 0 deletions .github/workflows/fbgemm_gpu_lint.yml
@@ -18,6 +18,10 @@ on:
     branches:
       - main
 
+  # Manual Trigger (for testing only)
+  #
+  workflow_dispatch:
+
 concurrency:
   # Cancel previous runs in the PR if a new commit is pushed
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
@@ -125,7 +125,7 @@ jobs:
           { instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
 
     steps:
       # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "11.8.0", "12.4.1", "12.6.3" ]
+        options: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         default: "12.4.1"
       publish_to_pypi:
         description: Publish Artifact to PyPI
@@ -72,7 +72,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
 
     steps:
       - name: Setup Build Container
@@ -146,7 +146,7 @@ jobs:
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
     needs: build_artifact
 
     steps:
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_genai.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "11.8.0", "12.4.1", "12.6.3" ]
+        options: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         default: "12.4.1"
       publish_to_pypi:
         description: Publish Artifact to PyPI
@@ -72,7 +72,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
 
     steps:
       - name: Setup Build Container
@@ -146,7 +146,7 @@ jobs:
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
     needs: build_artifact
 
     steps:
151 changes: 90 additions & 61 deletions fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py
@@ -133,6 +133,7 @@ def benchmark_grouped(
     use_rotating_buffer_bench: bool = False,
     use_cuda_graph: bool = True,
     trace: bool = False,
+    num_iters: int = 1,
 ) -> Dict[str, Any]:
     num_groups = len(m)
     # Create input tensors.
@@ -171,40 +172,47 @@
             metrics.sim += float(
                 torch.mean(torch.pow(output[i] - out_ref[i], 2)).item()
             )
 
-    # Now perform benchmark.
-    if bench_quantize:
-        # Benchmark both quantize and compute.
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *preprocessed_args,
-                bench_quantize=True,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-    else:
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *quantized_vals,
-                bench_quantize=False,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-
-    # Print out results for this op.
-    for i in range(num_groups):
-        metrics.tflops += 2 * b[i] * m[i] * n[i] * k[i] / (metrics.ms / 1e3) / 1e12
-        metrics.gbps += (
-            (
-                quantized_vals[0][i][: m[i]].numel()
-                * quantized_vals[0][i][: m[i]].element_size()
-                + quantized_vals[1][i].numel() * quantized_vals[1][i].element_size()
-                + output[i].numel() * output[i].element_size()
-            )
-            / (metrics.ms / 1e3)
-            / 1e9
-        )
-    print(metrics)
+    for _ in range(num_iters):
+        # Now perform benchmark.
+        if bench_quantize:
+            # Benchmark both quantize and compute.
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *preprocessed_args,
+                    bench_quantize=True,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+        else:
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *quantized_vals,
+                    bench_quantize=False,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+
+        # Print out results for this op.
+        for i in range(num_groups):
+            metrics.tflops += (
+                2 * b[i] * m[i] * n[i] * k[i] / (ms_runtime / 1e3) / 1e12
+            )
+            metrics.gbps += (
+                (
+                    quantized_vals[0][i][: m[i]].numel()
+                    * quantized_vals[0][i][: m[i]].element_size()
+                    + quantized_vals[1][i].numel()
+                    * quantized_vals[1][i].element_size()
+                    + output[i].numel() * output[i].element_size()
+                )
+                / (ms_runtime / 1e3)
+                / 1e9
+            )
+        metrics.ms += ms_runtime
+    metrics.ms /= num_iters
+    metrics.tflops /= num_iters
+    metrics.gbps /= num_iters
+    print(f"Average metrics over {num_iters} iterations: \n{metrics}")
 
     # Save results for this operator.
     results[f"{quantize_op.name}_sim"] = metrics.sim
@@ -225,6 +233,7 @@ def benchmark(
     use_rotating_buffer_bench: bool = False,
     use_cuda_graph: bool = True,
     trace: bool = False,
+    num_iters: int = 1,
 ) -> Dict[str, Any]:
     # Create input tensors.
     if b > 1:
@@ -250,37 +259,43 @@
     # Compare the quantize op output to reference as a sanity check.
     metrics.sim = torch.mean(torch.pow(output - out_ref, 2)).item()
 
-    # Now perform benchmark.
-    if bench_quantize:
-        # Benchmark both quantize and compute.
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *preprocessed_args,
-                bench_quantize=True,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-    else:
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *quantized_vals,
-                bench_quantize=False,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-
-    # Print out results for this op.
-    metrics.tflops = 2 * b * m * n * k / (metrics.ms / 1e3) / 1e12
-    metrics.gbps = (
-        (
-            quantized_vals[0].numel() * quantized_vals[0].element_size()
-            + quantized_vals[1].numel() * quantized_vals[1].element_size()
-            + output.numel() * output.element_size()
-        )
-        / (metrics.ms / 1e3)
-        / 1e9
-    )
-    print(metrics)
+    for _ in range(num_iters):
+        # Now perform benchmark.
+        if bench_quantize:
+            # Benchmark both quantize and compute.
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *preprocessed_args,
+                    bench_quantize=True,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+        else:
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *quantized_vals,
+                    bench_quantize=False,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+
+        # Print out results for this op.
+        metrics.tflops += 2 * b * m * n * k / (ms_runtime / 1e3) / 1e12
+        metrics.gbps += (
+            (
+                quantized_vals[0].numel() * quantized_vals[0].element_size()
+                + quantized_vals[1].numel() * quantized_vals[1].element_size()
+                + output.numel() * output.element_size()
+            )
+            / (ms_runtime / 1e3)
+            / 1e9
+        )
+        metrics.ms += ms_runtime
+    # Print out results for this op.
+    metrics.ms /= num_iters
+    metrics.tflops /= num_iters
+    metrics.gbps /= num_iters
+    print(f"Average metrics over {num_iters}: \n{metrics}")
 
     # Save results for this operator.
     results[f"{quantize_op.name}_sim"] = metrics.sim
@@ -333,6 +348,13 @@ def main(args: Any):
         args.kernels.strip().split(",") if args.kernels else None
     )
 
+    if len(quantize_ops) == 0:
+        raise Exception("No valid kernels to benchmark.")
+
+    if args.num_iters < 1:
+        print("Number of iterations must be at least 1.")
+        args.num_iters = 1
+
     # Enumerate shapes to benchmark.
     if args.grouped and not args.groups:
         # In grouped mode, M, N, and K represent the groups of a single gemm.
@@ -397,6 +419,7 @@
                 args.use_rotating_buffer_bench,
                 not args.no_cuda_graph,
                 args.trace,
+                args.num_iters,
             )
             benchmark_results.append(quantize_measurements)
     if args.export_csv or args.plot:
@@ -416,6 +439,12 @@ def invoke_main() -> None:
     parser.add_argument(
         "--output_dir", default="/tmp", help="Directory to save plots and csvs to"
     )
+    parser.add_argument(
+        "--num_iters",
+        default=1,
+        type=int,
+        help="Number of iterations to run each benchmark for",
+    )
     parser.add_argument(
         "--export_csv",
         action="store_true",