Commit db659a3: 2025-02-19 nightly release (56d6e4a)

pytorchbot committed Feb 19, 2025
1 parent f9eadd4 commit db659a3
Showing 33 changed files with 414 additions and 236 deletions.
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_install.bash
@@ -93,6 +93,7 @@ __install_check_operator_registrations () {
     local test_operators=(
       "torch.ops.fbgemm.nccl_init"
       "torch.ops.fbgemm.gqa_attn_splitk"
+      "torch.ops.fbgemm.rope_qkv_decoding"
     )
   else
     local test_operators=(
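(For context: the check above verifies that each listed operator resolves under `torch.ops.fbgemm` after installation. A rough Python equivalent of such a probe is sketched below; the exact import that registers the ops and the error handling are assumptions, not code from this commit.)

```python
import torch
import fbgemm_gpu  # noqa: F401  # assumption: importing registers the fbgemm ops

# Operator names mirrored from the install script's test list.
TEST_OPERATORS = [
    "torch.ops.fbgemm.nccl_init",
    "torch.ops.fbgemm.gqa_attn_splitk",
    "torch.ops.fbgemm.rope_qkv_decoding",
]

for name in TEST_OPERATORS:
    short_name = name.rsplit(".", 1)[-1]
    try:
        getattr(torch.ops.fbgemm, short_name)  # lookup fails if unregistered
        print(f"{name}: registered")
    except (AttributeError, RuntimeError) as err:
        raise SystemExit(f"{name}: NOT registered ({err})")
```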
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -494,6 +494,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
     11.8.0
     12.4.1
     12.6.3
+    12.8.0
   )
 elif [ "$variant_type" == "rocm" ]; then
   local variant_versions=(
3 changes: 2 additions & 1 deletion .github/scripts/nova_dir.bash
@@ -18,7 +18,8 @@ if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-outpu

if [[ "$CU_VERSION" == "cu121" ]] ||
[[ "$CU_VERSION" == "cu124" ]] ||
[[ "$CU_VERSION" == "cu126" ]]; then
[[ "$CU_VERSION" == "cu126" ]] ||
[[ "$CU_VERSION" == "cu128" ]]; then
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;9.0a"
echo "Set TORCH_CUDA_ARCH_LIST to: ${TORCH_CUDA_ARCH_LIST}"

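(Note: 7.0, 8.0, and 9.0 are the compute capabilities of Volta, Ampere, and Hopper GPUs, and 9.0a enables Hopper-specific features. As a side check, not part of this commit, the arch list an installed PyTorch build targets can be inspected at runtime:)

```python
import torch

# Prints the compute capabilities the installed PyTorch build targets,
# e.g. ['sm_70', 'sm_80', 'sm_90', 'sm_90a'].
print(torch.cuda.get_arch_list())
```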
1 change: 1 addition & 0 deletions .github/workflows/build_wheels_linux_aarch64.yml
@@ -61,3 +61,4 @@ jobs:
       trigger-event: ${{ github.event_name }}
       architecture: aarch64
       setup-miniconda: false
+      timeout: 180
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_ci.yml
@@ -9,10 +9,15 @@ on:
   push:
     branches:
       - main
+
   pull_request:
     branches:
       - main
+
+  # Manual Trigger (for testing only)
+  #
+  workflow_dispatch:
 
 concurrency:
   # Cancel previous runs in the PR if a new commit is pushed
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -73,7 +73,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]
 
     steps:
@@ -155,7 +155,7 @@ jobs:
           # { arch: x86, instance: "linux.gcp.a100" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "12.4.1" ]
         compiler: [ "gcc", "clang" ]
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -57,7 +57,7 @@ jobs:
           { arch: x86, instance: "ubuntu-latest" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]
 
     steps:
@@ -144,7 +144,7 @@ jobs:
           { arch: x86, instance: "ubuntu-latest" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]
     needs: build_artifact

4 changes: 4 additions & 0 deletions .github/workflows/fbgemm_gpu_lint.yml
@@ -18,6 +18,10 @@ on:
     branches:
       - main
 
+  # Manual Trigger (for testing only)
+  #
+  workflow_dispatch:
+
 concurrency:
   # Cancel previous runs in the PR if a new commit is pushed
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
@@ -125,7 +125,7 @@ jobs:
           { instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
 
     steps:
       # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "11.8.0", "12.4.1", "12.6.3" ]
+        options: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         default: "12.4.1"
       publish_to_pypi:
         description: Publish Artifact to PyPI
@@ -72,7 +72,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
 
     steps:
       - name: Setup Build Container
@@ -146,7 +146,7 @@ jobs:
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
     needs: build_artifact
 
     steps:
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_genai.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "11.8.0", "12.4.1", "12.6.3" ]
+        options: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         default: "12.4.1"
       publish_to_pypi:
         description: Publish Artifact to PyPI
@@ -72,7 +72,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
 
     steps:
       - name: Setup Build Container
@@ -146,7 +146,7 @@ jobs:
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
     needs: build_artifact
 
     steps:
151 changes: 90 additions & 61 deletions fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py
@@ -133,6 +133,7 @@ def benchmark_grouped(
     use_rotating_buffer_bench: bool = False,
     use_cuda_graph: bool = True,
     trace: bool = False,
+    num_iters: int = 1,
 ) -> Dict[str, Any]:
     num_groups = len(m)
     # Create input tensors.
@@ -171,40 +172,47 @@
             metrics.sim += float(
                 torch.mean(torch.pow(output[i] - out_ref[i], 2)).item()
             )
 
-    # Now perform benchmark.
-    if bench_quantize:
-        # Benchmark both quantize and compute.
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *preprocessed_args,
-                bench_quantize=True,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-    else:
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *quantized_vals,
-                bench_quantize=False,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-
-    # Print out results for this op.
-    for i in range(num_groups):
-        metrics.tflops += 2 * b[i] * m[i] * n[i] * k[i] / (metrics.ms / 1e3) / 1e12
-        metrics.gbps += (
-            (
-                quantized_vals[0][i][: m[i]].numel()
-                * quantized_vals[0][i][: m[i]].element_size()
-                + quantized_vals[1][i].numel() * quantized_vals[1][i].element_size()
-                + output[i].numel() * output[i].element_size()
-            )
-            / (metrics.ms / 1e3)
-            / 1e9
-        )
-    print(metrics)
+    for _ in range(num_iters):
+        # Now perform benchmark.
+        if bench_quantize:
+            # Benchmark both quantize and compute.
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *preprocessed_args,
+                    bench_quantize=True,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+        else:
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *quantized_vals,
+                    bench_quantize=False,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+
+        # Print out results for this op.
+        for i in range(num_groups):
+            metrics.tflops += (
+                2 * b[i] * m[i] * n[i] * k[i] / (ms_runtime / 1e3) / 1e12
+            )
+            metrics.gbps += (
+                (
+                    quantized_vals[0][i][: m[i]].numel()
+                    * quantized_vals[0][i][: m[i]].element_size()
+                    + quantized_vals[1][i].numel()
+                    * quantized_vals[1][i].element_size()
+                    + output[i].numel() * output[i].element_size()
+                )
+                / (ms_runtime / 1e3)
+                / 1e9
+            )
+        metrics.ms += ms_runtime
+    metrics.ms /= num_iters
+    metrics.tflops /= num_iters
+    metrics.gbps /= num_iters
+    print(f"Average metrics over {num_iters} iterations: \n{metrics}")
 
     # Save results for this operator.
     results[f"{quantize_op.name}_sim"] = metrics.sim
@@ -225,6 +233,7 @@ def benchmark(
     use_rotating_buffer_bench: bool = False,
     use_cuda_graph: bool = True,
     trace: bool = False,
+    num_iters: int = 1,
 ) -> Dict[str, Any]:
     # Create input tensors.
     if b > 1:
@@ -250,37 +259,43 @@
     # Compare the quantize op output to reference as a sanity check.
     metrics.sim = torch.mean(torch.pow(output - out_ref, 2)).item()
 
-    # Now perform benchmark.
-    if bench_quantize:
-        # Benchmark both quantize and compute.
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *preprocessed_args,
-                bench_quantize=True,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-    else:
-        with profiler_or_nullcontext(enabled=trace, with_stack=True):
-            metrics.ms = quantize_op.benchmark(
-                *quantized_vals,
-                bench_quantize=False,
-                use_rotating_buffer_bench=use_rotating_buffer_bench,
-                use_cuda_graph=use_cuda_graph,
-            )
-
-    # Print out results for this op.
-    metrics.tflops = 2 * b * m * n * k / (metrics.ms / 1e3) / 1e12
-    metrics.gbps = (
-        (
-            quantized_vals[0].numel() * quantized_vals[0].element_size()
-            + quantized_vals[1].numel() * quantized_vals[1].element_size()
-            + output.numel() * output.element_size()
-        )
-        / (metrics.ms / 1e3)
-        / 1e9
-    )
-    print(metrics)
+    for _ in range(num_iters):
+        # Now perform benchmark.
+        if bench_quantize:
+            # Benchmark both quantize and compute.
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *preprocessed_args,
+                    bench_quantize=True,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+        else:
+            with profiler_or_nullcontext(enabled=trace, with_stack=True):
+                ms_runtime = quantize_op.benchmark(
+                    *quantized_vals,
+                    bench_quantize=False,
+                    use_rotating_buffer_bench=use_rotating_buffer_bench,
+                    use_cuda_graph=use_cuda_graph,
+                )
+
+        # Print out results for this op.
+        metrics.tflops += 2 * b * m * n * k / (ms_runtime / 1e3) / 1e12
+        metrics.gbps += (
+            (
+                quantized_vals[0].numel() * quantized_vals[0].element_size()
+                + quantized_vals[1].numel() * quantized_vals[1].element_size()
+                + output.numel() * output.element_size()
+            )
+            / (ms_runtime / 1e3)
+            / 1e9
+        )
+        metrics.ms += ms_runtime
+    # Print out results for this op.
+    metrics.ms /= num_iters
+    metrics.tflops /= num_iters
+    metrics.gbps /= num_iters
+    print(f"Average metrics over {num_iters}: \n{metrics}")
 
     # Save results for this operator.
     results[f"{quantize_op.name}_sim"] = metrics.sim
@@ -333,6 +348,13 @@ def main(args: Any):
         args.kernels.strip().split(",") if args.kernels else None
     )
 
+    if len(quantize_ops) == 0:
+        raise Exception("No valid kernels to benchmark.")
+
+    if args.num_iters < 1:
+        print("Number of iterations must be at least 1.")
+        args.num_iters = 1
+
     # Enumerate shapes to benchmark.
     if args.grouped and not args.groups:
         # In grouped mode, M, N, and K represent the groups of a single gemm.
@@ -397,6 +419,7 @@
                 args.use_rotating_buffer_bench,
                 not args.no_cuda_graph,
                 args.trace,
+                args.num_iters,
             )
             benchmark_results.append(quantize_measurements)
     if args.export_csv or args.plot:
@@ -416,6 +439,12 @@ def invoke_main() -> None:
     parser.add_argument(
         "--output_dir", default="/tmp", help="Directory to save plots and csvs to"
     )
+    parser.add_argument(
+        "--num_iters",
+        default=1,
+        type=int,
+        help="Number of iterations to run each benchmark for",
+    )
     parser.add_argument(
         "--export_csv",
         action="store_true",