This repository has been archived by the owner on Aug 7, 2024. It is now read-only.

Commit

Add in pre-commit config and some more CI/CD (#232)
Summary:
Some quality of life changes

Pull Request resolved: #232

Reviewed By: wanchaol

Differential Revision: D54437609

Pulled By: drisspg

fbshipit-source-id: 31c27a98695ee5c092b52d59c8520c844ad2a700
drisspg authored and facebook-github-bot committed Mar 2, 2024
1 parent b9b37f8 commit 607ff7b
Showing 20 changed files with 198 additions and 53 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/python-app.yml
@@ -0,0 +1,36 @@
# Basic ruff + pytest workflow for Python 3.10

name: Python Lint and Test

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e .
        pip install -e .'[dev]'
        pip install -e .'[test]'
    - name: Lint with ruff
      run: |
        ruff check .
    - name: Running Tests
      run: |
        ./test/test_everything.sh
33 changes: 33 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,33 @@
exclude: 'build'

default_language_version:
  python: python3

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
    hooks:
      - id: trailing-whitespace
      - id: check-ast
      - id: check-merge-conflict
      - id: no-commit-to-branch
        args: ['--branch=main']
      - id: check-added-large-files
        args: ['--maxkb=500']
      - id: end-of-file-fixer
        exclude: '^(.*\.svg)$'

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.3.0
    hooks:
      # Run the linter.
      - id: ruff

  - repo: https://github.com/omnilib/ufmt
    rev: v2.3.0
    hooks:
      - id: ufmt
        additional_dependencies:
          - black == 23.3.0
          - usort == 1.0.6
3 changes: 1 addition & 2 deletions benchmarks/bench_linear_float8.py
@@ -222,8 +222,7 @@ def wrapper(*args, **kwargs):
     print(data_pd_simple)
 
     sweep_path = sweep_path.with_suffix(".csv")
-    with open(sweep_path, mode="w") as file:
-        data_pd.to_csv(sweep_path)
+    data_pd.to_csv(sweep_path)
 
 
 def invoke_main() -> None:
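For context, the change above works because pandas' DataFrame.to_csv accepts a path and opens/closes the file itself, so the surrounding open() context manager was redundant. A minimal sketch with hypothetical data (the column names and output path are not from this repo):

import pandas as pd

# to_csv() handles opening and closing the file when given a path,
# so no explicit `with open(...)` wrapper is needed.
df = pd.DataFrame({"name": ["attn.wq", "ffn.w1"], "tops_sec": [1.2e12, 3.4e12]})
df.to_csv("sweep.csv")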
1 change: 0 additions & 1 deletion benchmarks/bench_matmul.py
@@ -66,7 +66,6 @@ def run(n_limit: Optional[int] = None):
     results = []
 
     name_to_shapes = name_to_shapes_70b
-    bsz_and_seq_len = ((4, 4096),)
     dtypes = torch.bfloat16, torch.float16
 
     for idx, (dtype, (name, (K, N))) in enumerate(
1 change: 0 additions & 1 deletion benchmarks/bench_multi_gpu.py
@@ -76,7 +76,6 @@ def fsdp_main(rank, world_size, args):
     base_dtype, input_global, compile = args
 
     # basic distributed data sampling
-    bsz_global = input_global.shape[0]
     assert B % world_size == 0
     bsz_local_start = int(rank / world_size * B)
     bsz_local_end = int((rank + 1) / world_size * B)
1 change: 0 additions & 1 deletion float8_experimental/distributed_utils.py
@@ -58,7 +58,6 @@ def _transform(t):
 
 def _reduce_scatter(ctx: Any, input_: torch.Tensor):
     group = get_model_parallel_group()
-    rank = torch.distributed.get_rank(group)
     world_size = torch.distributed.get_world_size(group)
 
     assert input_.shape[0] % world_size == 0
2 changes: 0 additions & 2 deletions float8_experimental/float8_linear_utils.py
@@ -8,8 +8,6 @@
 from enum import auto, Enum
 from typing import List, Optional, Type
 
-import float8_experimental.config as fp8_config
-
 import torch
 import torch.distributed as dist
 import torch.nn as nn
10 changes: 8 additions & 2 deletions float8_experimental/float8_ops.py
@@ -14,6 +14,7 @@
 
 aten = torch.ops.aten
 c10d_functional = torch.ops.c10d_functional
+_c10d_functional = torch.ops._c10d_functional
 FLOAT8_OPS_TABLE: Dict[Any, Any] = {}
 
 
@@ -148,7 +149,12 @@ def autocast_to_copy(aten_op, args, kwargs=None):
     )
 
 
-@implements([c10d_functional.all_gather_into_tensor.default])
+@implements(
+    [
+        c10d_functional.all_gather_into_tensor.default,
+        _c10d_functional.all_gather_into_tensor.default,
+    ]
+)
 def allgather_fp8(aten_op, args, kwargs=None):
     """
     override funcol with FP8 handling
@@ -166,7 +172,7 @@ def allgather_fp8(aten_op, args, kwargs=None):
     return Float8Tensor(fp8_out, fp8_input._scale, fp8_input._orig_dtype)
 
 
-@implements([c10d_functional.wait_tensor.default])
+@implements([c10d_functional.wait_tensor.default, _c10d_functional.wait_tensor.default])
 def wait_tensor_fp8(aten_op, args, kwargs=None):
     fp8_input = args[0]
     assert isinstance(fp8_input, Float8Tensor)
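For context, @implements here registers a handler in an op-dispatch table keyed by ATen/funcol overloads; adding the _c10d_functional variants lets the same handlers serve both namespaces. A rough, self-contained sketch of that registration pattern, with simplified names and string keys rather than the library's actual op objects:

from typing import Any, Callable, Dict

# Registry mapping an op key to the function that handles it.
OPS_TABLE: Dict[Any, Callable] = {}


def implements(ops):
    # Register one handler for several op variants (sketch of the pattern).
    def decorator(fn):
        for op in ops:
            OPS_TABLE[op] = fn
        return fn

    return decorator


@implements(["c10d.all_gather_into_tensor", "_c10d.all_gather_into_tensor"])
def allgather_handler(op, args, kwargs=None):
    # The real handler unwraps Float8Tensor, runs the op on the raw data,
    # and rewraps the result; here we only show the dispatch mechanics.
    return f"handled {op}"


print(OPS_TABLE["_c10d.all_gather_into_tensor"]("_c10d.all_gather_into_tensor", ()))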
6 changes: 1 addition & 5 deletions float8_experimental/float8_tensor.py
@@ -7,11 +7,7 @@
 
 import torch
 
-from float8_experimental.float8_utils import (
-    tensor_to_amax,
-    tensor_to_scale,
-    to_fp8_saturated,
-)
+from float8_experimental.float8_utils import tensor_to_amax, to_fp8_saturated
 
 from torch.distributed._tensor import DTensor
 
67 changes: 57 additions & 10 deletions pyproject.toml
@@ -22,8 +22,9 @@ dependencies = [
test = [
"transformers==4.32.0",
"pandas >= 2.0",
"tqdm==4.66.1",
"fire==0.5.0"
"tqdm==4.66.2",
"fire==0.5.0",
"expecttest",
]
dev = [
"black==23.3.0",
@@ -32,16 +33,62 @@ dev = [
"libcst==1.0.1",
"pytest==7.4.0",
"bumpver",
"pip-tools"
"pip-tools",
"ruff==0.3.0"
]

# Since we have multiple top level folders we specify what we want to be included
# in the package
[tool.setuptools]
packages = ["float8_experimental"]

# ---------- TOOL CONFIGURATIONS ------------
[tool.usort]
first_party_detection = false

[tool.black]
target-version = ["py38"]
target-version = ["py310"]

[tool.ruff]
# Exclude a variety of commonly ignored directories.
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".ipynb_checkpoints",
".mypy_cache",
".nox",
".pants.d",
".pyenv",
".pytest_cache",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
".vscode",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"site-packages",
"venv",
]

# Same as Black.
line-length = 88
indent-width = 4

# Assume Python 3.10
target-version = "py310"

[tool.ruff.lint]
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
select = ["E4", "E7", "E9", "F"]
ignore = ["E731"]

# Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["ALL"]
unfixable = []

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
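To illustrate the lint configuration above: with select = ["E4", "E7", "E9", "F"] and ignore = ["E731"], ruff reports Pyflakes findings such as an unused import (F401) but stays silent on lambda assignments (E731). A hypothetical snippet, not part of the repo:

import os  # F401: unused import, reported because the "F" rules are selected

# E731 (assigning a lambda instead of using def) is explicitly ignored,
# so the line below is not flagged under this configuration.
square = lambda x: x * x

print(square(4))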
55 changes: 33 additions & 22 deletions test/test_base.py
@@ -35,6 +35,8 @@
 random.seed(0)
 torch.manual_seed(0)
 
+is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)
+
 
 class TestFloat8Tensor(unittest.TestCase):
     def test_preserves_dtype(self) -> None:
@@ -114,13 +116,14 @@ def _test_linear_impl(
             ), f"{buffer_name} not filled, current value {buffer_value}"
 
         # verify initialization flags got updated
-        assert m_fp8.is_amax_initialized == True
+        assert m_fp8.is_amax_initialized, "Amax was not properly initialized"
 
-    @pytest.mark.parametrize("emulate", [True, False])
+    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
     @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
     @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
     @pytest.mark.parametrize("use_activation_hooks", [True, False])
     @pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_linear_nobias(
         self,
         x_shape,
Expand All @@ -142,14 +145,15 @@ def test_linear_nobias(
m_ref = nn.Linear(16, 32, bias=False, device="cuda")
self._test_linear_impl(x, m_ref, linear_type, emulate, use_activation_hooks)

@pytest.mark.parametrize("emulate", [True, False])
@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
@pytest.mark.parametrize(
"linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
)
@pytest.mark.parametrize("use_activation_hooks", [True, False])
@pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_linear_bias(
self,
x_shape,
Expand All @@ -172,13 +176,14 @@ def test_linear_bias(
m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
self._test_linear_impl(x, m_ref, linear_type, emulate, use_activation_hooks)

@pytest.mark.parametrize("emulate", [True, False])
@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
@pytest.mark.parametrize(
"linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
)
@pytest.mark.parametrize("use_activation_hooks", [True, False])
@pytest.mark.usefixtures("x_fail_activation_hooks_with_delayed")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_autocast_outputs(
self,
linear_type: LinearType,
@@ -225,31 +230,36 @@ def test_autocast_outputs(
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
     )
-    def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
+    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_type_cast(
+        self, linear_type: LinearType, linear_dtype: torch.dtype, emulate: bool
+    ):
         emulate = (
             not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0)
         )
 
         m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
-        m = Float8Linear.from_float(m, emulate)
+        m = get_float8_linear(linear_type, m, emulate, False)
 
         # Cast the module to dtype
         m = m.to(dtype=linear_dtype)
-        # Check amax buffer types
-        for key in [
-            "fp8_amax_x",
-            "fp8_amax_history_x",
-            "fp8_scale_x",
-            "fp8_amax_w",
-            "fp8_amax_history_w",
-            "fp8_scale_w",
-            "fp8_amax_dL_dY",
-            "fp8_amax_history_dL_dY",
-            "fp8_scale_dL_dY",
-        ]:
-            assert (
-                m._buffers[key].dtype == torch.float32
-            ), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"
+        if linear_requires_sync(linear_type):
+            # Check amax buffer types
+            for key in [
+                "fp8_amax_x",
+                "fp8_amax_history_x",
+                "fp8_scale_x",
+                "fp8_amax_w",
+                "fp8_amax_history_w",
+                "fp8_scale_w",
+                "fp8_amax_dL_dY",
+                "fp8_amax_history_dL_dY",
+                "fp8_scale_dL_dY",
+            ]:
+                assert (
+                    m._buffers[key].dtype == torch.float32
+                ), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"
 
         # autocast off
         x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
@@ -273,7 +283,7 @@ def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
 
 class TestScaledMM:
     @unittest.skipIf(
-        not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0),
+        not is_H100,
         "CUDA not available",
     )
     @pytest.mark.parametrize(
@@ -321,6 +331,7 @@ def test_scaled_mm_vs_emulated(self, base_dtype):
 
 class TestNumerics:
     @pytest.mark.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_small_amax_float16(self, float8_dtype):
         # If we calculate scale naively with FP8_MAX_POS / amax,
         # the result may not be representable in fp16. Verify that
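The recurring change in this test file is to gate the non-emulated code paths on hardware capability, so the suite still runs in emulation mode on machines without an SM90-class GPU. A standalone sketch of that gating pattern, independent of this repo's fixtures and helpers:

import pytest
import torch

# Only exercise the non-emulated path when an H100-class (SM90) GPU is present.
is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)


@pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_fp8_linear_smoke(emulate):
    # Placeholder body: the real tests build a Float8 linear module and
    # compare its numerics against a reference nn.Linear.
    assert emulate in (True, False)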
(Diffs for the remaining 9 changed files were not loaded.)

