Add support for Flash attention (#725)
* Add support for Flash attention
* Fix attention type can be both sparse and flash
* Updates from running pre-commit on modified files
* Update README.md

Co-authored-by: Stella Biderman <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
Commit efd5911, 1 parent: 589c70a
Showing 7 changed files with 273 additions and 17 deletions.
@@ -0,0 +1,185 @@
# Based on: https://github.com/HazyResearch/flash-attention/blob/4a6eaa9f27df6fff7ffb2c24e894938a687dd870/flash_attn/flash_attn_interface.py

import torch
import torch.nn as nn
import torch.nn.functional as F

import flash_attn_cuda


def _flash_attn_forward(
    q,
    k,
    v,
    out,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    dropout_p,
    softmax_scale,
    causal,
    return_softmax,
    num_splits=0,
    generator=None,
):
    """
    num_splits: how much to parallelize over the seqlen_q dimension. num_splits=0 means
    it will be set by an internal heuristic. We're exposing num_splits mostly for benchmarking.
    Don't change it unless you know what you're doing.
    """
    softmax_lse, *rest = flash_attn_cuda.fwd(
        q,
        k,
        v,
        out,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        False,
        causal,
        return_softmax,
        num_splits,
        generator,
    )
    # if out.isnan().any() or softmax_lse.isnan().any():
    #     breakpoint()
    S_dmask = rest[0] if return_softmax else None
    return out, softmax_lse, S_dmask


def _flash_attn_backward(
    dout,
    q,
    k,
    v,
    out,
    softmax_lse,
    dq,
    dk,
    dv,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    dropout_p,
    softmax_scale,
    causal,
    num_splits=0,
    generator=None,
):
    """
    num_splits: whether to parallelize over the seqlen_k dimension (num_splits > 1) or
    not (num_splits = 1). num_splits=0 means it will be set by an internal heuristic.
    Any value above 1 will call the same kernel (i.e. num_splits=2 would call the same kernel
    as num_splits=3), so effectively the choices are 0, 1, and 2.
    This hyperparameter can be tuned for performance, but the default value (heuristic) should work fine.
    """
    _, _, _, softmax_d = flash_attn_cuda.bwd(
        dout,
        q,
        k,
        v,
        out,
        softmax_lse,
        dq,
        dk,
        dv,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        False,
        causal,
        num_splits,
        generator,
    )
    # if dq.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any():
    #     breakpoint()
    return dq, dk, dv, softmax_d


class FlashAttnQKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        qkv,
        cu_seqlens,
        max_seqlen,
        dropout_p,
        softmax_scale,
        causal,
        return_softmax,
    ):
        # Save rng_state because the backward pass will regenerate the dropout mask
        rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        out, softmax_lse, S_dmask = _flash_attn_forward(
            qkv[:, 0],
            qkv[:, 1],
            qkv[:, 2],
            torch.empty_like(qkv[:, 0]),
            cu_seqlens,
            cu_seqlens,
            max_seqlen,
            max_seqlen,
            dropout_p,
            softmax_scale,
            causal=causal,
            return_softmax=return_softmax,
        )
        ctx.save_for_backward(qkv, out, softmax_lse, cu_seqlens, rng_state)
        ctx.dropout_p = dropout_p
        ctx.max_seqlen = max_seqlen
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        qkv, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors
        if rng_state is not None:
            cur_rng_state = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(rng_state)
        dqkv = torch.empty_like(qkv)
        _flash_attn_backward(
            dout,
            qkv[:, 0],
            qkv[:, 1],
            qkv[:, 2],
            out,
            softmax_lse,
            dqkv[:, 0],
            dqkv[:, 1],
            dqkv[:, 2],
            cu_seqlens,
            cu_seqlens,
            ctx.max_seqlen,
            ctx.max_seqlen,
            ctx.dropout_p,
            ctx.softmax_scale,
            ctx.causal,
        )
        if rng_state is not None:
            torch.cuda.set_rng_state(cur_rng_state)
        return dqkv, None, None, None, None, None, None


def flash_attn_unpadded_qkvpacked_func(
    qkv,
    cu_seqlens,
    max_seqlen,
    dropout_p,
    softmax_scale=None,
    causal=False,
    return_attn_probs=False,
):
    return FlashAttnQKVPackedFunc.apply(
        qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs
    )
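For reference, a minimal usage sketch of the exported flash_attn_unpadded_qkvpacked_func is shown below. The import path is hypothetical (the diff does not show where this file sits in the package), and the example assumes a CUDA GPU with the flash-attn extension built, fp16 inputs, and the packed-QKV layout flash-attn expects: qkv of shape (total_tokens, 3, num_heads, head_dim), with cu_seqlens holding int32 cumulative sequence boundaries.

import torch

# Hypothetical import path; adjust to wherever the file above lives in the package.
from flash_attn_interface import flash_attn_unpadded_qkvpacked_func

# Two variable-length sequences (lengths 3 and 5) packed into one 8-token batch.
num_heads, head_dim = 8, 64
seqlens = [3, 5]
total_tokens = sum(seqlens)

qkv = torch.randn(
    total_tokens, 3, num_heads, head_dim, dtype=torch.float16, device="cuda"
)
cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda")  # cumulative boundaries
max_seqlen = max(seqlens)

out = flash_attn_unpadded_qkvpacked_func(
    qkv, cu_seqlens, max_seqlen, dropout_p=0.0, causal=True
)
print(out.shape)  # (total_tokens, num_heads, head_dim)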
@@ -34,6 +34,7 @@
    "bslongformer",
    "gmlp",
    "amlp",
    "flash",
]
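The "flash" entry added here extends the set of attention types a model configuration may request per layer. Purely as an illustration (the helper name below is an assumption and the choices list is truncated to the entries visible in this hunk), such a list is typically used to validate per-layer attention settings:

# Illustrative sketch only; the real choices list contains more entries than shown here.
ATTENTION_TYPE_CHOICES = ["bslongformer", "gmlp", "amlp", "flash"]


def check_attention_types(per_layer_attention):
    """Reject any layer whose requested attention type is not a recognized choice."""
    for layer_idx, attn_type in enumerate(per_layer_attention):
        if attn_type not in ATTENTION_TYPE_CHOICES:
            raise ValueError(f"layer {layer_idx}: unknown attention type {attn_type!r}")


check_attention_types(["flash"] * 4)  # e.g. a 4-layer model using Flash attention everywhere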
@@ -0,0 +1 @@
flash-attn==0.2.2
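To confirm the pinned dependency resolved in a given environment, a stdlib-only check (no GPU required) is enough; the expected output follows the pin above:

from importlib.metadata import version

print(version("flash-attn"))  # expected: 0.2.2, per the requirements pin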