
Commit 4c20539

Merge branch 'HazyResearch:main' into main
2 parents: 08adf1f + 6b4a482

File tree

274 files changed: +23736, −854 lines changed


.gitignore

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg

MANIFEST.in

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+recursive-include csrc *.cu
+recursive-include csrc *.h
+recursive-include csrc *.cuh
+recursive-include csrc *.cpp
+
+recursive-include flash_attn *.cu
+recursive-include flash_attn *.h
+recursive-include flash_attn *.cuh
+recursive-include flash_attn *.cpp

Makefile

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+
+clean_dist:
+	rm -rf dist/*
+
+create_dist: clean_dist
+	python setup.py sdist
+
+upload_package: create_dist
+	twine upload dist/*

README.md

Lines changed: 33 additions & 8 deletions
@@ -8,7 +8,27 @@ Paper: https://arxiv.org/abs/2205.14135
 IEEE Spectrum [article](https://spectrum.ieee.org/mlperf-rankings-2022) about our submission to the MLPerf 2.0 benchmark using FlashAttention.
 ![FlashAttention](assets/flashattn_banner.jpg)
 
-#### Triton implementation of FlashAttention
+## Usage
+
+We've been very happy to see FlashAttention being widely adopted in such a short
+time after its release. This [page](https://github.com/HazyResearch/flash-attention/blob/main/usage.md)
+contains a partial list of places where FlashAttention is being used.
+
+## Full model code and training script
+
+We have released the full GPT model
+[implementation](https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/models/gpt.py).
+We also provide optimized implementations of other layers (e.g., MLP, LayerNorm,
+cross-entropy loss, rotary embedding). Overall this speeds up training by 3-5x
+compared to the baseline implementation from Huggingface, reaching up to 189
+TFLOPs/sec per A100, equivalent to 60.6\% model FLOPs utilization (we don't need
+any activation checkpointing).
+
+We also include a training
+[script](https://github.com/HazyResearch/flash-attention/tree/main/training) to
+train GPT2 on Openwebtext and GPT3 on The Pile.
+
+## Triton implementation of FlashAttention
 
 Phil Tillet (OpenAI) has an experimental implementation of FlashAttention in Triton:
 https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
@@ -18,9 +38,14 @@ and experiment with. The notations in the Triton implementation are also closer
 to what's used in our paper.
 
 
-## Alpha release (0.1).
+## Beta release (0.2).
+
+To install (requiring CUDA 11, NVCC, and a Turing or Ampere GPU):
+```sh
+pip install flash-attn
+```
 
-To compile (requiring CUDA 11, NVCC, and an Turing or Ampere GPU):
+Alternatively you can compile from source:
 ```
 python setup.py install
 ```
@@ -38,15 +63,15 @@ FlashAttention currently supports:
 3. Head dimensions that are multiples of 8, up to 128 (e.g., 8, 16, 24, ..., 128). Head dim > 64 backward requires A100.
 
 Our tentative roadmap:
-1. [Jun 2022] Make package pip-installable.
+1. ~~[Jun 2022] Make package pip-installable~~[Done, thanks to lucidrains].
 2. ~~[Jun 2022] Support SM86 GPUs (e.g., RTX 3080, 3090)~~[Done].
 3. [Jun 2022] Refactor to use Cutlass.
 4. ~~[Jun 2022] Support SM75 GPUs (e.g. T4)~~[Done].
 5. ~~[Jun 2022] Support bf16~~[Done].
 6. ~~[Jul 2022] Implement cross-attention~~[Done].
 7. ~~[Jul 2022] Support head dimension 128~~[Done].
 8. [Jul 2022] Support SM70 GPUs (V100).
-9. [Aug 2022] Fuse rotary embedding.
+9. ~~[Aug 2022] Fuse rotary embedding~~[Done].
 10. [Aug 2022] Support attention bias (e.g. ALiBi, relative positional encoding).
 
 ## Speedup and Memory Savings
@@ -148,10 +173,10 @@ and for his thoughtful answers to our questions about CUDA.
 ## Citation
 If you use this codebase, or otherwise found our work valuable, please cite:
 ```
-@article{dao2022flashattention,
-  title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
+@inproceedings{dao2022flashattention,
+  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
   author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
-  journal={arXiv preprint arXiv:2205.14135},
+  booktitle={Advances in Neural Information Processing Systems},
+  year={2022}
 }
 ```
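As a quick orientation for the new `pip install flash-attn` path added above, here is a minimal usage sketch. The function name and argument layout follow `flash_attn.flash_attn_interface.flash_attn_unpadded_qkvpacked_func` as documented in the repo's usage notes around this release; treat the exact signature as an assumption, not a guarantee.

```python
import torch
from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func

# Assumed interface (check flash_attn/flash_attn_interface.py for the exact
# signature): packed QKV of shape (total_tokens, 3, nheads, headdim) in fp16,
# plus cumulative per-sequence lengths, on a Turing/Ampere GPU.
batch, seqlen, nheads, headdim = 2, 1024, 16, 64
qkv = torch.randn(batch * seqlen, 3, nheads, headdim,
                  dtype=torch.float16, device='cuda')
cu_seqlens = torch.arange(0, (batch + 1) * seqlen, step=seqlen,
                          dtype=torch.int32, device='cuda')

out = flash_attn_unpadded_qkvpacked_func(
    qkv, cu_seqlens, seqlen, dropout_p=0.0, causal=True)
print(out.shape)  # expected: (batch * seqlen, nheads, headdim)
```

Head dimensions must be multiples of 8 and at most 128, matching the head-dimension dispatch added in csrc/flash_attn/fmha_api.cpp below.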

assets/gpt2_training_curve.jpg

168 KB

assets/gpt2_training_efficiency.jpg

367 KB

assets/gpt3_training_curve.jpg

183 KB

assets/gpt3_training_efficiency.jpg

382 KB

csrc/flash_attn/fmha_api.cpp

Lines changed: 31 additions & 18 deletions
@@ -176,6 +176,16 @@ void set_params_dgrad(FMHA_dgrad_params &params,
     params.dsoftmax_sum = dsoftmax_sum_d;
 }
 
+void run_fmha_fwd(Launch_params<FMHA_fprop_params> &launch_params) {
+    if (launch_params.params.d <= 32) {
+        run_fmha_fwd_hdim32(launch_params);
+    } else if (launch_params.params.d <= 64) {
+        run_fmha_fwd_hdim64(launch_params);
+    } else if (launch_params.params.d <= 128) {
+        run_fmha_fwd_hdim128(launch_params);
+    }
+}
+
 std::vector<at::Tensor>
 mha_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k, // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -299,21 +309,29 @@ mha_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q
     // state
     // We use a custom RNG that increases the offset by batch_size * nheads * 32.
     int64_t counter_offset = launch_params.params.b * launch_params.params.h * 32;
-    at::PhiloxCudaState rng_engine_inputs;
 
     if( is_dropout ) {
         // See Note [Acquire lock when using random generators]
         std::lock_guard<std::mutex> lock(gen->mutex_);
         launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
     }
 
-    run_fmha_fp16_sm80(launch_params);
+    run_fmha_fwd(launch_params);
 
     std::vector<at::Tensor> result = {softmax_lse};
     if (return_softmax) {result.push_back(s);}
     return result;
 }
 
+void run_fmha_bwd(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    if (params.d <= 32) {
+        run_fmha_bwd_hdim32(params, stream, configure);
+    } else if (params.d <= 64) {
+        run_fmha_bwd_hdim64(params, stream, configure);
+    } else if (params.d <= 128) {
+        run_fmha_bwd_hdim128(params, stream, configure);
+    }
+}
 
 std::vector<at::Tensor>
 mha_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
@@ -341,7 +359,7 @@ mha_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
     bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
     bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
     TORCH_CHECK(is_sm8x || is_sm75);
-    auto launch = &run_fmha_dgrad_fp16_sm80;
+    auto launch = &run_fmha_bwd;
 
     bool is_dropout = p_dropout > 0.0;
     auto stream = at::cuda::getCurrentCUDAStream().stream();
@@ -454,17 +472,13 @@ mha_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
 
     launch(params, stream, /*configure=*/true);
 
-    at::Tensor dk_accum, dv_accum;
     if (params.num_splits > 1) {
-        // dk_accum = torch::zeros({total_k, num_heads, head_size}, opts.dtype(at::kFloat));
-        // dv_accum = torch::zeros({total_k, num_heads, head_size}, opts.dtype(at::kFloat));
-        // params.dk_accum_ptr = dk_accum.data_ptr();
-        // params.dv_accum_ptr = dv_accum.data_ptr();
-        dk.zero_();
-        dv.zero_();
-    } else {
-        // params.dk_accum_ptr = nullptr;
-        // params.dv_accum_ptr = nullptr;
+        if (!dq_tmp.defined()) {
+            dq_tmp = torch::zeros({total_q, num_heads, head_size}, opts.dtype(at::kFloat));
+            params.o_tmp_ptr = dq_tmp.data_ptr(); // o_tmp stores dq_tmp in the backward pass
+        } else {
+            dq_tmp.zero_();
+        }
     }
 
     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
@@ -481,10 +495,10 @@ mha_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
 
     launch(params, stream, /*configure=*/false);
 
-    // if (params.num_splits > 1) {
-    //     dk.copy_(dk_accum);
-    //     dv.copy_(dv_accum);
-    // }
+    if (params.num_splits > 1) {
+        dq.copy_(dq_tmp);
+    }
+
     return { dq, dk, dv, softmax_d };
 }
 
@@ -597,7 +611,6 @@ mha_fwd_block(const at::Tensor &q, // total_q x num_heads x head_size, t
     // number of times random will be generated per thread, to offset philox counter in thc random
     // state
     int64_t counter_offset = launch_params.elts_per_thread;
-    at::PhiloxCudaState rng_engine_inputs;
 
     if( is_dropout ) {
         // See Note [Acquire lock when using random generators]
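The backward-pass changes above reuse the fp32 `dq_tmp` buffer as an accumulator when the kernel is split (`params.num_splits > 1`) and copy it into `dq` only once after the final launch. Below is a small PyTorch sketch of that accumulate-in-fp32, cast-once pattern; it is illustrative only, and the tensor names and shapes here are made up rather than taken from the extension's API.

```python
import torch

# Illustrative sketch of the accumulate-in-fp32, down-cast-once pattern used
# when the backward pass is split into several chunks (num_splits > 1).
total_q, num_heads, head_size = 1024, 16, 64
num_splits = 4

dq = torch.zeros(total_q, num_heads, head_size, dtype=torch.float16)
dq_tmp = torch.zeros(total_q, num_heads, head_size, dtype=torch.float32)

for _ in range(num_splits):
    # Each split contributes a partial gradient; summing many small terms
    # directly in fp16 would lose precision as the accumulator grows.
    partial = torch.randn(total_q, num_heads, head_size) * 1e-3
    dq_tmp += partial

dq.copy_(dq_tmp)  # single cast back to fp16, mirroring dq.copy_(dq_tmp) in mha_bwd
```

Accumulating the partial gradients in fp32 and casting once at the end avoids the rounding error of repeated fp16 additions, which is why `dq_tmp` is allocated with `opts.dtype(at::kFloat)` in the diff above.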

csrc/flash_attn/src/.DS_Store

-6 KB
Binary file not shown.
