diff --git a/README.md b/README.md index 929345f..221ddaa 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# NanoDO: A minimal ("nano-sized") Transformer decoder-only language model. +# NanoDO: A minimal ("nano-sized") Transformer decoder-only language model implementation in JAX. Inspired by minGPT/nanoGPT and flax/examples we provide a minimal implementation of a Transformer decoder-only language model in Jax. @@ -16,12 +16,6 @@ Currently we use: * [pygrain](https://github.com/google/grain) for data loading * [ConfigDict](https://github.com/google/ml_collections) for hyper-parameters. -Not currently supported or left out for simplicity: - -* sharding -* fast decoding via activation caching -* label-smoothing - Design opinions: @@ -49,7 +43,7 @@ and the optimizer state are sharded among the devices. These shardings are passed to jit, which is responsible for determining how to all-gather weights when necessary. -## Setup (open-source) +## Setup (open-source, Linux/CPU) ``` python3.11 -m venv /tmp/nanodo_test_env @@ -70,4 +64,35 @@ python nanodo/main.py \ --config.batch_size=2 ``` - \ No newline at end of file +Then point your [TensorBoard](https://github.com/tensorflow/tensorboard) to the workdir: + +``` + tensorboard --logdir /tmp/nanodo_workdir +``` + +To use accelerators, ensure the appropriate JAX package is installed by following these [instructions](https://jax.readthedocs.io/en/latest/installation.html). + +## Maintenance + + There are no guarantees that the software will be maintained going forward. The software is designed to be easily forked and modified. + +## Citing NanoDO + +To cite this repository: + +``` +@software{nanodo, + author = {Peter J. Liu and Roman Novak and Jaehoon Lee and Mitchell Wortsman and Lechao Xiao and Katie Everett and Alexander A. 
Alemi and Mark Kurzeja and Pierre Marcenac and Izzeddin Gur and Simon Kornblith and Kelvin Xu and Gamaleldin Elsayed and Ian Fischer and Jeffrey Pennington and Ben Adlam and Jascha Sohl-Dickstein}, + title = {NanoDO: A minimal Transformer decoder-only language model implementation in {JAX}.}, + url = {https://github.com/google-deepmind/nanodo}, + version = {0.1.0}, + year = {2024}, +} +``` + + +Authors all performed work while at Google Brain / DeepMind. We also thank Anselm Levskaya and Gellért Weisz for code suggestions, and Noah Fiedel for project support. + +The first published paper to use (a fork of) the library was: + + [Wortsman et al. "Small-scale proxies for large-scale Transformer training instabilities." *ICLR 2024*.](https://openreview.net/forum?id=d8w0pmvXbZ) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c29a72a..6aaf9f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ dependencies = [ [project.urls] homepage = "https://github.com/google-deepmind/nanodo" repository = "https://github.com/google-deepmind/nanodo" -documentation = "https://nanodo.readthedocs.io/" +# documentation = "https://nanodo.readthedocs.io/" [project.optional-dependencies] test = [